diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda_environment.cu
index 24898ec57ea754e0658d63193b264ea78c406c14..be6101a21af4af5d6f1688ad023a5aaa330263a9 100644
--- a/PG-PuReMD/src/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda_environment.cu
@@ -1,13 +1,15 @@
 #include "cuda_environment.h"
+
 #include "cuda_utils.h"
-extern "C" void Setup_Cuda_Environment (int rank, int nprocs, int gpus_per_node)
+
+extern "C" void Setup_Cuda_Environment(int rank, int nprocs, int gpus_per_node)
 {
     int deviceCount;
     cudaError_t flag;
-    flag = cudaGetDeviceCount (&deviceCount);
+    flag = cudaGetDeviceCount(&deviceCount);
     if ( flag != cudaSuccess )
     {
@@ -18,32 +20,21 @@ extern "C" void Setup_Cuda_Environment (int rank, int nprocs, int gpus_per_node)
     //Calculate the # of GPUs per processor
     //and assign the GPU for each process
     //TODO: handle condition where # CPU procs > # GPUs
-    cudaSetDevice ( (rank % (deviceCount)) );
+    cudaSetDevice( (rank % (deviceCount)) );
 #if defined(__CUDA_DEBUG__)
     fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
 #endif
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    // CHANGE ORIGINAL/////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    //cudaDeviceSetLimit ( cudaLimitStackSize, 8192 );
-    //cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 );
-    //cudaCheckError ();
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
+    //CHANGE ORIGINAL
+    //cudaDeviceSetLimit( cudaLimitStackSize, 8192 );
+    //cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
+    //cudaCheckError();
 }
-extern "C" void Cleanup_Cuda_Environment ()
+extern "C" void Cleanup_Cuda_Environment()
 {
-    cudaDeviceReset ();
-    cudaDeviceSynchronize ();
+    /* synchronize first: cudaDeviceReset() destroys the context, so a
+       synchronize issued after the reset has nothing left to wait on */
+    cudaDeviceSynchronize();
+    cudaDeviceReset();
 }
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index debe20aac39b241c5064a55b80603a3a3886952f..8c353a7ca600b369ad69b2bdabd8c8e883915d21 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -22,30 +22,26 @@
 #if !(defined(__REAX_TYPES_H_) || defined(__CUDA_REAX_TYPES_H_))
 #ifdef __CUDACC__
-
-#ifndef __CUDA_REAX_TYPES_H_
-#define __CUDA_REAX_TYPES_H_
-#define CUDA_HOST __host__
-#define CUDA_DEVICE __device__
-#define CUDA_GLOBAL __global__
-#define CUDA_HOST_DEVICE __host__ __device__
-#endif
-
+  #ifndef __CUDA_REAX_TYPES_H_
+  #define __CUDA_REAX_TYPES_H_
+  #define CUDA_HOST __host__
+  #define CUDA_DEVICE __device__
+  #define CUDA_GLOBAL __global__
+  #define CUDA_HOST_DEVICE __host__ __device__
+  #endif
 #else
-
-#ifndef __REAX_TYPES_H_
-#define __REAX_TYPES_H_
-#define CUDA_HOST
-#define CUDA_DEVICE
-#define CUDA_GLOBAL
-#define CUDA_HOST_DEVICE
-#endif
-
+  #ifndef __REAX_TYPES_H_
+  #define __REAX_TYPES_H_
+  #define CUDA_HOST
+  #define CUDA_DEVICE
+  #define CUDA_GLOBAL
+  #define CUDA_HOST_DEVICE
+  #endif
 #endif
 #if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_))
-#define __CONFIG_H_
-#include "config.h"
+  #define __CONFIG_H_
+  #include "config.h"
 #endif
 #include <ctype.h>
@@ -57,13 +53,13 @@
 #include <sys/time.h>
 #include <time.h>
 #include <zlib.h>
-#define HOST_SCRATCH_SIZE (1024 * 1024 *
20) #ifdef HAVE_CUDA -#include <cuda.h> + #include <cuda.h> #endif #if defined(__IBMC__) -#define inline __inline__ + #define inline __inline__ #endif /*IBMC*/ #define PURE_REAX diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am index d433237070eb800b0142d0f81be5f10b42553035..016114db2dfaf3b9af623069b2f5f3ffc677907a 100644 --- a/PuReMD-GPU/Makefile.am +++ b/PuReMD-GPU/Makefile.am @@ -15,32 +15,51 @@ AM_LDFLAGS = NVCCFLAGS += -use_fast_math NVCCFLAGS += -gencode arch=compute_35,code=sm_35 NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict-aliasing" +#NVCCFLAGS += -Xcompiler -fPIC -dc #NVCCFLAGS += --ptxas-options -v bin_PROGRAMS = bin/puremd-gpu -bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/reset_utils.c src/param.c src/pdb_tools.c \ - src/GMRES.cu src/QEq.cu src/allocate.cu src/bond_orders.cu \ - src/box.cu src/forces.cu src/four_body_interactions.cu \ - src/grid.cu src/init_md.cu src/integrate.cu src/list.cu \ - src/lookup.cu src/neighbors.cu \ - src/restart.cu src/single_body_interactions.cu \ - src/system_props.cu src/three_body_interactions.cu \ - src/traj.cu src/two_body_interactions.cu src/vector.cu \ - src/testmd.cu \ - src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/reduction.cu \ - src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu - +bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \ + src/restart.c src/param.c src/pdb_tools.c src/box.c \ + src/lin_alg.c src/QEq.c src/allocate.c src/bond_orders.c \ + src/forces.c src/four_body_interactions.c \ + src/grid.c src/init_md.c src/integrate.c src/list.c \ + src/lookup.c src/neighbors.c \ + src/reset_utils.c src/single_body_interactions.c \ + src/system_props.c src/three_body_interactions.c \ + src/traj.c src/two_body_interactions.c src/vector.c \ + src/testmd.c \ + src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/cuda_reduction.cu \ + src/cuda_center_mass.cu src/cuda_box.cu src/validation.cu \ + src/cuda_allocate.cu src/cuda_bond_orders.cu \ + src/cuda_lin_alg.cu src/cuda_QEq.cu \ + src/cuda_forces.cu src/cuda_four_body_interactions.cu \ + src/cuda_grid.cu src/cuda_init_md.cu src/cuda_integrate.cu src/cuda_list.cu \ + src/cuda_lookup.cu src/cuda_neighbors.cu \ + src/cuda_reset_utils.cu src/cuda_single_body_interactions.cu \ + src/cuda_system_props.cu src/cuda_three_body_interactions.cu \ + src/cuda_two_body_interactions.cu src/cuda_environment.cu \ + src/cuda_post_evolve.cu include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \ - src/reset_utils.h src/param.h src/pdb_tools.h \ - src/GMRES.h src/QEq.h src/allocate.h src/bond_orders.h \ - src/box.h src/forces.h src/four_body_interactions.h \ + src/restart.h src/param.h src/pdb_tools.h src/box.h \ + src/lin_alg.h src/QEq.h src/allocate.h src/bond_orders.h \ + src/forces.h src/four_body_interactions.h \ src/grid.h src/init_md.h src/integrate.h src/list.h \ src/lookup.h src/neighbors.h \ - src/restart.h src/single_body_interactions.h \ + src/reset_utils.h src/single_body_interactions.h \ src/system_props.h src/three_body_interactions.h \ src/traj.h src/two_body_interactions.h src/vector.h \ - src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/reduction.h \ - src/center_mass.h src/helpers.h src/validation.h src/matvec.h + src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/cuda_reduction.h \ + src/cuda_center_mass.h src/cuda_box.h src/validation.h \ + src/cuda_allocate.h src/cuda_bond_orders.h \ + src/cuda_lin_alg.h src/cuda_QEq.h \ + src/cuda_forces.h src/cuda_four_body_interactions.h 
\
+ src/cuda_grid.h src/cuda_init_md.h src/cuda_integrate.h src/cuda_list.h \
+ src/cuda_lookup.h src/cuda_neighbors.h \
+ src/cuda_reset_utils.h src/cuda_single_body_interactions.h \
+ src/cuda_system_props.h src/cuda_three_body_interactions.h \
+ src/cuda_two_body_interactions.h src/cuda_environment.h \
+ src/cuda_post_evolve.h

 # dummy source to cause C linking
 nodist_EXTRA_bin_puremd_gpu_SOURCES = src/dummy.c
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index c947ed021a0544653895f01c59dce33e4a07a902..30e7e0bff6bc3115193c75d3b2d0dcbe29a5cf4b 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -42,19 +42,6 @@ AC_SEARCH_LIBS([gzeof], [z])
 AC_SEARCH_LIBS([gzgets], [z])
 AC_SEARCH_LIBS([gzseek], [z])
 AC_SEARCH_LIBS([gzclose, [z]])
-AC_SEARCH_LIBS([cublasCheckError], [cublas])
-AC_SEARCH_LIBS([cublasDnrm2], [cublas])
-AC_SEARCH_LIBS([cublasDaxpy], [cublas])
-AC_SEARCH_LIBS([cublasDscal], [cublas])
-AC_SEARCH_LIBS([cublasDdot], [cublas])
-AC_SEARCH_LIBS([cudaThreadSynchronize], [cuda])
-AC_SEARCH_LIBS([cudaCheckError], [cuda])
-# FIXME: Replace `main' with a function in `-lcudart':
-#AC_CHECK_LIB([cudart], [main])
-AC_SEARCH_LIBS([cusparseCheckError], [cusparse])
-AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])

 # Checks for typedefs, structures, and compiler characteristics.
 AC_CHECK_HEADER_STDBOOL
@@ -78,10 +65,26 @@ then
 fi
 AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
-if test "BUILD_PROF" = "true"
-then
-    NVCCFLAGS+=" --compiler-options ${gprof_flags}"
-fi
+AC_SEARCH_LIBS([cublasDnrm2], [cublas])
+AC_SEARCH_LIBS([cublasDaxpy], [cublas])
+AC_SEARCH_LIBS([cublasDscal], [cublas])
+AC_SEARCH_LIBS([cublasDdot], [cublas])
+AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
+AC_SEARCH_LIBS([cudaGetLastError], [cudart])
+AC_CHECK_LIB([cudart], [cudaMalloc])
+AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
+
+AC_SEARCH_LIBS([cublasDnrm2], [cublas],
+    [CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
+AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
+    [AC_MSG_ERROR([Unable to find CUBLAS library.])])
+
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
+    [CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
+AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
+    [AC_MSG_ERROR([Unable to find CUSPARSE library.])])

 AC_CHECK_TYPES([cublasHandle_t], [],
     [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
@@ -89,10 +92,12 @@ AC_CHECK_TYPES([cusparseHandle_t], [],
     [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
 AC_CHECK_TYPES([cusparseMatDescr_t], [],
     [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#AC_CHECK_TYPES([CUSPARSE_MATRIX_TYPE_GENERAL], [],
-#    [AC_MSG_FAILURE([CUSPARSE_MATRIX_TYPE_GENERAL type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#AC_CHECK_TYPES([CUSPARSE_INDEX_BASE_ZERO], [],
-#    [AC_MSG_FAILURE([CUSPARSE_INDEX_BASE_ZERO type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+
+if test "$BUILD_PROF" = "true"
+then
+    NVCCFLAGS+=" --compiler-options ${gprof_flags}"
+fi
+
 AC_CONFIG_FILES([Makefile])
diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu
deleted file mode 100644
index
d00100e9ced86d2b1b7a8d1f37b67648576cdca1..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/GMRES.cu +++ /dev/null @@ -1,1138 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. - ----------------------------------------------------------------------*/ - -#include "GMRES.h" -#include "list.h" -#include "vector.h" -#include "index_utils.h" - -#include "cuda_copy.h" -#include "cuda_utils.h" -#include "reduction.h" -#include "matvec.h" -#include "system_props.h" - -#include "cublas_v2.h" -#include "cusparse_v2.h" - -void Sparse_MatVec( sparse_matrix *A, real *x, real *b ) -{ - int i, j, k, n, si, ei; - real H; - - n = A->n; - for( i = 0; i < n; ++i ) - b[i] = 0; - - for( i = 0; i < n; ++i ) { - si = A->start[i]; - ei = A->start[i+1]-1; - - for( k = si; k < ei; ++k ) { - j = A->entries[k].j; - H = A->entries[k].val; - b[j] += H * x[i]; - b[i] += H * x[j]; - } - - // the diagonal entry is the last one in - b[i] += A->entries[k].val * x[i]; - } -} - - -void Forward_Subs( sparse_matrix *L, real *b, real *y ) -{ - int i, pj, j, si, ei; - real val; - - for( i = 0; i < L->n; ++i ) { - y[i] = b[i]; - si = L->start[i]; - ei = L->start[i+1]; - for( pj = si; pj < ei-1; ++pj ){ - j = L->entries[pj].j; - val = L->entries[pj].val; - y[i] -= val * y[j]; - } - y[i] /= L->entries[pj].val; - } -} - - -void Backward_Subs( sparse_matrix *U, real *y, real *x ) -{ - int i, pj, j, si, ei; - real val; - - for( i = U->n-1; i >= 0; --i ) { - x[i] = y[i]; - si = U->start[i]; - ei = U->start[i+1]; - for( pj = si+1; pj < ei; ++pj ){ - j = U->entries[pj].j; - val = U->entries[pj].val; - x[i] -= val * x[j]; - } - x[i] /= U->entries[si].val; - } -} - - -int GMRES( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system* system) -{ - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - - N = H->n; - bnorm = Norm( b, N ); - - /* apply the diagonal pre-conditioner to rhs */ - for( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - Sparse_MatVec( H, x, workspace->b_prm ); - - for( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ - - - Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); - Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system)], N ); - - /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { - /* matvec */ - Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], 
&workspace->v[index_wkspace_sys(j+1,0,system)] ); - - for( k = 0; k < N; ++k ) - workspace->v[ index_wkspace_sys (j+1,k,system)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { - workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)], - -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N ); - } - - - workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], - 1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); - - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for( i = 0; i <= j; i++ ) { - if( i == j ) { - cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); - workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; - workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + - workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ]; - tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + - workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ]; - - workspace->h[ index_wkspace_res (i,j) ] = tmp1; - workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; - } - - /* apply Givens rotations to the rhs as well */ - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j+1] = tmp2; - - // fprintf( stderr, "h: " ); - // for( i = 0; i <= j+1; ++i ) - // fprintf( stderr, "%.6f ", workspace->h[i][j] ); - // fprintf( stderr, "\n" ); - //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); - } - - - /* solve Hy = g. 
- H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { - temp = workspace->g[i]; - for( k = j-1; k > i; k-- ) - temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; - - workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; - } - - - /* update x = x_0 + Vy */ - for( i = 0; i < j; i++ ) - Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N ); - - /* stopping condition */ - if( fabs(workspace->g[j]) / bnorm <= tol ) - break; - } - - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ - - // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - // data->timing.matvec += itr * RESTART + j; - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } - - return itr * (RESTART+1) + j + 1; -} - - -///////////////////////////////////////////////////////////////// -//Cuda Functions for GMRES implementation -///////////////////////////////////////////////////////////////// - -GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= entries) return; - - b_proc [i] = b[i] * Hdia_inv[i]; -} - -GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output) -{ - real tmp1, tmp2, cc; - - for( int i = 0; i <= j; i++ ) { - if( i == j ) { - cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) ); - hc[j] = h[ index_wkspace_res (j,j) ] / cc; - hs[j] = h[ index_wkspace_res (j+1,j) ] / cc; - } - - tmp1 = hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ]; - tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ]; - - h[ index_wkspace_res (i,j) ] = tmp1; - h[ index_wkspace_res (i+1,j) ] = tmp2; - } - - /* apply Givens rotations to the rhs as well */ - tmp1 = hc[j] * g_j; - tmp2 = -hs[j] * g_j; - - output[0] = tmp1; - output[1] = tmp2; -} - -GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y) -{ - real temp; - for( int i = j-1; i >= 0; i-- ) { - temp = g[i]; - for( int k = j-1; k > i; k-- ) - temp -= h[ index_wkspace_res (i,k) ] * y[k]; - - y[i] = temp / h[ index_wkspace_res (i,i) ]; - } -} - - -int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x ) -{ - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v_add_tmp; - sparse_matrix *H = &workspace->H; - - real t_start, t_elapsed; - - real *spad = (real *)scratch; - real *g = (real *) calloc ((RESTART+1), REAL_SIZE); - - N = H->n; - - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Norm of the array is %e \n", bnorm ); -#endif - - /* apply the diagonal 
pre-conditioner to rhs */ - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (workspace->b_prc, b, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - //Sparse_MatVec( H, x, workspace->b_prm ); - Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> - (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> - (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - cudaThreadSynchronize (); - cudaCheckError (); - - //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); - { - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); - } - - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES inner-loop */ -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); -#endif - for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { - /* matvec */ - //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - Cuda_Matvec_csr - <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> - ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { - Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - } - - - //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
(&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - GMRES_Givens_Rotation <<<1, 1>>> - (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - } - - copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - /* solve Hy = g. - H is now upper-triangular, do back-substitution */ - copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - GMRES_BackSubstitution <<<1, 1>>> - (j, spad, workspace->h, workspace->y); - cudaThreadSynchronize (); - cudaCheckError (); - - /* update x = x_0 + Vy */ - for( i = 0; i < j; i++ ) - { - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> - ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - } - - /* stopping condition */ - if( fabs(g[j]) / bnorm <= tol ) - break; - } - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return itr * (RESTART+1) + j + 1; - } - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); -#endif - return itr * (RESTART+1) + j + 1; -} - - -int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real tol, real *x ) -{ - - real CSR_ALPHA = 1, CSR_BETA = 0; - - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v_add_tmp; - sparse_matrix *H = &workspace->H; - - real t_start, t_elapsed; - - real *spad = (real *)scratch; - real *g = (real *) calloc ((RESTART+1), REAL_SIZE); - - N = H->n; - - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - - /* - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - */ - - cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm )); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Norm of the array is %e \n", bnorm ); -#endif - - /* apply the diagonal pre-conditioner to rhs */ - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (workspace->b_prc, b, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - //Sparse_MatVec( H, x, workspace->b_prm ); - Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); - cudaThreadSynchronize (); - cudaCheckError (); - - 
GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> - (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - /* - Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> - (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); - cudaThreadSynchronize (); - cudaCheckError (); - */ - cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V); - - double D_ONE = 1.; - double D_MINUS_ONE = -1.; - cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); - cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); - - //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N ); - { - /* - cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); - */ - - cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g )); - copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - } - - /* - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - double D_SCALE = 1.0 / g[0]; - cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); - - - /* GMRES inner-loop */ -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); -#endif - for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { - /* matvec */ - Cuda_Matvec_csr - <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> - ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); - cudaThreadSynchronize (); - cudaCheckError (); - - GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> - (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); - cudaThreadSynchronize (); - cudaCheckError (); - - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i <= j; i++ ) { - - /* - Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - */ - - cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, - &workspace->v[index_wkspace_sys(j+1,0,N)], 1, - &v_add_tmp)); - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - /* - Cuda_Vector_Add <<< BLOCKS, 
BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - double NEG_V_ADD_TMP = -v_add_tmp; - cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, - &workspace->v[index_wkspace_sys(j+1,0,N)], 1 )); - } - - - //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - /* - cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); - - Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - */ - cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp )); - copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - - /* - Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> - ( &workspace->v[index_wkspace_sys(j+1,0,N)], - 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - double REC_V_ADD_TMP = 1. / v_add_tmp; - cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP, &workspace->v[index_wkspace_sys(j+1,0,N)], 1)); - - - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - GMRES_Givens_Rotation <<<1, 1>>> - (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); - cudaThreadSynchronize (); - cudaCheckError (); - copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - } - - copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); - - /* solve Hy = g. 
- H is now upper-triangular, do back-substitution */ - copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); - GMRES_BackSubstitution <<<1, 1>>> - (j, spad, workspace->h, workspace->y); - cudaThreadSynchronize (); - cudaCheckError (); - - /* update x = x_0 + Vy */ - for( i = 0; i < j; i++ ) - { - /* - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> - ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, - x, 1)); - } - - /* stopping condition */ - if( fabs(g[j]) / bnorm <= tol ) - break; - } - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - return itr * (RESTART+1) + j + 1; - } - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); -#endif - return itr * (RESTART+1) + j + 1; -} - -int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system *system) -{ - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - real v[10000], z[RESTART+2][10000], w[RESTART+2]; - real u[RESTART+2][10000]; - - N = H->n; - bnorm = Norm( b, N ); - - /* apply the diagonal pre-conditioner to rhs */ - for( i = 0; i < N; ++i ) - workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; - - // memset( x, 0, sizeof(real) * N ); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* compute z = r0 */ - Sparse_MatVec( H, x, workspace->b_prm ); - for( i = 0; i < N; ++i ) - workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ - Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); - - Vector_MakeZero( w, RESTART+1 ); - w[0] = Norm( z[0], N ); - - Vector_Copy( u[0], z[0], N ); - u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; - Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); - - w[0] *= ( u[0][0] < 0.0 ? 1 :-1 ); - // fprintf( stderr, "\n\n%12.6f\n", w[0] ); - - /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) { - /* compute v_j */ - Vector_Scale( z[j], -2 * u[j][j], u[j], N ); - z[j][j] += 1.; /* due to e_j */ - - for( i = j-1; i >= 0; --i ) - Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i ); - - - /* matvec */ - Sparse_MatVec( H, z[j], v ); - - for( k = 0; k < N; ++k ) - v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */ - - for( i = 0; i <= j; ++i ) - Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i ); - - - if( !Vector_isZero( v + (j+1), N - (j+1) ) ) { - /* compute the HouseHolder unit vector u_j+1 */ - for( i = 0; i <= j; ++i ) - u[j+1][i] = 0; - - Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) ); - - u[j+1][j+1] += ( v[j+1]<0.0 ? 
-1:1 ) * Norm( v+(j+1), N-(j+1) ); - - Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N ); - - /* overwrite v with P_m+1 * v */ - v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1]; - Vector_MakeZero( v + (j+2), N - (j+2) ); - // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N ); - } - - - /* prev Givens rots on the upper-Hessenberg matrix to make it U */ - for( i = 0; i < j; i++ ) { - tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1]; - tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1]; - - v[i] = tmp1; - v[i+1] = tmp2; - } - - /* apply the new Givens rotation to H and right-hand side */ - if( fabs(v[j+1]) >= ALMOST_ZERO ) { - cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) ); - workspace->hc[j] = v[j] / cc; - workspace->hs[j] = v[j+1] / cc; - - tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1]; - tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1]; - - v[j] = tmp1; - v[j+1] = tmp2; - - /* Givens rotations to rhs */ - tmp1 = workspace->hc[j] * w[j]; - tmp2 = -workspace->hs[j] * w[j]; - w[j] = tmp1; - w[j+1] = tmp2; - } - - /* extend R */ - for( i = 0; i <= j; ++i ) - workspace->h[ index_wkspace_res (i,j) ] = v[i]; - - - // fprintf( stderr, "h:" ); - // for( i = 0; i <= j+1 ; ++i ) - // fprintf( stderr, "%.6f ", h[i][j] ); - // fprintf( stderr, "\n" ); - // fprintf( stderr, "%12.6f\n", w[j+1] ); - } - - - /* solve Hy = w. - H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { - temp = w[i]; - for( k = j-1; k > i; k-- ) - temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; - - workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; - } - - // fprintf( stderr, "y: " ); - // for( i = 0; i < RESTART+1; ++i ) - // fprintf( stderr, "%8.3f ", workspace->y[i] ); - - - /* update x = x_0 + Vy */ - // memset( z, 0, sizeof(real) * N ); - // for( i = j-1; i >= 0; i-- ) - // { - // Vector_Copy( v, z, N ); - // v[i] += workspace->y[i]; - // - // Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N ); - // } - // - // fprintf( stderr, "\nz: " ); - // for( k = 0; k < N; ++k ) - // fprintf( stderr, "%6.2f ", z[k] ); - - // fprintf( stderr, "\nx_bef: " ); - // for( i = 0; i < N; ++i ) - // fprintf( stderr, "%6.2f ", x[i] ); - - // Vector_Add( x, 1, z, N ); - for( i = j-1; i >= 0; i-- ) - Vector_Add( x, workspace->y[i], z[i], N ); - - // fprintf( stderr, "\nx_aft: " ); - // for( i = 0; i < N; ++i ) - // fprintf( stderr, "%6.2f ", x[i] ); - - /* stopping condition */ - if( fabs( w[j] ) / bnorm <= tol ) - break; - } - - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] ); - - //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } - - return itr * (RESTART+1) + j + 1; -} - - -int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system ) -{ - int i, j, k, itr, N; - real cc, tmp1, tmp2, temp, bnorm; - - N = H->n; - bnorm = Norm( b, N ); - - /* GMRES outer-loop */ - for( itr = 0; itr < MAX_ITR; ++itr ) { - /* calculate r0 */ - Sparse_MatVec( H, x, 
workspace->b_prm ); - Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N ); - Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] ); - Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] ); - workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system)], N ); - Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system)], N ); - //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); - - /* GMRES inner-loop */ - for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { - /* matvec */ - Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] ); - Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] ); - - /* apply modified Gram-Schmidt to orthogonalize the new residual */ - for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0; - - //for( i = 0; i <= j; i++ ) { - for( i = MAX(j-1,0); i <= j; i++ ) { - workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N ); - } - - workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system)], N ); - Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], - 1. 
/ workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system)], N ); - // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); - - /* Givens rotations on the upper-Hessenberg matrix to make it U */ - for( i = MAX(j-1,0); i <= j; i++ ) { - if( i == j ) { - cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); - workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; - workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; - } - - tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + - workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ]; - tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + - workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ]; - - workspace->h[ index_wkspace_res (i,j) ] = tmp1; - workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; - } - - /* apply Givens rotations to the rhs as well */ - tmp1 = workspace->hc[j] * workspace->g[j]; - tmp2 = -workspace->hs[j] * workspace->g[j]; - workspace->g[j] = tmp1; - workspace->g[j+1] = tmp2; - - //fprintf( stderr, "h: " ); - //for( i = 0; i <= j+1; ++i ) - //fprintf( stderr, "%.6f ", workspace->h[i][j] ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); - } - - - /* solve Hy = g: H is now upper-triangular, do back-substitution */ - for( i = j-1; i >= 0; i-- ) { - temp = workspace->g[i]; - for( k = j-1; k > i; k-- ) - temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; - - workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)]; - } - - /* update x = x_0 + Vy */ - Vector_MakeZero( workspace->p, N ); - for( i = 0; i < j; i++ ) - Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N ); - //Backward_Subs( U, workspace->p, workspace->p ); - //Forward_Subs( L, workspace->p, workspace->p ); - Vector_Add( x, 1., workspace->p, N ); - - /* stopping condition */ - if( fabs(workspace->g[j]) / bnorm <= tol ) - break; - } - - // Sparse_MatVec( H, x, workspace->b_prm ); - // for( i = 0; i < N; ++i ) - // workspace->b_prm[i] *= workspace->Hdia_inv[i]; - // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); - // for( i = 0; i < N; ++i ) - // fprintf( fout, "%10.5f%15.12f%15.12f\n", - // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ - - // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", - // itr, j, fabs( workspace->g[j] ) / bnorm ); - // data->timing.matvec += itr * RESTART + j; - - if( itr >= MAX_ITR ) { - fprintf( stderr, "GMRES convergence failed\n" ); - // return -1; - return itr * (RESTART+1) + j + 1; - } - - return itr * (RESTART+1) + j + 1; - } - - - - int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, - sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system ) - { - int i, N; - real tmp, alpha, beta, b_norm, r_norm; - real sig0, sig_old, sig_new; - - N = A->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %.15e\n", b_norm ); - - Sparse_MatVec( A, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - r_norm = Norm(workspace->r, N); - //Print_Soln( workspace, x, q, b, N ); - //fprintf( stderr, "res: %.15e\n", r_norm ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->p ); - sig_new = Dot( workspace->r, workspace->p, N ); - sig0 = sig_new; - - for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) { - //for( i = 0; i < 200 && sig_new > 
SQR(tol) * sig0; ++i ) { - Sparse_MatVec( A, workspace->p, workspace->q ); - tmp = Dot( workspace->q, workspace->p, N ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->p, N ); - //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", - // i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp ); - - Vector_Add( workspace->r, -alpha, workspace->q, N ); - r_norm = Norm(workspace->r, N); - //fprintf( stderr, "res: %.15e\n", r_norm ); - - Forward_Subs( L, workspace->r, workspace->d ); - Backward_Subs( U, workspace->d, workspace->d ); - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->d, N ); - beta = sig_new / sig_old; - Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N ); - } - - //fprintf( fout, "CG took %d iterations\n", i ); - if( i >= 200 ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } - - return i; - } - - - int CG( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout, reax_system *system) - { - int i, j, N; - real tmp, alpha, beta, b_norm; - real sig_old, sig_new, sig0; - - N = H->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); - - Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_new = Dot( workspace->r, workspace->d, N ); - sig0 = sig_new; - //Print_Soln( workspace, x, q, b, N ); - //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", - // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) ); - //fprintf( stderr, "sig_new: %f\n", sig_new ); - - for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { - //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) { - Sparse_MatVec( H, workspace->d, workspace->q ); - tmp = Dot( workspace->d, workspace->q, N ); - //fprintf( stderr, "tmp: %f\n", tmp ); - alpha = sig_new / tmp; - Vector_Add( x, alpha, workspace->d, N ); - //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", - // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); - - Vector_Add( workspace->r, -alpha, workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig_old = sig_new; - sig_new = Dot( workspace->r, workspace->p, N ); - beta = sig_new / sig_old; - Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N ); - //fprintf( stderr, "sig_new: %f\n", sig_new ); - } - - fprintf( stderr, "CG took %d iterations\n", i ); - - if( i >= 300 ) { - fprintf( stderr, "CG convergence failed!\n" ); - return i; - } - - return i; - } - - - - /* Steepest Descent */ - int SDM( static_storage *workspace, sparse_matrix *H, - real *b, real tol, real *x, FILE *fout ) - { - int i, j, N; - real tmp, alpha, beta, b_norm; - real sig0, sig; - - N = H->n; - b_norm = Norm( b, N ); - //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); - - Sparse_MatVec( H, x, workspace->q ); - Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); - for( j = 0; j < N; ++j ) - workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; - - sig = Dot( workspace->r, workspace->d, N ); - sig0 = sig; - - for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) { - Sparse_MatVec( H, workspace->d, workspace->q ); - - sig = Dot( workspace->r, workspace->d, N ); - tmp = Dot( workspace->d, workspace->q, N ); - alpha = sig / tmp; - - Vector_Add( x, alpha, workspace->d, N ); - Vector_Add( workspace->r, -alpha, workspace->q, N ); - for( j = 0; j < N; ++j ) - 
workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-        //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-        //   Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-    }
-
-    fprintf( stderr, "SDM took %d iterations\n", i );
-
-    if( i >= 300 ) {
-        fprintf( stderr, "SDM convergence failed!\n" );
-        return i;
-    }
-
-    return i;
-}
-
diff --git a/PuReMD-GPU/src/QEq.c b/PuReMD-GPU/src/QEq.c
new file mode 100644
index 0000000000000000000000000000000000000000..8cc638ea90dcc25f86d33f275b162c8e531d82bb
--- /dev/null
+++ b/PuReMD-GPU/src/QEq.c
@@ -0,0 +1,396 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "QEq.h"
+
+#include "allocate.h"
+#include "lin_alg.h"
+#include "list.h"
+#include "print_utils.h"
+#include "index_utils.h"
+#include "system_props.h"
+
+#include "sort.h"
+
+
+int compare_matrix_entry(const void *v1, const void *v2)
+{
+    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
+}
+
+
+void Sort_Matrix_Rows( sparse_matrix *A )
+{
+    int i, si, ei;
+
+    for( i = 0; i < A->n; ++i ) {
+        si = A->start[i];
+        ei = A->start[i+1];
+        qsort( &(A->entries[si]), ei - si,
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
+    }
+}
+
+
+void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
+{
+    int i, j, k;
+    real val;
+
+    /* init droptol to 0 */
+    for( i = 0; i < A->n; ++i )
+        droptol[i] = 0;
+
+    /* calculate square of the norm of each row */
+    for( i = 0; i < A->n; ++i ) {
+        for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
+            j = A->entries[k].j;
+            val = A->entries[k].val;
+
+            droptol[i] += val*val;
+            droptol[j] += val*val;
+        }
+
+        val = A->entries[k].val; // diagonal entry
+        droptol[i] += val*val;
+    }
+
+    /* calculate local droptol for each row */
+    //fprintf( stderr, "droptol: " );
+    for( i = 0; i < A->n; ++i ) {
+        //fprintf( stderr, "%f-->", droptol[i] );
+        droptol[i] = SQRT( droptol[i] ) * dtol;
+        //fprintf( stderr, "%f ", droptol[i] );
+    }
+    //fprintf( stderr, "\n" );
+}
+
+
+int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
+{
+    int i, j, pj;
+    int fillin;
+    real val;
+
+    fillin = 0;
+
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i )
+        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
+            j = A->entries[pj].j;
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
+
+            if( fabs(val) > droptol[i] )
+                ++fillin;
+        }
+
+    return fillin + A->n;
+}
+
+
+void ICHOLT( sparse_matrix *A, real *droptol,
+        sparse_matrix *L, sparse_matrix *U )
+{
+    sparse_matrix_entry tmp[1000];
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val;
+    int *Utop;
+
+    Utop = (int*) malloc((A->n+1) * sizeof(int));
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    for( i = 0; i <= A->n; ++i )
L->start[i] = U->start[i] = 0; + + for( i = 0; i < A->n; ++i ) + Utop[i] = 0; + + //fprintf( stderr, "n: %d\n", A->n ); + for( i = 0; i < A->n; ++i ){ + L->start[i] = Ltop; + tmptop = 0; + + for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ + j = A->entries[pj].j; + val = A->entries[pj].val; + //fprintf( stderr, "i: %d, j: %d", i, j ); + + if( fabs(val) > droptol[i] ){ + k1 = 0; + k2 = L->start[j]; + while( k1 < tmptop && k2 < L->start[j+1] ){ + if( tmp[k1].j < L->entries[k2].j ) + ++k1; + else if( tmp[k1].j > L->entries[k2].j ) + ++k2; + else + val -= (tmp[k1++].val * L->entries[k2++].val); + } + + // L matrix is lower triangular, + // so right before the start of next row comes jth diagonal + val /= L->entries[L->start[j+1]-1].val; + + tmp[tmptop].j = j; + tmp[tmptop].val = val; + ++tmptop; + } + //fprintf( stderr, " -- done\n" ); + } + + // compute the ith diagonal in L + // sanity check + if( A->entries[pj].j != i ) { + fprintf( stderr, "i=%d, badly built A matrix!\n", i ); + exit(999); + } + + val = A->entries[pj].val; + for( k1 = 0; k1 < tmptop; ++k1 ) + val -= (tmp[k1].val * tmp[k1].val); + + tmp[tmptop].j = i; + tmp[tmptop].val = SQRT(val); + + // apply the dropping rule once again + //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); + //for( k1 = 0; k1<= tmptop; ++k1 ) + // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); + for( k1 = 0; k1 < tmptop; ++k1 ) + if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + U->start[tmp[k1].j+1]++; + ++Ltop; + //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); + } + // keep the diagonal in any case + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + ++Ltop; + //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); + } + + L->start[i] = Ltop; + //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + + for( i = 1; i <= U->n; ++i ) + Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; + + for( i = 0; i < L->n; ++i ) + for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ + j = L->entries[pj].j; + U->entries[Utop[j]].j = i; + U->entries[Utop[j]].val = L->entries[pj].val; + Utop[j]++; + } + + //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); +} + + +void Init_MatVec( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list *far_nbrs ) +{ + int i, fillin; + real s_tmp, t_tmp; + //char fname[100]; + + if(control->refactor > 0 && + ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)) + { + //Print_Linear_System( system, control, workspace, data->step ); + Sort_Matrix_Rows( &workspace->H ); + + //fprintf( stderr, "H matrix sorted\n" ); + + Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); + //fprintf( stderr, "drop tolerances calculated\n" ); + + if( workspace->L.entries == NULL ) + { + fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol ); + +#ifdef __DEBUG_CUDA__ + fprintf( stderr, "fillin = %d\n", fillin ); +#endif + + if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || + Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ) + { + fprintf( stderr, "not enough memory for LU matrices. 
terminating.\n" ); + exit(INSUFFICIENT_SPACE); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "fillin = %d\n", fillin ); + fprintf( stderr, "allocated memory: L = U = %ldMB\n", + fillin * sizeof(sparse_matrix_entry) / (1024*1024) ); +#endif + } + + ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "icholt-" ); + //sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); + //Print_Sparse_Matrix2( workspace->L, fname ); + //Print_Sparse_Matrix( U ); +#endif + } + + /* extrapolation for s & t */ + for( i = 0; i < system->N; ++i ) { + // no extrapolation + //s_tmp = workspace->s[0][i]; + //t_tmp = workspace->t[0][i]; + + // linear + //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; + //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; + + // quadratic + //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); + t_tmp = workspace->t[index_wkspace_sys(2,i,system->N)] + 3*(workspace->t[index_wkspace_sys(0,i,system->N)]-workspace->t[index_wkspace_sys(1,i,system->N)]); + + // cubic + s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system->N)] + workspace->s[index_wkspace_sys(2,i,system->N)]) - + (6 * workspace->s[index_wkspace_sys(1,i,system->N)] + workspace->s[index_wkspace_sys(3,i,system->N)] ); + //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - + // (6 * workspace->t[1][i] + workspace->t[3][i] ); + + // 4th order + //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + + // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; + //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + + // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; + + workspace->s[index_wkspace_sys(4,i,system->N)] = workspace->s[index_wkspace_sys(3,i,system->N)]; + workspace->s[index_wkspace_sys(3,i,system->N)] = workspace->s[index_wkspace_sys(2,i,system->N)]; + workspace->s[index_wkspace_sys(2,i,system->N)] = workspace->s[index_wkspace_sys(1,i,system->N)]; + workspace->s[index_wkspace_sys(1,i,system->N)] = workspace->s[index_wkspace_sys(0,i,system->N)]; + workspace->s[index_wkspace_sys(0,i,system->N)] = s_tmp; + + workspace->t[index_wkspace_sys(4,i,system->N)] = workspace->t[index_wkspace_sys(3,i,system->N)]; + workspace->t[index_wkspace_sys(3,i,system->N)] = workspace->t[index_wkspace_sys(2,i,system->N)]; + workspace->t[index_wkspace_sys(2,i,system->N)] = workspace->t[index_wkspace_sys(1,i,system->N)]; + workspace->t[index_wkspace_sys(1,i,system->N)] = workspace->t[index_wkspace_sys(0,i,system->N)]; + workspace->t[index_wkspace_sys(0,i,system->N)] = t_tmp; + } +} + + +void Calculate_Charges( reax_system *system, static_storage *workspace ) +{ + int i; + real u, s_sum, t_sum; + + s_sum = t_sum = 0.; + for( i = 0; i < system->N; ++i ) { + s_sum += workspace->s[index_wkspace_sys(0,i,system->N)]; + t_sum += workspace->t[index_wkspace_sys(0,i,system->N)]; + } + + u = s_sum / t_sum; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); +#endif + + for( i = 0; i < system->N; ++i ) + { + system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system->N)] - u * workspace->t[index_wkspace_sys(0,i,system->N)]; + } +} + + +void QEq( reax_system *system, control_params *control, simulation_data *data, + static_storage *workspace, list *far_nbrs, + output_controls *out_control ) +{ + int matvecs; + + //real t_start, t_elapsed; + + //t_start = Get_Time (); + Init_MatVec( system, control, data, workspace, far_nbrs ); + 
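+    /* Editorial sketch, not text from the original sources: QEq obtains
+     * the charges by solving two linear systems that share the
+     * charge-equilibration matrix H, one for the fictitious charge vector
+     * s (right-hand side b_s) and one for t (right-hand side b_t). Each
+     * GMRES() call below returns the number of matrix-vector products it
+     * performed, and the counts are accumulated into data->timing.matvecs.
+     * Calculate_Charges() then combines the two solutions as
+     *
+     *     u = (sum_i s_i) / (sum_i t_i),    q_i = s_i - u * t_i,
+     *
+     * so that sum_i q_i = 0 and the system stays charge neutral. */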
//t_elapsed = Get_Timing_Info ( t_start ); + + //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed ); + + //if( data->step % 10 == 0 ) + // Print_Linear_System( system, control, workspace, far_nbrs, data->step ); + + //t_start = Get_Time ( ); + matvecs = GMRES( workspace, &workspace->H, + workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system ); + matvecs += GMRES( workspace, &workspace->H, + workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system ); + //t_elapsed = Get_Timing_Info ( t_start ); + + //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed ); + + //matvecs = GMRES_HouseHolder( workspace, workspace->H, + // workspace->b_s, control->q_err, workspace->s[0], out_control->log ); + //matvecs += GMRES_HouseHolder( workspace, workspace->H, + // workspace->b_t, control->q_err, workspace->t[0], out_control->log ); + + //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err, + // &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, system ); + //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err, + // &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, system ); + + //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, + // workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1; + ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, + // workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1; + + //matvecs = CG( workspace, workspace->H, + // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; + //matvecs += CG( workspace, workspace->H, + // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; + + //matvecs = SDM( workspace, workspace->H, + // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; + //matvecs += SDM( workspace, workspace->H, + // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; + + //fprintf (stderr, " GMRES done with iterations %d \n", matvecs ); + + data->timing.matvecs += matvecs; +#if defined(DEBUG_FOCUS) + fprintf( stderr, "linsolve-" ); +#endif + + Calculate_Charges( system, workspace ); + //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", + // data->step, + // workspace->s[0][0], workspace->t[0][0], + // workspace->s[0][1], workspace->t[0][1], + // workspace->s[0][2], workspace->t[0][2] ); + // if( data->step == control->nsteps ) + //Print_Charges( system, control, workspace, data->step ); +} diff --git a/PuReMD-GPU/src/QEq.cu b/PuReMD-GPU/src/QEq.cu deleted file mode 100644 index 5d849b261b2e8396ec2243f6985f12e335c80430..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/QEq.cu +++ /dev/null @@ -1,1073 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. - ----------------------------------------------------------------------*/ - -#include "QEq.h" -#include "allocate.h" -#include "GMRES.h" -#include "list.h" -#include "print_utils.h" -#include "index_utils.h" - -#include "cuda_utils.h" -#include "cuda_init.h" -#include "cuda_copy.h" -#include "sort.h" -#include "validation.h" -#include "reduction.h" - -#include "system_props.h" - -HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) -{ - sparse_matrix_entry temp = array[index1]; - array[index1] = array[index2]; - array[index2] = temp; -} - -HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end) -{ - int i = start; - int k = end; - - if (end - start >= 1) - { - int pivot = array[start].j; - - while (k > i) - { - while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++; - while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--; - if (k > i) swap(array, i, k); - } - swap(array, start, k); - quick_sort(array, start, k - 1); - quick_sort(array, k + 1, end); - } -} - -int compare_matrix_entry(const void *v1, const void *v2) -{ - return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j; -} - - -void Sort_Matrix_Rows( sparse_matrix *A ) -{ - int i, si, ei; - - for( i = 0; i < A->n; ++i ) { - si = A->start[i]; - ei = A->start[i+1]; - qsort( &(A->entries[si]), ei - si, - sizeof(sparse_matrix_entry), compare_matrix_entry ); - } -} - -GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A ) -{ - int i; - int si, ei; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= A.n ) return; - - si = A.start[i]; - ei = A.end [i]; - - quick_sort( A.entries + si, 0, ei-si-1 ); -} - - -void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol ) -{ - int i, j, k; - real val; - - /* init droptol to 0 */ - for( i = 0; i < A->n; ++i ) - droptol[i] = 0; - - /* calculate sqaure of the norm of each row */ - for( i = 0; i < A->n; ++i ) { - for( k = A->start[i]; k < A->start[i+1]-1; ++k ) { - j = A->entries[k].j; - val = A->entries[k].val; - - droptol[i] += val*val; - droptol[j] += val*val; - } - - val = A->entries[k].val; // diagonal entry - droptol[i] += val*val; - } - - /* calculate local droptol for each row */ - //fprintf( stderr, "droptol: " ); - for( i = 0; i < A->n; ++i ) { - //fprintf( stderr, "%f-->", droptol[i] ); - droptol[i] = SQRT( droptol[i] ) * dtol; - //fprintf( stderr, "%f ", droptol[i] ); - } - //fprintf( stderr, "\n" ); -} - -GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol ) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; - real val; - sparse_matrix *A = &p_A; - - if ( i < A->n ) { - droptol [i] = 0; - - for (k = A->start[i]; k < A->end[i]; ++k ) { - val = A->entries[k].val; - droptol [i] += val*val; - } - } - - __syncthreads (); - if ( i < A->n ) { - droptol [i] = SQRT (droptol[i]) * dtol; - } - -} - -GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol ) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - int k, j, offset, x, diagnol; - real val; - sparse_matrix *A = &p_A; - - for (x = 0; x < A->n; x ++) - { - if (i < (A->end[i]-1 - A->start[i])) { - offset = A->start [i] + i; - j = A->entries[offset].j; - val = A->entries[offset].val; - droptol [j] += val * val; - } - __syncthreads (); - } -} - -GLOBAL void Cuda_Calculate_Droptol_diagnol ( sparse_matrix p_A, real *droptol, real dtol ) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - 
int k, j, offset, x, diagnol; - real val; - sparse_matrix *A = &p_A; - - if ( i < A->n ) { - //diagnol element - diagnol = A->end[i]-1; - val = A->entries [diagnol].val; - droptol [i] += val*val; - } - - /*calculate local droptol for each row*/ - if ( i < A->n ) - droptol [i] = SQRT (droptol[i]) * dtol; -} - - -int Estimate_LU_Fill( sparse_matrix *A, real *droptol ) -{ - int i, j, pj; - int fillin; - real val; - - fillin = 0; - - //fprintf( stderr, "n: %d\n", A->n ); - for( i = 0; i < A->n; ++i ) - for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ - j = A->entries[pj].j; - val = A->entries[pj].val; - //fprintf( stderr, "i: %d, j: %d", i, j ); - - if( fabs(val) > droptol[i] ) - ++fillin; - } - - return fillin + A->n; -} - -GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin) -{ - int i, j, pj; - real val; - sparse_matrix *A = &p_A; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= A->n) return; - - fillin [i] = 0; - - for (pj = A->start[i]; pj < A->end[i]-1; ++pj) - { - j = A->entries [pj].j; - val = A->entries[pj].val; - - if (fabs (val) > droptol [i]) ++fillin [i]; - } -} - -void ICHOLT( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) -{ - sparse_matrix_entry tmp[1000]; - int i, j, pj, k1, k2, tmptop, Ltop; - real val; - int *Utop; - - Utop = (int*) malloc((A->n+1) * sizeof(int)); - - // clear variables - Ltop = 0; - tmptop = 0; - for( i = 0; i <= A->n; ++i ) - L->start[i] = U->start[i] = 0; - - for( i = 0; i < A->n; ++i ) - Utop[i] = 0; - - //fprintf( stderr, "n: %d\n", A->n ); - for( i = 0; i < A->n; ++i ){ - L->start[i] = Ltop; - tmptop = 0; - - for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){ - j = A->entries[pj].j; - val = A->entries[pj].val; - //fprintf( stderr, "i: %d, j: %d", i, j ); - - if( fabs(val) > droptol[i] ){ - k1 = 0; - k2 = L->start[j]; - while( k1 < tmptop && k2 < L->start[j+1] ){ - if( tmp[k1].j < L->entries[k2].j ) - ++k1; - else if( tmp[k1].j > L->entries[k2].j ) - ++k2; - else - val -= (tmp[k1++].val * L->entries[k2++].val); - } - - // L matrix is lower triangular, - // so right before the start of next row comes jth diagonal - val /= L->entries[L->start[j+1]-1].val; - - tmp[tmptop].j = j; - tmp[tmptop].val = val; - ++tmptop; - } - //fprintf( stderr, " -- done\n" ); - } - - // compute the ith diagonal in L - // sanity check - if( A->entries[pj].j != i ) { - fprintf( stderr, "i=%d, badly built A matrix!\n", i ); - exit(999); - } - - val = A->entries[pj].val; - for( k1 = 0; k1 < tmptop; ++k1 ) - val -= (tmp[k1].val * tmp[k1].val); - - tmp[tmptop].j = i; - tmp[tmptop].val = SQRT(val); - - // apply the dropping rule once again - //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); - //for( k1 = 0; k1<= tmptop; ++k1 ) - // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); - for( k1 = 0; k1 < tmptop; ++k1 ) - if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - U->start[tmp[k1].j+1]++; - ++Ltop; - //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); - } - // keep the diagonal in any case - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - ++Ltop; - //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); - } - - L->start[i] = Ltop; - //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); - - for( i = 1; i <= U->n; ++i ) - Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; - - for( i 
= 0; i < L->n; ++i ) - for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ - j = L->entries[pj].j; - U->entries[Utop[j]].j = i; - U->entries[Utop[j]].val = L->entries[pj].val; - Utop[j]++; - } - - //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); -} - - - -void Cuda_ICHOLT( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) -{ - sparse_matrix_entry tmp[1000]; - int i, j, pj, k1, k2, tmptop, Ltop; - real val; - int *Utop; - - Utop = (int*) malloc((A->n+1) * sizeof(int)); - - // clear variables - Ltop = 0; - tmptop = 0; - for( i = 0; i <= A->n; ++i ) - L->start[i] = U->start[i] = 0; - - for( i = 0; i < A->n; ++i ) - Utop[i] = 0; - - //fprintf( stderr, "n: %d\n", A->n ); - for( i = 0; i < A->n; ++i ){ - L->start[i] = Ltop; - tmptop = 0; - - for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){ - j = A->entries[pj].j; - val = A->entries[pj].val; - //fprintf( stderr, "i: %d, j: %d", i, j ); - - //CHANGE ORIGINAL - if (j >= i) break; - //CHANGE ORIGINAL - - if( fabs(val) > droptol[i] ){ - k1 = 0; - k2 = L->start[j]; - while( k1 < tmptop && k2 < L->start[j+1] ){ - if( tmp[k1].j < L->entries[k2].j ) - ++k1; - else if( tmp[k1].j > L->entries[k2].j ) - ++k2; - else - val -= (tmp[k1++].val * L->entries[k2++].val); - } - - // L matrix is lower triangular, - // so right before the start of next row comes jth diagonal - val /= L->entries[L->start[j+1]-1].val; - - tmp[tmptop].j = j; - tmp[tmptop].val = val; - ++tmptop; - } - - //fprintf( stderr, " -- done\n" ); - } - - // compute the ith diagonal in L - // sanity check - if( A->entries[pj].j != i ) { - fprintf( stderr, "i=%d, badly built A matrix!\n", i ); - exit(999); - } - - val = A->entries[pj].val; - for( k1 = 0; k1 < tmptop; ++k1 ) - val -= (tmp[k1].val * tmp[k1].val); - - tmp[tmptop].j = i; - tmp[tmptop].val = SQRT(val); - - // apply the dropping rule once again - //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); - //for( k1 = 0; k1<= tmptop; ++k1 ) - // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); - //fprintf( stderr, "\n" ); - //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); - for( k1 = 0; k1 < tmptop; ++k1 ) - if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - U->start[tmp[k1].j+1]++; - ++Ltop; - //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); - } - // keep the diagonal in any case - L->entries[Ltop].j = tmp[k1].j; - L->entries[Ltop].val = tmp[k1].val; - ++Ltop; - //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); - } - - L->start[i] = Ltop; - //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); - - for( i = 1; i <= U->n; ++i ) - Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; - - for( i = 0; i < L->n; ++i ) - for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ - j = L->entries[pj].j; - U->entries[Utop[j]].j = i; - U->entries[Utop[j]].val = L->entries[pj].val; - Utop[j]++; - } - - //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); -} - - - -/* -//Parallel for each row -//Each kernel will run for 6540 number of times. 
-GLOBAL void Cuda_ICHOLT( reax_system *system, sparse_matrix p_A, real *droptol, -sparse_matrix p_L, sparse_matrix p_U ) -{ -int start, end, count; -real tempvalue, val; -int i,pj,tmptop, offset; -int j, k1, k2; - -sparse_matrix *A, *L, *U; -sparse_matrix_entry *tmp; - -A = &p_A; -L = &p_L; -U = &p_U; - -real *null_val; -null_val = 0; - -extern __shared__ real tmp_val[]; -extern __shared__ sparse_matrix_entry sh_tmp[]; - -int kid = blockIdx.x * blockDim.x + threadIdx.x; -tmp = (sparse_matrix_entry *) (tmp_val + blockDim.x); - -offset = 0; -for( i = 0; i < 10; ++i ) -{ -//if (kid == 0) L->start[i] = i * system->max_sparse_matrix_entries; -if (kid == 0) L->start[i] = offset; -tmptop = 0; - -start = A->start[i]; -end = A->end[i]-1; //inclusive -count = end - start; //inclusive -tmp_val [kid] = 0; - -if (kid < count) //diagnol not included -{ -pj = start + kid; - -j = A->entries[pj].j; -val = A->entries[pj].val; - -if( fabs(val) > droptol[i] ) -{ -k1 = 0; -k2 = L->start[j]; -while( k1 < tmptop && k2 < L->end[j] ){ -if( tmp[k1].j < L->entries[k2].j ) -++k1; -else if( tmp[k1].j > L->entries[k2].j ) -++k2; -else -tmp_val[kid] = (tmp[k1++].val * L->entries[k2++].val); -} - -//here read the shared memory of all the kernels -if (kid == 0) -{ -for (i = 0; i < count; i++) -tempvalue += tmp_val [i]; - -val -= tempvalue; - -// L matrix is lower triangular, -// so right before the start of next row comes jth diagonal -val /= L->entries[L->end[j]-1].val; - -tmp[tmptop].j = j; -tmp[tmptop].val = val; -++tmptop; -} -} -} -__syncthreads (); - - -// compute the ith diagonal in L -// sanity check -if (kid == 0) -{ - if( A->entries[end].j != i ) { - //intentional core dump here for sanity sake - *null_val = 1; - } -} - -//diagnol element -//val = A->entries[pj].val; -//for( k1 = 0; k1 < tmptop; ++k1 ) -if (kid < count) - tmp_val[kid] = (tmp[kid].val * tmp[kid].val); - - __syncthreads (); - -if (kid == 0) -{ - val = A->entries [end].val; - for (i = 0; i < count; i++) - tempvalue += tmp_val [i]; - - val -= tempvalue; - tmp[tmptop].j = i; - tmp[tmptop].val = SQRT(val); -} -__syncthreads (); - -//Fill in the LU entries -//for( k1 = 0; k1 < count; ++k1 ) -if (kid < count ) -{ - if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){ - L->entries[offset + kid].j = tmp[kid].j; - L->entries[offset + kid].val = tmp[kid].val; - U->start[tmp[kid].j+1]++; - } -} -__syncthreads (); - -if (kid == 0) { - // keep the diagonal in any case - offset += count; - L->entries[offset].j = tmp[count].j; - L->entries[offset].val = tmp[count].val; - ++offset; - L->end [i] = offset; -} -__syncthreads (); -} // end of main for loop -} - -void Cuda_Fill_U ( sparse_matrix *A, real *droptol, - sparse_matrix *L, sparse_matrix *U ) -{ - int i, pj, j; - - for( i = 1; i <= U->n; ++i ) - Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; - - for( i = 0; i < L->n; ++i ) - for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ - j = L->entries[pj].j; - U->entries[Utop[j]].j = i; - U->entries[Utop[j]].val = L->entries[pj].val; - Utop[j]++; - } -} -*/ - - -void Init_MatVec( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list *far_nbrs ) -{ - int i, fillin; - real s_tmp, t_tmp; - //char fname[100]; - - if(control->refactor > 0 && - ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){ - //Print_Linear_System( system, control, workspace, data->step ); - Sort_Matrix_Rows( &workspace->H ); - - //fprintf( stderr, "H matrix sorted\n" ); - - Calculate_Droptol( 
&workspace->H, workspace->droptol, control->droptol ); - //fprintf( stderr, "drop tolerances calculated\n" ); - - - if( workspace->L.entries == NULL ) { - fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol ); -#ifdef __DEBUG_CUDA__ - fprintf( stderr, "fillin = %d\n", fillin ); -#endif - if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 || - Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){ - fprintf( stderr, "not enough memory for LU matrices. terminating.\n" ); - exit(INSUFFICIENT_SPACE); - } -#if defined(DEBUG_FOCUS) - fprintf( stderr, "fillin = %d\n", fillin ); - fprintf( stderr, "allocated memory: L = U = %ldMB\n", - fillin * sizeof(sparse_matrix_entry) / (1024*1024) ); -#endif - } - - ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "icholt-" ); - //sprintf( fname, "%s.L%d.out", control->sim_name, data->step ); - //Print_Sparse_Matrix2( workspace->L, fname ); - //Print_Sparse_Matrix( U ); -#endif - } - - /* extrapolation for s & t */ - for( i = 0; i < system->N; ++i ) { - // no extrapolation - //s_tmp = workspace->s[0][i]; - //t_tmp = workspace->t[0][i]; - - // linear - //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; - //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; - - // quadratic - //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); - t_tmp = workspace->t[index_wkspace_sys(2,i,system)] + 3*(workspace->t[index_wkspace_sys(0,i,system)]-workspace->t[index_wkspace_sys(1,i,system)]); - - // cubic - s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system)] + workspace->s[index_wkspace_sys(2,i,system)]) - - (6 * workspace->s[index_wkspace_sys(1,i,system)] + workspace->s[index_wkspace_sys(3,i,system)] ); - //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - - // (6 * workspace->t[1][i] + workspace->t[3][i] ); - - // 4th order - //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + - // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; - //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + - // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; - - workspace->s[index_wkspace_sys(4,i,system)] = workspace->s[index_wkspace_sys(3,i,system)]; - workspace->s[index_wkspace_sys(3,i,system)] = workspace->s[index_wkspace_sys(2,i,system)]; - workspace->s[index_wkspace_sys(2,i,system)] = workspace->s[index_wkspace_sys(1,i,system)]; - workspace->s[index_wkspace_sys(1,i,system)] = workspace->s[index_wkspace_sys(0,i,system)]; - workspace->s[index_wkspace_sys(0,i,system)] = s_tmp; - - workspace->t[index_wkspace_sys(4,i,system)] = workspace->t[index_wkspace_sys(3,i,system)]; - workspace->t[index_wkspace_sys(3,i,system)] = workspace->t[index_wkspace_sys(2,i,system)]; - workspace->t[index_wkspace_sys(2,i,system)] = workspace->t[index_wkspace_sys(1,i,system)]; - workspace->t[index_wkspace_sys(1,i,system)] = workspace->t[index_wkspace_sys(0,i,system)]; - workspace->t[index_wkspace_sys(0,i,system)] = t_tmp; - } -} - -void Cuda_Init_MatVec( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list *far_nbrs ) -{ - int i, fillin; - real s_tmp, t_tmp; - int *spad = (int *)scratch; - real start = 0, end = 0; - - if(control->refactor > 0 && - ((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){ - - Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H ); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef 
__DEBUG_CUDA__ - fprintf (stderr, "Sorting done... \n"); -#endif - - Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H, dev_workspace->droptol, control->droptol ); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Droptol done... \n"); -#endif - - if( dev_workspace->L.entries == NULL ) { - - cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH ); - Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H, dev_workspace->droptol, spad ); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for fill in - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> - (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); - fillin += dev_workspace->H.n; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin ); -#endif - - dev_workspace->L.n = far_nbrs->n; - dev_workspace->L.m = fillin; - Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n ); - - dev_workspace->U.n = far_nbrs->n; - dev_workspace->U.m = fillin; - Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n ); - } - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "LU matrix done...\n"); -#endif - - //TODO -- This is the ILU Factorization of the H Matrix. - //This is present in the CUDA 5.0 compilation which is not working currently. - //Fix this when CUDA 5.0 is correctly setup. - //TODO - //shared memory is per block - // here we have only one block - - /* - fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries ); - Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, - system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE) >>> - ( system, dev_workspace->H, - dev_workspace->droptol, - dev_workspace->L, - dev_workspace->U ); - cudaThreadSynchronize (); - fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ()); - */ - - //1. copy the H matrix from device to host - //2. Allocate the L/U matrices on the host and device. - //3. Compute the L/U on the host - //4. copy the results to the device - //5. Continue the computation. 
- sparse_matrix t_H, t_L, t_U; - real *t_droptol; - - t_droptol = (real *) malloc (REAL_SIZE * system->N); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m ); -#endif - start = Get_Time (); - if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);} - if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); } - if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); } - - copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY ); - - copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL ); - - //fprintf (stderr, " Done copying LUH .. \n"); - Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U); - - Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice); - end += Get_Timing_Info (start); - - /* - fprintf (stderr, "Done syncing .... \n"); - free (t_droptol); - fprintf (stderr, "Freed droptol ... \n"); - Deallocate_Matrix (&t_H); - fprintf (stderr, "Freed H ... \n"); - Deallocate_Matrix (&t_L); - fprintf (stderr, "Freed l ... \n"); - Deallocate_Matrix (&t_U); - fprintf (stderr, "Freed u ... \n"); - */ - - //#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end); - //#endif - - //#ifdef __BUILD_DEBUG__ - // validate_lu (workspace); - //#endif - } -} - -GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N ) -{ - - static_storage *workspace = &p_workspace; - real s_tmp, t_tmp; - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= N) return; - // no extrapolation - //s_tmp = workspace->s[0][i]; - //t_tmp = workspace->t[0][i]; - - // linear - //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; - //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; - - // quadratic - //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); - t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]); - - // cubic - s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - - (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] ); - //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - - // (6 * workspace->t[1][i] + workspace->t[3][i] ); - - // 4th order - //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + - // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; - //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + - // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; - - workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)]; - workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; - workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)]; - workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)]; - 
workspace->s[index_wkspace_sys(0,i,N)] = s_tmp; - - workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)]; - workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; - workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)]; - workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)]; - workspace->t[index_wkspace_sys(0,i,N)] = t_tmp; -} - -void Calculate_Charges( reax_system *system, static_storage *workspace ) -{ - int i; - real u, s_sum, t_sum; - - s_sum = t_sum = 0.; - for( i = 0; i < system->N; ++i ) { - s_sum += workspace->s[index_wkspace_sys(0,i,system)]; - t_sum += workspace->t[index_wkspace_sys(0,i,system)]; - } - - u = s_sum / t_sum; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); -#endif - - for( i = 0; i < system->N; ++i ) - system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)]; -} - -GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N) -{ - int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i >= N) return; - - atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)]; -} - -void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace) -{ - real *spad = (real *) scratch; - real u, s_sum, t_sum; - - cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); - - //s_sum - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - //t_sum - cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - //fraction here - u = s_sum / t_sum; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); -#endif - - Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>> - ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N); - cudaThreadSynchronize (); - cudaCheckError (); -} - - -void QEq( reax_system *system, control_params *control, simulation_data *data, - static_storage *workspace, list *far_nbrs, - output_controls *out_control ) -{ - int matvecs; - - //real t_start, t_elapsed; - - //t_start = Get_Time (); - Init_MatVec( system, control, data, workspace, far_nbrs ); - //t_elapsed = Get_Timing_Info ( t_start ); - - //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed ); - - //if( data->step % 10 == 0 ) - // Print_Linear_System( system, control, workspace, far_nbrs, data->step ); - - //t_start = Get_Time ( ); - matvecs = GMRES( workspace, &workspace->H, - workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system ); - matvecs += GMRES( workspace, &workspace->H, 
- workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system ); - //t_elapsed = Get_Timing_Info ( t_start ); - - //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed ); - - //matvecs = GMRES_HouseHolder( workspace, workspace->H, - // workspace->b_s, control->q_err, workspace->s[0], out_control->log ); - //matvecs += GMRES_HouseHolder( workspace, workspace->H, - // workspace->b_t, control->q_err, workspace->t[0], out_control->log ); - - //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err, - // &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system)], out_control->log, system ); - //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err, - // &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system)], out_control->log, system ); - - //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, - // workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1; - ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, - // workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1; - - //matvecs = CG( workspace, workspace->H, - // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; - //matvecs += CG( workspace, workspace->H, - // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; - - //matvecs = SDM( workspace, workspace->H, - // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1; - //matvecs += SDM( workspace, workspace->H, - // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1; - - //fprintf (stderr, " GMRES done with iterations %d \n", matvecs ); - - data->timing.matvecs += matvecs; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "linsolve-" ); -#endif - - Calculate_Charges( system, workspace ); - //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", - // data->step, - // workspace->s[0][0], workspace->t[0][0], - // workspace->s[0][1], workspace->t[0][1], - // workspace->s[0][2], workspace->t[0][2] ); - // if( data->step == control->nsteps ) - //Print_Charges( system, control, workspace, data->step ); -} - -void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, - static_storage *workspace, list *far_nbrs, - output_controls *out_control ) -{ - int matvecs = 0; - real t_start, t_elapsed; - -#ifdef __DEBUG_CUDA__ - t_start = Get_Time (); -#endif - - /* - //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs ); - - Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>> - ( dev_workspace->H ); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info (t_start); - fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed); - */ - Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>> - (*dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info (t_start); - fprintf (stderr, "Done with post processing of init_matvec --> %d with time ---> %f \n", cudaGetLastError (), t_elapsed); -#endif - - //Here goes the GMRES part of the program () - //#ifdef __DEBUG_CUDA__ - t_start = Get_Time (); - //#endif - - //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); - //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); - - matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); - matvecs += Cublas_GMRES( system, dev_workspace, 
dev_workspace->b_t, control->q_err, dev_workspace->t ); - - d_timing.matvecs += matvecs; - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info ( t_start ); - fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed ); -#endif - - //Here cuda calculate charges - Cuda_Calculate_Charges (system, workspace); -} diff --git a/PuReMD-GPU/src/QEq.h b/PuReMD-GPU/src/QEq.h index 27eceb9764b3a412728a13f37f6735f086afa546..31dfbf61ba05ec79d32313c3ab648eb259f183f2 100644 --- a/PuReMD-GPU/src/QEq.h +++ b/PuReMD-GPU/src/QEq.h @@ -23,10 +23,39 @@ #include "mytypes.h" + void QEq( reax_system*, control_params*, simulation_data*, static_storage*, - list*, output_controls* ); + list*, output_controls* ); + + +static inline HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) +{ + sparse_matrix_entry temp = array[index1]; + array[index1] = array[index2]; + array[index2] = temp; +} + + +static inline HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end) +{ + int i = start; + int k = end; + + if (end - start >= 1) + { + int pivot = array[start].j; + + while (k > i) + { + while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++; + while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--; + if (k > i) swap(array, i, k); + } + swap(array, start, k); + quick_sort(array, start, k - 1); + quick_sort(array, k + 1, end); + } +} -void Cuda_QEq( reax_system*, control_params*, simulation_data*, static_storage*, - list*, output_controls* ); #endif diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c new file mode 100644 index 0000000000000000000000000000000000000000..65f0eb2a872673259d508f17fc0da43530a7426f --- /dev/null +++ b/PuReMD-GPU/src/allocate.c @@ -0,0 +1,281 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "allocate.h" + +#include "list.h" + + +void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) +{ + Delete_List( far_nbrs ); + if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )) + { + fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n");
+        exit( INIT_ERR );
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
+            num_intrs, far_nbrs->num_intrs );
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n",
+            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
+#endif
+}
+
+
+HOST int Allocate_Matrix( sparse_matrix *H, int n, int m )
+{
+    H->n = n;
+    H->m = m;
+
+    /* n + 1 row pointers are needed (one extra as the end sentinel); the
+     * parentheses around (n + 1) matter: sizeof(int) * n+1 would allocate
+     * only sizeof(int)*n + 1 bytes and overflow when start[n] is written */
+    if( (H->start = (int*) malloc(sizeof(int) * (n + 1))) == NULL )
+        return 0;
+
+    if( (H->end = (int*) malloc(sizeof(int) * (n + 1))) == NULL )
+        return 0;
+
+    if( (H->entries =
+        (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
+        return 0;
+
+    return 1;
+}
+
+
+void Deallocate_Matrix( sparse_matrix *H )
+{
+    free(H->start);
+    free(H->entries);
+    free(H->end);
+}
+
+
+int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
+{
+    Deallocate_Matrix( H );
+    if( !Allocate_Matrix( H, n, m ) ) {
+        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
+        exit( 1 );
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
+            name, n, m );
+    fprintf( stderr, "memory allocated: %s = %ldMB\n",
+            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
+#endif
+    return 1;
+}
+
+
+int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top,
+        list *hbonds )
+{
+    int i, num_hbonds;
+
+    num_hbonds = 0;
+    /* find starting indexes for each H and the total number of hbonds */
+    for( i = 1; i < n; ++i )
+        hb_top[i] += hb_top[i-1];
+    num_hbonds = hb_top[n-1];
+
+    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) )
+    {
+        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    for( i = 0; i < n; ++i )
+        if( h_index[i] == 0 ){
+            Set_Start_Index( 0, 0, hbonds );
+            Set_End_Index( 0, 0, hbonds );
+        }
+        else if( h_index[i] > 0 ){
+            Set_Start_Index( h_index[i], hb_top[i-1], hbonds );
+            Set_End_Index( h_index[i], hb_top[i-1], hbonds );
+        }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
+    fprintf( stderr, "memory allocated: hbonds = %ldMB\n",
+            num_hbonds * sizeof(hbond_data) / (1024*1024) );
+#endif
+    return 1;
+}
+
+
+int Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds )
+{
+    int i;
+    int *hb_top;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating hbonds\n" );
+#endif
+    hb_top = (int *)calloc( n, sizeof(int) );
+    for( i = 0; i < n; ++i )
+        if( h_index[i] >= 0 )
+            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
+
+    Delete_List( hbonds );
+
+    Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
+
+    free( hb_top );
+
+    return 1;
+}
+
+
+int Allocate_Bond_List( int n, int *bond_top, list *bonds )
+{
+    int i, num_bonds;
+
+    num_bonds = 0;
+    /* find starting indexes for each atom and the total number of bonds */
+    for( i = 1; i < n; ++i )
+        bond_top[i] += bond_top[i-1];
+    num_bonds = bond_top[n-1];
+
+    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) )
+    {
+        fprintf( stderr, "not enough space for bonds list. 
terminating!\n" ); + exit( INIT_ERR ); + } + + Set_Start_Index( 0, 0, bonds ); + Set_End_Index( 0, 0, bonds ); + for( i = 1; i < n; ++i ) { + Set_Start_Index( i, bond_top[i-1], bonds ); + Set_End_Index( i, bond_top[i-1], bonds ); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "memory allocated: bonds = %ldMB\n", + num_bonds * sizeof(bond_data) / (1024*1024) ); +#endif + return 1; +} + + +int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body ) +{ + int i; + int *bond_top; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "reallocating bonds\n" ); +#endif + bond_top = (int *)calloc( n, sizeof(int) ); + *est_3body = 0; + for( i = 0; i < n; ++i ){ + *est_3body += SQR( Num_Entries( i, bonds ) ); + bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS ); + } + + Delete_List( bonds ); + + Allocate_Bond_List( n, bond_top, bonds ); + *num_bonds = bond_top[n-1]; + + free( bond_top ); + + return 1; +} + + +void Reallocate( reax_system *system, static_storage *workspace, list **lists, + int nbr_flag ) +{ + int num_bonds, est_3body; + reallocate_data *realloc; + grid *g; + + realloc = &(workspace->realloc); + g = &(system->g); + + if( realloc->num_far > 0 && nbr_flag ) { + fprintf (stderr, " Reallocating neighbors \n"); + Reallocate_Neighbor_List( (*lists)+FAR_NBRS, + system->N, realloc->num_far * SAFE_ZONE ); + realloc->num_far = -1; + } + + if( realloc->Htop > 0 ){ + fprintf (stderr, " Reallocating Matrix \n"); + Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); + realloc->Htop = -1; + + Deallocate_Matrix( &workspace->L ); + Deallocate_Matrix( &workspace->U ); + } + + if( realloc->hbonds > 0 ){ + fprintf (stderr, " Reallocating hbonds \n"); + Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, + (*lists)+HBONDS ); + realloc->hbonds = -1; + } + + num_bonds = est_3body = -1; + if( realloc->bonds > 0 ){ + fprintf (stderr, " Reallocating bonds \n"); + Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body ); + realloc->bonds = -1; + realloc->num_3body = MAX( realloc->num_3body, est_3body ); + } + + if( realloc->num_3body > 0 ) { + fprintf (stderr, " Reallocating 3Body \n"); + Delete_List( (*lists)+THREE_BODIES ); + + if( num_bonds == -1 ) + num_bonds = ((*lists)+BONDS)->num_intrs; + realloc->num_3body *= SAFE_ZONE; + + if( !Make_List( num_bonds, realloc->num_3body, + TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) + { + fprintf( stderr, "Problem in initializing angles list. 
Terminating!\n" ); + exit( INIT_ERR ); + } + realloc->num_3body = -1; +#if defined(DEBUG_FOCUS) + fprintf( stderr, "reallocating 3 bodies\n" ); + fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body ); + fprintf( stderr, "reallocated 3body memory: %ldMB\n", + realloc->num_3body*sizeof(three_body_interaction_data)/ + (1024*1024) ); +#endif + } + + if( realloc->gcell_atoms > -1 ){ +#if defined(DEBUG_FOCUS) + fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); +#endif + + free (g->atoms); + g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], + sizeof (int) * workspace->realloc.gcell_atoms); + realloc->gcell_atoms = -1; + } +} diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h index 7ab146bbb763073377b2ffb16905b6694a85765d..b03ed80b34f153b9929ccaa80bc5c27fbf6ce540 100644 --- a/PuReMD-GPU/src/allocate.h +++ b/PuReMD-GPU/src/allocate.h @@ -23,6 +23,11 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Reallocate( reax_system*, static_storage*, list**, int ); int Allocate_Matrix( sparse_matrix*, int, int ); @@ -32,13 +37,9 @@ int Allocate_HBond_List( int, int, int*, int*, list* ); int Allocate_Bond_List( int, int*, list* ); -//Cuda Functions -int Cuda_Allocate_Matrix( sparse_matrix*, int, int ); -int Cuda_Allocate_HBond_List( int, int, int*, int*, list* ); -int Cuda_Allocate_Bond_List( int, int*, list* ); -void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int ); +#ifdef __cplusplus +} +#endif -GLOBAL void Init_HBond_Indexes ( int *, int *, list , int ); -GLOBAL void Init_Bond_Indexes ( int *, list , int ); #endif diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.c similarity index 57% rename from PuReMD-GPU/src/bond_orders.cu rename to PuReMD-GPU/src/bond_orders.c index 57f5baacb761a34cd8012f28b6ecb3e7c63d6081..49eaed6532449ef34ee3dac26a71462b2edbdd36 100644 --- a/PuReMD-GPU/src/bond_orders.cu +++ b/PuReMD-GPU/src/bond_orders.c @@ -19,23 +19,21 @@ ----------------------------------------------------------------------*/ #include "bond_orders.h" + +#include "index_utils.h" #include "list.h" #include "lookup.h" #include "print_utils.h" #include "vector.h" -#include "index_utils.h" -#include "cuda_utils.h" -#include "cuda_helpers.h" - - inline real Cf45( real p1, real p2 ) { return -EXP(-p2 / 2) / ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) ); } + #ifdef TEST_FORCES void Get_dBO( reax_system *system, list **lists, int i, int pj, real C, rvec *v ) @@ -66,7 +64,8 @@ void Get_dBOpinpi2( reax_system *system, list **lists, start_pj = Start_Index(pj, dBOs); end_pj = End_Index(pj, dBOs); - for( k = start_pj; k < end_pj; ++k ) { + for( k = start_pj; k < end_pj; ++k ) + { dbo_k = &(dBOs->select.dbo_list[k]); rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi ); rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 ); @@ -179,7 +178,6 @@ void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C ) } - HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, list p_bonds, list p_dBOs, int *top ) { @@ -367,7 +365,6 @@ HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, #endif - void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, simulation_data *data, static_storage *workspace, list **lists ) @@ -521,134 +518,7 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, temp[0], temp[1], temp[2] ); */ } 
-///////////////////////////////////////////////////////////// -//Cuda Functions -///////////////////////////////////////////////////////////// - -HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, - simulation_data *data, static_storage *workspace, - list *bonds ) -{ - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - rvec temp, ext_press; - ivec rel_box; - int pk, k, j; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - /************************************ - * forces related to atom i * - * first neighbors of atom i * - ************************************/ - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp ); /*2nd,dBO*/ - rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( atoms[k].f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - } - - /* then atom i itself */ - rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd, dBO*/ - - rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/ - - rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/ - - rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ; /*1st,dBO_pi2*/ - rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBO_pi2*/ - rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/ - - /* force */ - rvec_Add( atoms[i].f, temp ); - /* ext pressure due to i dropped, counting force on j only will be enough */ - - - /**************************************************************************** - * forces and pressure related to atom j * - * first neighbors of atom j * - ***************************************************************************/ - for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { - nbr_k = 
&(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - - rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ - rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ - rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/ - - /* force */ - rvec_Add( atoms[k].f, temp ); - /* pressure */ - if( k != i ) { - ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box wrt i - rvec_iMultiply( ext_press, rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - } - } - - /* then atom j itself */ - rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ - - rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ - rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/ - - rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ - rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ - rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/ - - rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2); /*1st,dBOpi2*/ - rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBOpi2*/ - rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ - - /* force */ - rvec_Add( atoms[j].f, temp ); - /* pressure */ - rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); -} - -///////////////////////////////////////////////////////////// -//Cuda Functions -///////////////////////////////////////////////////////////// void Add_dBond_to_Forces( int i, int pj, reax_system *system, simulation_data *data, static_storage *workspace, list **lists ) @@ -761,154 +631,6 @@ void Add_dBond_to_Forces( int i, int pj, reax_system *system, /*3rd, dBOpi2*/ } -HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, - static_storage *workspace, list *bonds ) -{ - bond_data *nbr_j, *nbr_k; - bond_order_data *bo_ij, *bo_ji; - dbond_coefficients coef; - int pk, k, j; - rvec t_f; - - /* Initializations */ - nbr_j = &(bonds->select.bond_list[pj]); - j = nbr_j->nbr; - - if (i < j) - { - bo_ij = &(nbr_j->bo_data); - bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } else { - bo_ji = &(nbr_j->bo_data); - bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); - } - - coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); - - coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); - - coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); - - coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); - - if ( i < j) { - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = 
&(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - rvec_MakeZero (t_f); - - rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); - /*2nd, dBO*/ - rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi*/ - rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); - /*3rd, dBOpi2*/ - - //Store in the temp place - rvec_Add (nbr_k->t_f, t_f); - } - - rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] ); - /*2nd, dBO*/ - - rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp); - /*1st, dBO*/ - rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]); - /*2nd, dBO*/ - - rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*1st, dBOpi*/ - rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp ); - /*2nd, dBOpi*/ - rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]); - /*3rd, dBOpi*/ - - rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*1st, dBO_pi2*/ - rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp ); - /*2nd, dBO_pi2*/ - rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]); - /*3rd, dBO_pi2*/ - } - else - { - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - k = nbr_k->nbr; - rvec_MakeZero (t_f); - - rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp ); - /*3rd, dBO*/ - rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); - /*dDelta*/ - rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); - /*4th, dBOpi*/ - rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); - /*4th, dBOpi2*/ - - //Store in the temp place - rvec_Add (nbr_k->t_f, t_f); - } - - rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] ); - /*2nd, dBO*/ - - rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp ); - /*1st, dBO*/ - rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]); - /*2nd, dBO*/ - - rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi ); - /*1st, dBOpi*/ - rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp ); - /*2nd, dBOpi*/ - rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]); - /*3rd, dBOpi*/ - - rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); - /*1st, dBOpi2*/ - rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp ); - /*2nd, dBOpi2*/ - rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]); - /*3rd, dBOpi2*/ - } -} - -HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds) -{ - int pk; - bond_data *nbr_k, *nbr_k_sym; - - /* - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - rvec_Add (atoms[i].f, nbr_k->t_f); - } - */ - - for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { - nbr_k = &(bonds->select.bond_list[pk]); - nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] ); - - rvec_Add (atoms[i].f, nbr_k_sym->t_f); - } -} /* Locate j on i's list. This function assumes that j is there for sure! 
@@ -1031,7 +753,7 @@ void Calculate_Bond_Orders( reax_system *system, control_params *control, //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); if( i < j ) { - twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] ); + twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ] ); #ifdef TEST_FORCES Set_Start_Index( pj, top_dbo, dBOs ); /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", @@ -1348,557 +1070,3 @@ bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */ Print_Bond_Orders( system, control, data, workspace, lists, out_control ); #endif } - - -//Cuda Functions -GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, - static_storage workspace, int num_atom_types, int N ) -{ - int i, type_i; - real p_boc1, p_boc2; - single_body_parameters *sbp_i; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - /* Calculate Deltaprime, Deltaprime_boc values */ - type_i = atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; - workspace.Deltap_boc[i] = - workspace.total_bond_order[i] - sbp_i->valency_val; - workspace.total_bond_order[i] = 0; -} - - -/* A very important and crucial assumption here is that each segment - belonging to a different atom in nbrhoods->nbr_list is sorted in its own. - This can either be done in the general coordinator function or here */ - -GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, - two_body_parameters *tbp, static_storage workspace, list bonds, - list dDeltas, list dBOs, int num_atom_types, int N ) -{ - int i, j, pj, type_i, type_j; - int start_i, end_i; - int num_bonds, sym_index; - real p_boc1, p_boc2; - real val_i, Deltap_i, Deltap_boc_i; - real val_j, Deltap_j, Deltap_boc_j; - real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; - real exp_p1i, exp_p2i, exp_p1j, exp_p2j; - real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; - real Cf45_ij, Cf45_ji, p_lp1; - real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; - real explp1; - two_body_parameters *twbp; - bond_order_data *bo_ij, *bo_ji; - single_body_parameters *sbp_i, *sbp_j; - - -#if defined(TEST_FORCES) - int k, pk, start_j, end_j; - int top_dbo=0, top_dDelta=0; - dbond_data *pdbo; - dDelta_data *ptop_dDelta; - -#endif - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - num_bonds = 0; - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - /* Calculate Deltaprime, Deltaprime_boc values */ - //for( i = 0; i < system->N; ++i ) { - /* - if (i < N) { - type_i = atoms[i].type; - sbp_i = &(sbp[type_i]); - workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; - workspace.Deltap_boc[i] = - workspace.total_bond_order[i] - sbp_i->valency_val; - workspace.total_bond_order[i] = 0; - - } - - __syncthreads (); - */ - - - // fprintf( stderr, "done with uncorrected bond orders\n" ); - - - /* Corrected Bond Order calculations */ - //for( i = 0; i < system->N; ++i ) { - type_i = atoms[i].type; - sbp_i = &(sbp[type_i]); - val_i = sbp_i->valency; - Deltap_i = workspace.Deltap[i]; - Deltap_boc_i = workspace.Deltap_boc[i]; - start_i = Start_Index(i, &bonds); - end_i = End_Index(i, &bonds); - //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", - // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); - - for( pj = start_i; pj < end_i; ++pj ) { - j = bonds.select.bond_list[pj].nbr; - type_j = 
atoms[j].type; - bo_ij = &( bonds.select.bond_list[pj].bo_data ); - //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); - - if( i < j ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); -#ifdef TEST_FORCES - Set_Start_Index( pj, top_dbo, &dBOs ); - /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", - workspace->reverse_map[i], workspace->reverse_map[j], - twbp->ovc, twbp->v13cor, bo_ij->BO ); */ -#endif - if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { - /* There is no correction to bond orders nor to derivatives of - bond order prime! So we leave bond orders unchanged and - set derivative of bond order coefficients s.t. - dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */ - bo_ij->C1dbo = 1.000000; - bo_ij->C2dbo = 0.000000; - bo_ij->C3dbo = 0.000000; - - bo_ij->C1dbopi = bo_ij->BO_pi; - bo_ij->C2dbopi = 0.000000; - bo_ij->C3dbopi = 0.000000; - bo_ij->C4dbopi = 0.000000; - - bo_ij->C1dbopi2 = bo_ij->BO_pi2; - bo_ij->C2dbopi2 = 0.000000; - bo_ij->C3dbopi2 = 0.000000; - bo_ij->C4dbopi2 = 0.000000; - -#ifdef TEST_FORCES - pdbo = &(dBOs.select.dbo_list[ top_dbo ]); - - // compute dBO_ij/dr_i - pdbo->wrt = i; - rvec_Copy( pdbo->dBO, bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 ); - - // compute dBO_ij/dr_j - pdbo++; - pdbo->wrt = j; - rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp ); - rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi ); - rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 ); - - top_dbo += 2; -#endif - } - else { - val_j = sbp[type_j].valency; - Deltap_j = workspace.Deltap[j]; - Deltap_boc_j = workspace.Deltap_boc[j]; - - /* on page 1 */ - if( twbp->ovc >= 0.001 ) { - /* Correction for overcoordination */ - exp_p1i = EXP( -p_boc1 * Deltap_i ); - exp_p2i = EXP( -p_boc2 * Deltap_i ); - exp_p1j = EXP( -p_boc1 * Deltap_j ); - exp_p2j = EXP( -p_boc2 * Deltap_j ); - - f2 = exp_p1i + exp_p1j; - f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) ); - f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + - ( val_j + f2 )/( val_j + f2 + f3 ) ); - - /*fprintf( stderr,"%6d%6d\t%g %g j:%g %g p_boc:%g %g\n", - i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 ); - fprintf( stderr,"\tf:%g %g %g, exp:%g %g %g %g\n", - f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/ - - /* Now come the derivates */ - /* Bond Order pages 5-7, derivative of f1 */ - temp = f2 + f3; - u1_ij = val_i + temp; - u1_ji = val_j + temp; - Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji )); - Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + - ( u1_ji - f3 ) / SQR( u1_ji )); - - //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + - // Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j ); - Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - - ((val_i+f2) / SQR(u1_ij)) * - ( -p_boc1 * exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) ) + - -p_boc1 * exp_p1i / u1_ji - - ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i + - exp_p2i / ( exp_p2i + exp_p2j ) )); - - Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + - Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); - //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji ); - } - else { - /* No overcoordination correction! */ - f1 = 1.0; - Cf1_ij = Cf1_ji = 0.0; - } - - if( twbp->v13cor >= 0.001 ) { - /* Correction for 1-3 bond orders */ - exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5); - exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - - Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5); - - f4 = 1. / (1. + exp_f4); - f5 = 1. / (1. 
+ exp_f5); - f4f5 = f4 * f5; - - /* Bond Order pages 8-9, derivative of f4 and f5 */ - /*temp = twbp->p_boc5 - - twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO ); - u_ij = temp + twbp->p_boc3 * Deltap_boc_i; - u_ji = temp + twbp->p_boc3 * Deltap_boc_j; - Cf45_ij = Cf45( u_ij, u_ji ) / f4f5; - Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/ - Cf45_ij = -f4 * exp_f4; - Cf45_ji = -f5 * exp_f5; - } - else { - f4 = f5 = f4f5 = 1.0; - Cf45_ij = Cf45_ji = 0.0; - } - - /* Bond Order page 10, derivative of total bond order */ - A0_ij = f1 * f4f5; - A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * - (Cf45_ij + Cf45_ji); - A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij; - A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji; - A3_ij = A2_ij + Cf1_ij / f1; - A3_ji = A2_ji + Cf1_ji / f1; - - /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f -A2_ji: %f, A3_ij: %f, A3_ji: %f\n", -bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/ - - /* find corrected bond order values and their deriv coefs */ - bo_ij->BO = bo_ij->BO * A0_ij; - bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; - bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; - bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - - bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij; - bo_ij->C2dbo = bo_ij->BO * A2_ij; - bo_ij->C3dbo = bo_ij->BO * A2_ji; - - bo_ij->C1dbopi = f1*f1*f4*f5; - bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij; - bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij; - bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji; - - bo_ij->C1dbopi2 = f1*f1*f4*f5; - bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij; - bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij; - -#ifdef TEST_FORCES - /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", - i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/ - - /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", - //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n", - workspace->orig_id[i], workspace->orig_id[j] - A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji - bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s, - bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, - bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi, - bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2 - ); */ - - Calculate_dBO( i, pj, workspace, lists, &top_dbo ); -#endif - } - - /* neglect bonds that are < 1e-10 */ - if( bo_ij->BO < 1e-10 ) - bo_ij->BO = 0.0; - if( bo_ij->BO_s < 1e-10 ) - bo_ij->BO_s = 0.0; - if( bo_ij->BO_pi < 1e-10 ) - bo_ij->BO_pi = 0.0; - if( bo_ij->BO_pi2 < 1e-10 ) - bo_ij->BO_pi2 = 0.0; - - workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO - - - /* fprintf( stderr, "%d %d\t%g %g %g %g\n -Cdbo:\t%g %g %g\n -Cdbopi:\t%g %g %g %g\n -Cdbopi2:%g %g %g %g\n\n", -i+1, j+1, bonds->select.bond_list[ pj ].d, -bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, -bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, -bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi, -bo_ij->C1dbopi2, bo_ij->C2dbopi2, -bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */ - - /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n", - i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */ - -#ifdef TEST_FORCES - Set_End_Index( pj, top_dbo, &dBOs ); - //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta ); -#endif - } - /* - else { - // We only need to update bond orders from bo_ji - // everything else is set in uncorrected_bo calculations - sym_index = bonds.select.bond_list[pj].sym_index; - bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; 
- bo_ij->BO_pi2 = bo_ji->BO_pi2; - - workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO - -#ifdef TEST_FORCES - //Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta ); -#endif -} - */ -} - -#ifdef TEST_FORCES -// fprintf( stderr, "dDelta computations\nj:" ); -Set_Start_Index( i, top_dDelta, &dDeltas ); -ptop_dDelta = &( dDeltas.select.dDelta_list[top_dDelta] ); - -for( pj = start_i; pj < end_i; ++pj ) { - j = bonds.select.bond_list[pj].nbr; - // fprintf( stderr, "%d ", j ); - - if( !rvec_isZero( workspace.dDelta[j] ) ) { - ptop_dDelta->wrt = j; - rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] ); - rvec_MakeZero( workspace.dDelta[j] ); - ++top_dDelta, ++ptop_dDelta; - } - - start_j = Start_Index(j, &bonds); - end_j = End_Index(j, &bonds); - for( pk = start_j; pk < end_j; ++pk ) { - k = bonds.select.bond_list[pk].nbr; - if( !rvec_isZero( workspace.dDelta[k] ) ) { - ptop_dDelta->wrt = k; - rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] ); - rvec_MakeZero( workspace.dDelta[k] ); - ++top_dDelta, ++ptop_dDelta; - } - } -} - -Set_End_Index( i, top_dDelta, &dDeltas ); - -/*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj ) - fprintf( stdout, "dDel: %d %d [%g %g %g]\n", - i+1, dDeltas->select.dDelta_list[pj].wrt+1, - dDeltas->select.dDelta_list[pj].dVal[0], - dDeltas->select.dDelta_list[pj].dVal[1], - dDeltas->select.dDelta_list[pj].dVal[2] );*/ -#endif -//} - -/*fprintf(stderr,"\tCalculated actual bond orders ...\n" ); - fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", - "atom", "Delta", "Delta_e", "Delta_boc", "nlp", - "Delta_lp", "Clp", "dDelta_lp" );*/ - -/* - p_lp1 = g_params.l[15]; - -//get the kernel ID for the following computation -j = i; - -// Calculate some helper variables that are used at many places -// throughout force calculations -//for( j = 0; j < system->N; ++j ) { -type_j = atoms[j].type; -sbp_j = &(sbp[ type_j ]); - -workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency; -workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e; -workspace.Delta_boc[j] = workspace.total_bond_order[j] - -sbp_j->valency_boc; - -workspace.vlpex[j] = workspace.Delta_e[j] - -2.0 * (int)(workspace.Delta_e[j]/2.0); -explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j])); -workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0); -workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j]; -workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]); -// Adri uses different dDelta_lp values than the ones in notes... // -workspace.dDelta_lp[j] = workspace.Clp[j]; -//workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * -//((fabs(workspace->Delta_e[j]/2.0 - -// (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 
1 : 0 ); - -if( sbp_j->mass > 21.0 ) { -workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); -workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; -workspace.dDelta_lp_temp[j] = 0.; -} -else { -workspace.nlp_temp[j] = workspace.nlp[j]; -workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; -workspace.dDelta_lp_temp[j] = workspace.Clp[j]; -} - -//fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", -//j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], -//workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, -//workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); -//} - */ - -//Print_Bonds( system, bonds, "sbonds.out" ); - -#if defined(DEBUG) -//fprintf( stderr, "Number of bonds: %d\n", num_bonds ); -//Print_Bond_Orders( system, control, data, workspace, lists, out_control ); -#endif -} - -GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage workspace, list bonds, int N ) -{ - int i, j, pj; - int start_i, end_i; - int sym_index; - - bond_order_data *bo_ij, *bo_ji; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - start_i = Start_Index(i, &bonds); - end_i = End_Index(i, &bonds); - - for( pj = start_i; pj < end_i; ++pj ) { - - j = bonds.select.bond_list[pj].nbr; - bo_ij = &( bonds.select.bond_list[pj].bo_data ); - - if( i >= j ) { - // We only need to update bond orders from bo_ji - // everything else is set in uncorrected_bo calculations - sym_index = bonds.select.bond_list[pj].sym_index; - bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); - bo_ij->BO = bo_ji->BO; - bo_ij->BO_s = bo_ji->BO_s; - bo_ij->BO_pi = bo_ji->BO_pi; - bo_ij->BO_pi2 = bo_ji->BO_pi2; - - workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO - } - } -} - -GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, - static_storage workspace, int N ) -{ - int j, type_j; - real explp1; - real p_lp1; - single_body_parameters *sbp_i, *sbp_j; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - p_lp1 = g_params.l[15]; - - /* Calculate some helper variables that are used at many places - throughout force calculations */ - //for( j = 0; j < system->N; ++j ) { - type_j = atoms[j].type; - sbp_j = &(sbp[ type_j ]); - - workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency; - workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e; - workspace.Delta_boc[j] = workspace.total_bond_order[j] - - sbp_j->valency_boc; - - workspace.vlpex[j] = workspace.Delta_e[j] - - 2.0 * (int)(workspace.Delta_e[j]/2.0); - explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j])); - workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0); - workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j]; - workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]); - /* Adri uses different dDelta_lp values than the ones in notes... */ - workspace.dDelta_lp[j] = workspace.Clp[j]; - //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * - //((fabs(workspace->Delta_e[j]/2.0 - - // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 
1 : 0 ); - - if( sbp_j->mass > 21.0 ) { - workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); - workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; - workspace.dDelta_lp_temp[j] = 0.; - } - else { - workspace.nlp_temp[j] = workspace.nlp[j]; - workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; - workspace.dDelta_lp_temp[j] = workspace.Clp[j]; - } - - //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", - //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], - //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, - //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); - //} - -} - -//Import from the forces file. - -GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, - static_storage workspace, list p_bonds, int ensemble, int N) -{ - int i, pj; - list *bonds = &p_bonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N) - { - for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj) - { - //int j = bonds->select.bond_list[pj].nbr; - if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) - Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds ); - else - Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds ); - } - } -} - -GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, - static_storage workspace, list p_bonds, int ensemble, int N) -{ - int i, pj; - list *bonds = &p_bonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N) - { - if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) - Cuda_dbond_to_Forces_postprocess (i, atoms, bonds ); - } -} diff --git a/PuReMD-GPU/src/bond_orders.h b/PuReMD-GPU/src/bond_orders.h index 19fea911f129715cec305f0a4f1f0ea433d412d4..476bacfb0704c97d4b3b44b15e76883be4cad9b9 100644 --- a/PuReMD-GPU/src/bond_orders.h +++ b/PuReMD-GPU/src/bond_orders.h @@ -21,8 +21,10 @@ #ifndef __BOND_ORDERS_H_ #define __BOND_ORDERS_H_ + #include "mytypes.h" + typedef struct { real C1dbo, C2dbo, C3dbo; @@ -31,6 +33,7 @@ typedef struct real C1dDelta, C2dDelta, C3dDelta; } dbond_coefficients; + #ifdef TEST_FORCES void Get_dBO( reax_system*, list**, int, int, real, rvec* ); void Get_dBOpinpi2( reax_system*, list**, int, int, real, real, rvec*, rvec* ); @@ -52,16 +55,4 @@ void Add_dBond_to_Forces_NPT( int, int, reax_system*, simulation_data*, void Calculate_Bond_Orders( reax_system*, control_params*, simulation_data*, static_storage*, list**, output_controls* ); -//CUDA Functions -GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *, global_parameters , single_body_parameters *, - static_storage , int , int ); -GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *, global_parameters , single_body_parameters *, - two_body_parameters *, static_storage , list , list , list , int , int ); -GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage , list , int ); -GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *, global_parameters , single_body_parameters *, - static_storage , int ); -GLOBAL void Cuda_Compute_Total_Force (reax_atom *, simulation_data *, static_storage , list , int , int ); -GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *, simulation_data *, static_storage , list , int , int ); -//HOST_DEVICE void Cuda_Add_dBond_to_Forces( int, int, reax_atom *, static_storage*, list* ); -//HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int, int, reax_atom *, simulation_data*, static_storage*, list* ); #endif diff --git 
a/PuReMD-GPU/src/box.cu b/PuReMD-GPU/src/box.c similarity index 100% rename from PuReMD-GPU/src/box.cu rename to PuReMD-GPU/src/box.c diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h index ed8cc9f4e1900a71f5a6b319f98e7d425400f928..418aa6208a81fb05ee56ff09afc6ff76751f75c9 100644 --- a/PuReMD-GPU/src/box.h +++ b/PuReMD-GPU/src/box.h @@ -21,11 +21,13 @@ #ifndef __BOX_H__ #define __BOX_H__ + #include "mytypes.h" + /* Initializes box from CRYST1 line of PDB */ void Init_Box_From_CRYST(real, real, real, real, real, real, - simulation_box*/*, int*/); + simulation_box*/*, int*/); /* Initializes box from box rtensor */ void Update_Box(rtensor, simulation_box* /*, int*/); @@ -43,13 +45,11 @@ void Transform( rvec, simulation_box*, char, rvec ); void Transform_to_UnitBox( rvec, simulation_box*, char, rvec ); void Get_NonPeriodic_Far_Neighbors( rvec, rvec, simulation_box*, - control_params*, far_neighbor_data*, int* ); + control_params*, far_neighbor_data*, int* ); void Get_Periodic_Far_Neighbors_Big_Box( rvec, rvec, simulation_box*, - control_params*, far_neighbor_data*, - int* ); + control_params*, far_neighbor_data*, int* ); void Get_Periodic_Far_Neighbors_Small_Box( rvec, rvec, simulation_box*, - control_params*, far_neighbor_data*, - int* ); + control_params*, far_neighbor_data*, int* ); void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec ); void Inc_on_T3_Gen( rvec, rvec, simulation_box* ); @@ -61,7 +61,9 @@ void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );*/ /* this function returns cartesian norm but triclinic distance vector */ real Metric_Product( rvec, rvec, simulation_box* ); -HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r) +void Print_Box_Information( simulation_box*, FILE* ); + +static inline HOST_DEVICE real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r) { real norm = 0.0; @@ -94,12 +96,8 @@ HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box } -void Print_Box_Information( simulation_box*, FILE* ); - -//CUDA Device Functions -//HOST_DEVICE inline void Inc_on_T3( rvec, rvec, simulation_box* ); -HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box ) +static inline HOST_DEVICE void Inc_on_T3( rvec x, rvec dx, simulation_box *box ) { int i; real tmp; @@ -115,4 +113,5 @@ HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box ) } } + #endif diff --git a/PuReMD-GPU/src/center_mass.h b/PuReMD-GPU/src/center_mass.h deleted file mode 100644 index 4048511e2e4b311f9de74cef94c2a661d51b4b39..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/center_mass.h +++ /dev/null @@ -1,48 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. 
- ----------------------------------------------------------------------*/ - -#ifndef __CENTER_MASS_H__ -#define __CENTER_MASS_H__ - -#include "mytypes.h" - -GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n); - -GLOBAL void center_of_mass (rvec *xcm, - rvec *vcm, - rvec *amcm, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n); - -GLOBAL void compute_center_mass (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n); - -GLOBAL void compute_center_mass (real *input, real *output, size_t n); - -#endif diff --git a/PuReMD-GPU/src/cuda_QEq.cu b/PuReMD-GPU/src/cuda_QEq.cu new file mode 100644 index 0000000000000000000000000000000000000000..033945338aa76aa4c02909f90d2ca3eb1dacad58 --- /dev/null +++ b/PuReMD-GPU/src/cuda_QEq.cu @@ -0,0 +1,724 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_QEq.h" + +#include "QEq.h" +#include "allocate.h" +#include "lin_alg.h" +#include "list.h" +#include "print_utils.h" +#include "index_utils.h" +#include "system_props.h" + +#include "cuda_copy.h" +#include "cuda_init.h" +#include "cuda_utils.h" +#include "cuda_lin_alg.h" +#include "cuda_reduction.h" + +#include "sort.h" +#include "validation.h" + + +GLOBAL void Cuda_Sort_Matrix_Rows( sparse_matrix A ) +{ + int i; + int si, ei; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= A.n ) return; + + si = A.start[i]; + ei = A.end [i]; + + quick_sort( A.entries + si, 0, ei-si-1 ); +} + + +GLOBAL void Cuda_Calculate_Droptol( sparse_matrix p_A, real *droptol, real dtol ) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int k, j, offset, x, diagonal; + real val; + sparse_matrix *A = &p_A; + + if ( i < A->n ) { + droptol [i] = 0; + + for (k = A->start[i]; k < A->end[i]; ++k ) { + val = A->entries[k].val; + droptol [i] += val*val; + } + } + + __syncthreads (); + if ( i < A->n ) { + droptol [i] = SQRT (droptol[i]) * dtol; + } + +} + + +GLOBAL void Cuda_Calculate_Droptol_js( sparse_matrix p_A, real *droptol, real dtol ) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int k, j, offset, x, diagonal; + real val; + sparse_matrix *A = &p_A; + + for (x = 0; x < A->n; x ++) + { + if (i < (A->end[i]-1 - A->start[i])) { + offset = A->start [i] + i; + j = A->entries[offset].j; + val = A->entries[offset].val; + droptol [j] += val * val; + } + __syncthreads (); + } +} + + +GLOBAL void Cuda_Calculate_Droptol_diagonal( sparse_matrix p_A, real *droptol, real dtol ) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int k, j, offset, x, diagonal; + real val; + sparse_matrix *A = &p_A; + + if ( i < A->n ) { + //diagonal element + 
diagonal = A->end[i]-1; + val = A->entries [diagonal].val; + droptol [i] += val*val; + } + + /*calculate local droptol for each row*/ + if ( i < A->n ) + droptol [i] = SQRT (droptol[i]) * dtol; +} + + +GLOBAL void Cuda_Estimate_LU_Fill( sparse_matrix p_A, real *droptol, int *fillin ) +{ + int i, j, pj; + real val; + sparse_matrix *A = &p_A; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= A->n) return; + + fillin [i] = 0; + + for (pj = A->start[i]; pj < A->end[i]-1; ++pj) + { + j = A->entries [pj].j; + val = A->entries[pj].val; + + if (fabs (val) > droptol [i]) ++fillin [i]; + } +} + + +void Cuda_ICHOLT( sparse_matrix *A, real *droptol, + sparse_matrix *L, sparse_matrix *U ) +{ + sparse_matrix_entry tmp[1000]; + int i, j, pj, k1, k2, tmptop, Ltop; + real val; + int *Utop; + + Utop = (int*) malloc((A->n+1) * sizeof(int)); + + // clear variables + Ltop = 0; + tmptop = 0; + for( i = 0; i <= A->n; ++i ) + L->start[i] = U->start[i] = 0; + + for( i = 0; i < A->n; ++i ) + Utop[i] = 0; + + //fprintf( stderr, "n: %d\n", A->n ); + for( i = 0; i < A->n; ++i ){ + L->start[i] = Ltop; + tmptop = 0; + + for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){ + j = A->entries[pj].j; + val = A->entries[pj].val; + //fprintf( stderr, "i: %d, j: %d", i, j ); + + //CHANGE ORIGINAL + if (j >= i) break; + //CHANGE ORIGINAL + + if( fabs(val) > droptol[i] ){ + k1 = 0; + k2 = L->start[j]; + while( k1 < tmptop && k2 < L->start[j+1] ){ + if( tmp[k1].j < L->entries[k2].j ) + ++k1; + else if( tmp[k1].j > L->entries[k2].j ) + ++k2; + else + val -= (tmp[k1++].val * L->entries[k2++].val); + } + + // L matrix is lower triangular, + // so right before the start of next row comes jth diagonal + val /= L->entries[L->start[j+1]-1].val; + + tmp[tmptop].j = j; + tmp[tmptop].val = val; + ++tmptop; + } + + //fprintf( stderr, " -- done\n" ); + } + + // compute the ith diagonal in L + // sanity check + if( A->entries[pj].j != i ) { + fprintf( stderr, "i=%d, badly built A matrix!\n", i ); + exit(999); + } + + val = A->entries[pj].val; + for( k1 = 0; k1 < tmptop; ++k1 ) + val -= (tmp[k1].val * tmp[k1].val); + + tmp[tmptop].j = i; + tmp[tmptop].val = SQRT(val); + + // apply the dropping rule once again + //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop ); + //for( k1 = 0; k1<= tmptop; ++k1 ) + // fprintf( stderr, "%d(%f) ", tmp[k1].j, tmp[k1].val ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] ); + for( k1 = 0; k1 < tmptop; ++k1 ) + if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){ + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + U->start[tmp[k1].j+1]++; + ++Ltop; + //fprintf( stderr, "%d(%.4f) ", tmp[k1].j+1, tmp[k1].val ); + } + // keep the diagonal in any case + L->entries[Ltop].j = tmp[k1].j; + L->entries[Ltop].val = tmp[k1].val; + ++Ltop; + //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1, tmp[k1].val ); + } + + L->start[i] = Ltop; + //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 ); + + for( i = 1; i <= U->n; ++i ) + Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; + + for( i = 0; i < L->n; ++i ) + for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ + j = L->entries[pj].j; + U->entries[Utop[j]].j = i; + U->entries[Utop[j]].val = L->entries[pj].val; + Utop[j]++; + } + + //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 ); +} + + +/* +//Parallel for each row +//Each kernel will run for 6540 number of times. 
+GLOBAL void Cuda_ICHOLT( reax_system *system, sparse_matrix p_A, real *droptol, +sparse_matrix p_L, sparse_matrix p_U ) +{ +int start, end, count; +real tempvalue, val; +int i,pj,tmptop, offset; +int j, k1, k2; + +sparse_matrix *A, *L, *U; +sparse_matrix_entry *tmp; + +A = &p_A; +L = &p_L; +U = &p_U; + +real *null_val; +null_val = 0; + +extern __shared__ real tmp_val[]; +extern __shared__ sparse_matrix_entry sh_tmp[]; + +int kid = blockIdx.x * blockDim.x + threadIdx.x; +tmp = (sparse_matrix_entry *) (tmp_val + blockDim.x); + +offset = 0; +for( i = 0; i < 10; ++i ) +{ +//if (kid == 0) L->start[i] = i * system->max_sparse_matrix_entries; +if (kid == 0) L->start[i] = offset; +tmptop = 0; + +start = A->start[i]; +end = A->end[i]-1; //inclusive +count = end - start; //inclusive +tmp_val [kid] = 0; + +if (kid < count) //diagonal not included +{ +pj = start + kid; + +j = A->entries[pj].j; +val = A->entries[pj].val; + +if( fabs(val) > droptol[i] ) +{ +k1 = 0; +k2 = L->start[j]; +while( k1 < tmptop && k2 < L->end[j] ){ +if( tmp[k1].j < L->entries[k2].j ) +++k1; +else if( tmp[k1].j > L->entries[k2].j ) +++k2; +else +tmp_val[kid] = (tmp[k1++].val * L->entries[k2++].val); +} + +//here read the shared memory of all the kernels +if (kid == 0) +{ +for (i = 0; i < count; i++) +tempvalue += tmp_val [i]; + +val -= tempvalue; + +// L matrix is lower triangular, +// so right before the start of next row comes jth diagonal +val /= L->entries[L->end[j]-1].val; + +tmp[tmptop].j = j; +tmp[tmptop].val = val; +++tmptop; +} +} +} +__syncthreads (); + + +// compute the ith diagonal in L +// sanity check +if (kid == 0) +{ + if( A->entries[end].j != i ) { + //intentional core dump here for sanity sake + *null_val = 1; + } +} + +//diagonal element +//val = A->entries[pj].val; +//for( k1 = 0; k1 < tmptop; ++k1 ) +if (kid < count) + tmp_val[kid] = (tmp[kid].val * tmp[kid].val); + + __syncthreads (); + +if (kid == 0) +{ + val = A->entries [end].val; + for (i = 0; i < count; i++) + tempvalue += tmp_val [i]; + + val -= tempvalue; + tmp[tmptop].j = i; + tmp[tmptop].val = SQRT(val); +} +__syncthreads (); + +//Fill in the LU entries +//for( k1 = 0; k1 < count; ++k1 ) +if (kid < count ) +{ + if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){ + L->entries[offset + kid].j = tmp[kid].j; + L->entries[offset + kid].val = tmp[kid].val; + U->start[tmp[kid].j+1]++; + } +} +__syncthreads (); + +if (kid == 0) { + // keep the diagonal in any case + offset += count; + L->entries[offset].j = tmp[count].j; + L->entries[offset].val = tmp[count].val; + ++offset; + L->end [i] = offset; +} +__syncthreads (); +} // end of main for loop +} + +void Cuda_Fill_U ( sparse_matrix *A, real *droptol, + sparse_matrix *L, sparse_matrix *U ) +{ + int i, pj, j; + + for( i = 1; i <= U->n; ++i ) + Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1; + + for( i = 0; i < L->n; ++i ) + for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){ + j = L->entries[pj].j; + U->entries[Utop[j]].j = i; + U->entries[Utop[j]].val = L->entries[pj].val; + Utop[j]++; + } +} +*/ + + +void Cuda_Init_MatVec( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, list *far_nbrs ) +{ + int i, fillin; + real s_tmp, t_tmp; + int *spad = (int *)scratch; + real start = 0, end = 0; + + if( control->refactor > 0 && + ((data->step-data->prev_steps)%control->refactor==0 || + dev_workspace->L.entries==NULL) ) + { + Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H ); + cudaThreadSynchronize( ); + cudaCheckError( ); + +#ifdef 
__DEBUG_CUDA__ + fprintf (stderr, "Sorting done... \n"); +#endif + + Cuda_Calculate_Droptol<<<BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H, dev_workspace->droptol, control->droptol ); + cudaThreadSynchronize( ); + cudaCheckError( ); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Droptol done... \n"); +#endif + + if( dev_workspace->L.entries == NULL ) + { + cuda_memset( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH ); + Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H, dev_workspace->droptol, spad ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + //Reduction for fill in + Cuda_reduction_int<<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> + (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( &fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); + fillin += dev_workspace->H.n; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin ); +#endif + + dev_workspace->L.n = far_nbrs->n; + dev_workspace->L.m = fillin; + Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n ); + + dev_workspace->U.n = far_nbrs->n; + dev_workspace->U.m = fillin; + Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n ); + } + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "LU matrix done...\n"); +#endif + + //TODO -- This is the ILU Factorization of the H Matrix. + //This is present in the CUDA 5.0 compilation which is not working currently. + //Fix this when CUDA 5.0 is correctly setup. + //TODO + //shared memory is per block + // here we have only one block - + /* + fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries ); + Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, + system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE) >>> + ( system, dev_workspace->H, + dev_workspace->droptol, + dev_workspace->L, + dev_workspace->U ); + cudaThreadSynchronize (); + fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ()); + */ + + //1. copy the H matrix from device to host + //2. Allocate the L/U matrices on the host and device. + //3. Compute the L/U on the host + //4. copy the results to the device + //5. Continue the computation. 
+ sparse_matrix t_H, t_L, t_U; + real *t_droptol; + + t_droptol = (real *) malloc( REAL_SIZE * system->N ); + +#ifdef __DEBUG_CUDA__ + fprintf( stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m ); +#endif + + start = Get_Time( ); + if( !Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m) ) + { + fprintf(stderr, "No space for H matrix \n"); + exit( 0 ); + } + if( !Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m) ) + { + fprintf( stderr, "No space for L matrix \n" ); + exit( 0 ); + } + if( !Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m) ) + { + fprintf( stderr, "No space for U matrix \n" ); + exit( 0 ); + } + + copy_host_device( t_H.start, dev_workspace->H.start, INT_SIZE * + (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, + RES_SPARSE_MATRIX_INDEX ); + copy_host_device( t_H.end, dev_workspace->H.end, INT_SIZE * + (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, + RES_SPARSE_MATRIX_INDEX ); + copy_host_device( t_H.entries, dev_workspace->H.entries, + SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, + cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY ); + + copy_host_device( t_droptol, dev_workspace->droptol, REAL_SIZE * + system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL ); + + //fprintf (stderr, " Done copying LUH .. \n"); + Cuda_ICHOLT( &t_H, t_droptol, &t_L, &t_U ); + + Sync_Host_Device_Mat( &t_L, &t_U, cudaMemcpyHostToDevice ); + end += Get_Timing_Info( start ); + + /* + fprintf (stderr, "Done syncing .... \n"); + free (t_droptol); + fprintf (stderr, "Freed droptol ... \n"); + Deallocate_Matrix (&t_H); + fprintf (stderr, "Freed H ... \n"); + Deallocate_Matrix (&t_L); + fprintf (stderr, "Freed l ... \n"); + Deallocate_Matrix (&t_U); + fprintf (stderr, "Freed u ... \n"); + */ + + //#ifdef __DEBUG_CUDA__ + fprintf( stderr, "Done copying the L/U matrices to the device ---> %f \n", end ); + //#endif + + //#ifdef __BUILD_DEBUG__ + // validate_lu (workspace); + //#endif + } +} + + +GLOBAL void Init_MatVec_Postprocess( static_storage p_workspace, int N ) +{ + + static_storage *workspace = &p_workspace; + real s_tmp, t_tmp; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= N) return; + // no extrapolation + //s_tmp = workspace->s[0][i]; + //t_tmp = workspace->t[0][i]; + + // linear + //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i]; + //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i]; + + // quadratic + //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]); + t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]); + + // cubic + s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - + (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] ); + //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - + // (6 * workspace->t[1][i] + workspace->t[3][i] ); + + // 4th order + //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + + // 10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i]; + //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + + // 10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i]; + + workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)]; + workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; + workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)]; + workspace->s[index_wkspace_sys(1,i,N)] = 
workspace->s[index_wkspace_sys(0,i,N)]; + workspace->s[index_wkspace_sys(0,i,N)] = s_tmp; + + workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)]; + workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; + workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)]; + workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)]; + workspace->t[index_wkspace_sys(0,i,N)] = t_tmp; +} + + +GLOBAL void Cuda_Update_Atoms_q( reax_atom *atoms, real *s, real u, real *t, int N ) +{ + int i = blockIdx.x*blockDim.x + threadIdx.x; + + if (i >= N) + { + return; + } + + atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)]; +} + + +void Cuda_Calculate_Charges( reax_system *system, static_storage *workspace ) +{ + real *spad = (real *) scratch; + real u, s_sum, t_sum; + + cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); + + //s_sum + Cuda_reduction<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( &s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + //t_sum + cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH ); + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + copy_host_device( &t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + //fraction here + u = s_sum / t_sum; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); +#endif + + Cuda_Update_Atoms_q<<< BLOCKS, BLOCK_SIZE >>> + ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + +void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, + static_storage *workspace, list *far_nbrs, + output_controls *out_control ) +{ + int matvecs = 0; + real t_start, t_elapsed; + +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + + /* + //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs ); + + Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>> + ( dev_workspace->H ); + cudaThreadSynchronize(); + cudaCheckError(); + + t_elapsed = Get_Timing_Info (t_start); + fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed); + */ + + Init_MatVec_Postprocess<<< BLOCKS, BLOCK_SIZE >>> + (*dev_workspace, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf( stderr, "Done with post processing of init_matvec --> %d with time ---> %f \n", cudaGetLastError (), t_elapsed ); +#endif + + //Here goes the GMRES part of the program () + //#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); + //#endif + + //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); + //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); + + matvecs = Cublas_GMRES( 
system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s ); + matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t ); + + d_timing.matvecs += matvecs; + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf( stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed ); +#endif + + Cuda_Calculate_Charges( system, workspace ); +} diff --git a/PuReMD-GPU/src/cuda_QEq.h b/PuReMD-GPU/src/cuda_QEq.h new file mode 100644 index 0000000000000000000000000000000000000000..f62ab1157e3812837cd3a8a65e07856a78b22bf2 --- /dev/null +++ b/PuReMD-GPU/src/cuda_QEq.h @@ -0,0 +1,39 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_QEq_H_ +#define __CUDA_QEq_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_QEq( reax_system*, control_params*, simulation_data*, static_storage*, + list*, output_controls* ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/cuda_allocate.cu similarity index 64% rename from PuReMD-GPU/src/allocate.cu rename to PuReMD-GPU/src/cuda_allocate.cu index 37b80693217700dc20bbffdb515cfe5e1f6eb986..d0e1f22b6f970038820f82c504dc3472d50d1d7a 100644 --- a/PuReMD-GPU/src/allocate.cu +++ b/PuReMD-GPU/src/cuda_allocate.cu @@ -18,32 +18,24 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "allocate.h" -#include "list.h" +#include "cuda_allocate.h" #include "cuda_utils.h" -#include "reduction.h" +#include "cuda_list.h" +#include "cuda_reduction.h" -void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) -{ - Delete_List( far_nbrs ); - if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){ - fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); - exit( INIT_ERR ); - } +#include "list.h" + + +GLOBAL void Init_HBond_Indexes ( int *, int *, list , int ); +GLOBAL void Init_Bond_Indexes ( int *, list , int ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n", - num_intrs, far_nbrs->num_intrs ); - fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", - num_intrs * sizeof(far_neighbor_data) / (1024*1024) ); -#endif -} void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) { - Delete_List( far_nbrs, TYP_DEVICE ); - if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE )){ + Cuda_Delete_List( far_nbrs ); + if(!Cuda_Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )) + { fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); exit( INIT_ERR ); } @@ -57,23 +49,6 @@ void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs ) } -int Allocate_Matrix( sparse_matrix *H, int n, int m ) -{ - H->n = n; - H->m = m; - if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL ) - return 0; - - if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL ) - return 0; - - if( (H->entries = - (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL ) - return 0; - - return 1; -} - int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m ) { H->n = n; @@ -87,13 +62,6 @@ int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m ) } -void Deallocate_Matrix( sparse_matrix *H ) -{ - free(H->start); - free(H->entries); - free(H->end); -} - void Cuda_Deallocate_Matrix( sparse_matrix *H ) { cuda_free(H->start, RES_SPARSE_MATRIX_INDEX); @@ -106,23 +74,6 @@ void Cuda_Deallocate_Matrix( sparse_matrix *H ) } -int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) -{ - Deallocate_Matrix( H ); - if( !Allocate_Matrix( H, n, m ) ) { - fprintf(stderr, "not enough space for %s matrix. terminating!\n", name); - exit( 1 ); - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n", - name, n, m ); - fprintf( stderr, "memory allocated: %s = %ldMB\n", - name, m * sizeof(sparse_matrix_entry) / (1024*1024) ); -#endif - return 1; -} - int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) { Cuda_Deallocate_Matrix( H ); @@ -142,56 +93,6 @@ int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name ) } -int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, - list *hbonds ) -{ - int i, num_hbonds; - - num_hbonds = 0; - /* find starting indexes for each H and the total number of hbonds */ - for( i = 1; i < n; ++i ) - hb_top[i] += hb_top[i-1]; - num_hbonds = hb_top[n-1]; - - if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) { - fprintf( stderr, "not enough space for hbonds list. terminating!\n" ); - exit( INIT_ERR ); - } - - for( i = 0; i < n; ++i ) - if( h_index[i] == 0 ){ - Set_Start_Index( 0, 0, hbonds ); - Set_End_Index( 0, 0, hbonds ); - } - else if( h_index[i] > 0 ){ - Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); - Set_End_Index( h_index[i], hb_top[i-1], hbonds ); - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds ); - fprintf( stderr, "memory allocated: hbonds = %ldMB\n", - num_hbonds * sizeof(hbond_data) / (1024*1024) ); -#endif - return 1; -} - -GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N ) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= N) return; - - if( h_index[index] == 0 ){ - Set_Start_Index( 0, 0, &hbonds ); - Set_End_Index( 0, 0, &hbonds ); - } - else if( h_index[index] > 0 ){ - Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); - Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); - } -} - int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list *hbonds ) { int i, num_hbonds; @@ -204,7 +105,7 @@ int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list hb_top[i] += hb_top[i-1]; num_hbonds = hb_top[n-1]; - if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds , TYP_DEVICE) ) { + if( !Cuda_Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) { fprintf( stderr, "not enough space for hbonds list. 
terminating!\n" ); exit( INIT_ERR ); } @@ -225,27 +126,6 @@ int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list return 1; } -int Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) -{ - int i; - int *hb_top; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating hbonds\n" ); -#endif - hb_top = (int *)calloc( n, sizeof(int) ); - for( i = 0; i < n; ++i ) - if( h_index[i] >= 0 ) - hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS); - - Delete_List( hbonds ); - - Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); - - free( hb_top ); - - return 1; -} int Cuda_Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) { @@ -270,7 +150,7 @@ int Cuda_Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) //if( h_index[i] >= 0 ) hb_top[i] = MAX((hb_end [i] - hb_start[i])*SAFE_HBONDS, MIN_HBONDS); - Delete_List( hbonds, TYP_DEVICE ); + Cuda_Delete_List( hbonds ); Cuda_Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds ); @@ -281,21 +161,6 @@ int Cuda_Reallocate_HBonds_List( int n, int num_h, int *h_index, list *hbonds ) return 1; } -GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N ) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - - if (index >= N) return; - - if( index == 0 ){ - Set_Start_Index( 0, 0, &bonds ); - Set_End_Index( 0, 0, &bonds ); - } - else if( index > 0 ){ - Set_Start_Index( index, b_top[index-1], &bonds ); - Set_End_Index( index, b_top[index-1], &bonds ); - } -} int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds ) { @@ -308,7 +173,7 @@ int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds ) b_top[i] += b_top[i-1]; num_bonds = b_top[num_b-1]; - if( !Make_List(num_b, num_bonds, TYP_BOND, bonds, TYP_DEVICE) ) { + if( !Cuda_Make_List(num_b, num_bonds, TYP_BOND, bonds ) ) { fprintf( stderr, "not enough space for bonds list. terminating!\n" ); exit( INIT_ERR ); } @@ -326,93 +191,6 @@ int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds ) } -int Allocate_Bond_List( int n, int *bond_top, list *bonds ) -{ - int i, num_bonds; - - num_bonds = 0; - /* find starting indexes for each atom and the total number of bonds */ - for( i = 1; i < n; ++i ) - bond_top[i] += bond_top[i-1]; - num_bonds = bond_top[n-1]; - - if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) { - fprintf( stderr, "not enough space for bonds list. 
terminating!\n" ); - exit( INIT_ERR ); - } - - Set_Start_Index( 0, 0, bonds ); - Set_End_Index( 0, 0, bonds ); - for( i = 1; i < n; ++i ) { - Set_Start_Index( i, bond_top[i-1], bonds ); - Set_End_Index( i, bond_top[i-1], bonds ); - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "memory allocated: bonds = %ldMB\n", - num_bonds * sizeof(bond_data) / (1024*1024) ); -#endif - return 1; -} - - -int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body ) -{ - int i; - int *bond_top; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating bonds\n" ); -#endif - bond_top = (int *)calloc( n, sizeof(int) ); - *est_3body = 0; - for( i = 0; i < n; ++i ){ - *est_3body += SQR( Num_Entries( i, bonds ) ); - bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS ); - } - - Delete_List( bonds ); - - Allocate_Bond_List( n, bond_top, bonds ); - *num_bonds = bond_top[n-1]; - - free( bond_top ); - - return 1; -} - -void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n) -{ - extern __shared__ int sh_input[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - { - x = SQR (Num_Entries( i, &bonds ) ); - bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS ); - } - sh_input[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sh_input[0]; - } -} - - int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body ) { int i; @@ -437,7 +215,7 @@ int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body ) b_top[i] = MAX((b_end [i] - b_start[i])*2, MIN_BONDS); } - Delete_List( bonds, TYP_DEVICE ); + Cuda_Delete_List( bonds ); Cuda_Allocate_Bond_List(n, b_top, bonds ); @@ -450,6 +228,7 @@ int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body ) return i; } + int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count ) { int i; @@ -479,10 +258,10 @@ int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count ) new_total = thb_total; new_count = count; - Delete_List( thblist, TYP_DEVICE ); + Cuda_Delete_List( thblist ); /*Allocate the list */ - if(!Make_List( new_count, new_total, TYP_THREE_BODY, thblist, TYP_DEVICE )){ + if(!Cuda_Make_List( new_count, new_total, TYP_THREE_BODY, thblist )){ fprintf(stderr, "Problem in reallocating three-body list. 
Terminating!\n"); exit( INIT_ERR ); } @@ -523,14 +302,14 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH ); cudaThreadSynchronize (); cudaCheckError (); - Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> + Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> (d_bond_top + n, d_bond_top + n + BLOCKS_POW_2, BLOCKS_POW_2); cudaThreadSynchronize (); copy_host_device (bond_top, d_bond_top, n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); copy_host_device (est_3body, d_bond_top + n + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, __LINE__); - Delete_List( bonds, TYP_DEVICE ); + Cuda_Delete_List( bonds ); Cuda_Allocate_Bond_List( n, bond_top, bonds ); *num_bonds = bond_top[n-1]; @@ -542,83 +321,6 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH ); */ -void Reallocate( reax_system *system, static_storage *workspace, list **lists, - int nbr_flag ) -{ - int num_bonds, est_3body; - reallocate_data *realloc; - grid *g; - - realloc = &(workspace->realloc); - g = &(system->g); - - if( realloc->num_far > 0 && nbr_flag ) { - fprintf (stderr, " Reallocating neighbors \n"); - Reallocate_Neighbor_List( (*lists)+FAR_NBRS, - system->N, realloc->num_far * SAFE_ZONE ); - realloc->num_far = -1; - } - - if( realloc->Htop > 0 ){ - fprintf (stderr, " Reallocating Matrix \n"); - Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H"); - realloc->Htop = -1; - - Deallocate_Matrix( &workspace->L ); - Deallocate_Matrix( &workspace->U ); - } - - if( realloc->hbonds > 0 ){ - fprintf (stderr, " Reallocating hbonds \n"); - Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index, - (*lists)+HBONDS ); - realloc->hbonds = -1; - } - - num_bonds = est_3body = -1; - if( realloc->bonds > 0 ){ - fprintf (stderr, " Reallocating bonds \n"); - Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body ); - realloc->bonds = -1; - realloc->num_3body = MAX( realloc->num_3body, est_3body ); - } - - if( realloc->num_3body > 0 ) { - fprintf (stderr, " Reallocating 3Body \n"); - Delete_List( (*lists)+THREE_BODIES ); - - if( num_bonds == -1 ) - num_bonds = ((*lists)+BONDS)->num_intrs; - realloc->num_3body *= SAFE_ZONE; - - if( !Make_List( num_bonds, realloc->num_3body, - TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) { - fprintf( stderr, "Problem in initializing angles list. 
Terminating!\n" ); - exit( INIT_ERR ); - } - realloc->num_3body = -1; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reallocating 3 bodies\n" ); - fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body ); - fprintf( stderr, "reallocated 3body memory: %ldMB\n", - realloc->num_3body*sizeof(three_body_interaction_data)/ - (1024*1024) ); -#endif - } - - if( realloc->gcell_atoms > -1 ){ -#if defined(DEBUG_FOCUS) - fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms); -#endif - - free (g->atoms); - g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2], - sizeof (int) * workspace->realloc.gcell_atoms); - realloc->gcell_atoms = -1; - } -} - void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *lists, int nbr_flag, int step ) { @@ -724,3 +426,68 @@ void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *list realloc->gcell_atoms = -1; } } + + +GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N ) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= N) return; + + if( h_index[index] == 0 ){ + Set_Start_Index( 0, 0, &hbonds ); + Set_End_Index( 0, 0, &hbonds ); + } + else if( h_index[index] > 0 ){ + Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); + Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); + } +} + + +GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N ) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index >= N) return; + + if( index == 0 ){ + Set_Start_Index( 0, 0, &bonds ); + Set_End_Index( 0, 0, &bonds ); + } + else if( index > 0 ){ + Set_Start_Index( index, b_top[index-1], &bonds ); + Set_End_Index( index, b_top[index-1], &bonds ); + } +} + + +void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n) +{ + extern __shared__ int sh_input[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + + if(i < n) + { + x = SQR (Num_Entries( i, &bonds ) ); + bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS ); + } + sh_input[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sh_input[threadIdx.x] += sh_input[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sh_input[0]; + } +} diff --git a/PuReMD-GPU/src/cuda_allocate.h b/PuReMD-GPU/src/cuda_allocate.h new file mode 100644 index 0000000000000000000000000000000000000000..dc672d3bd76c1afa7a468aa2fdda3dd0ca3d3ec9 --- /dev/null +++ b/PuReMD-GPU/src/cuda_allocate.h @@ -0,0 +1,41 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_ALLOCATE_H_ +#define __CUDA_ALLOCATE_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +int Cuda_Allocate_Matrix( sparse_matrix*, int, int ); +int Cuda_Allocate_HBond_List( int, int, int*, int*, list* ); +int Cuda_Allocate_Bond_List( int, int*, list* ); +void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_bond_orders.cu b/PuReMD-GPU/src/cuda_bond_orders.cu new file mode 100644 index 0000000000000000000000000000000000000000..81a7462033a3da94d8ae601288efb83a93e387f2 --- /dev/null +++ b/PuReMD-GPU/src/cuda_bond_orders.cu @@ -0,0 +1,857 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_bond_orders.h" + +#include "bond_orders.h" +#include "list.h" +#include "lookup.h" +#include "print_utils.h" +#include "vector.h" +#include "index_utils.h" + +#include "cuda_utils.h" +#include "cuda_helpers.h" + + +HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, + simulation_data *data, static_storage *workspace, + list *bonds ) +{ + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + rvec temp, ext_press; + ivec rel_box; + int pk, k, j; + + /* Initializations */ + nbr_j = &(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + + /************************************ + * forces related to atom i * + * first neighbors of atom i * + ************************************/ + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = 
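+    /* pj indexes the (i,j) half-bond in atom i's bond list; sym_index
+       locates the mirrored (j,i) entry, so the coefficients computed below
+       fold in the Cdbo/Cdbopi/Cdbopi2 contributions of both half-bonds */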
&(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp ); /*2nd,dBO*/ + rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( atoms[k].f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_k->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + } + + /* then atom i itself */ + rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] ); /*2nd, dBO*/ + + rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/ + + rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/ + + rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ; /*1st,dBO_pi2*/ + rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBO_pi2*/ + rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/ + + /* force */ + rvec_Add( atoms[i].f, temp ); + /* ext pressure due to i dropped, counting force on j only will be enough */ + + + /**************************************************************************** + * forces and pressure related to atom j * + * first neighbors of atom j * + ***************************************************************************/ + for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + + rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp ); /*3rd,dBO*/ + rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ + rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/ + + /* force */ + rvec_Add( atoms[k].f, temp ); + /* pressure */ + if( k != i ) { + ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box wrt i + rvec_iMultiply( ext_press, rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + } + } + + /* then atom j itself */ + rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] ); /*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp ); /*1st, dBO*/ + rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/ + + rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi ); /*1st,dBOpi*/ + rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp ); /*2nd,dBOpi*/ + rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/ + + rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2); /*1st,dBOpi2*/ + rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp); /*2nd,dBOpi2*/ + rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/ + + /* force */ + rvec_Add( atoms[j].f, temp ); + /* pressure */ + rvec_iMultiply( ext_press, nbr_j->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); +} + + +HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, + static_storage *workspace, list *bonds ) +{ + bond_data *nbr_j, *nbr_k; + bond_order_data *bo_ij, *bo_ji; + dbond_coefficients coef; + int pk, k, j; + rvec t_f; + + /* Initializations */ + nbr_j = 
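+    /* unlike the NPT variant above, neighbor contributions are not written
+       to atoms[k].f here; they are staged in nbr_k->t_f and gathered into
+       the force array afterwards by Cuda_dbond_to_Forces_postprocess, so no
+       two threads update the same atom's force concurrently */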
&(bonds->select.bond_list[pj]); + j = nbr_j->nbr; + + if (i < j) + { + bo_ij = &(nbr_j->bo_data); + bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } else { + bo_ji = &(nbr_j->bo_data); + bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data); + } + + coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo); + + coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi); + + coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2); + + coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); + + if ( i < j) { + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + rvec_MakeZero (t_f); + + rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); + /*2nd, dBO*/ + rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi*/ + rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp ); + /*3rd, dBOpi2*/ + + //Store in the temp place + rvec_Add (nbr_k->t_f, t_f); + } + + rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] ); + /*2nd, dBO*/ + + rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp); + /*1st, dBO*/ + rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]); + /*2nd, dBO*/ + + rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi ); + /*1st, dBOpi*/ + rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp ); + /*2nd, dBOpi*/ + rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]); + /*3rd, dBOpi*/ + + rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*1st, dBO_pi2*/ + rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp ); + /*2nd, dBO_pi2*/ + rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]); + /*3rd, dBO_pi2*/ + } + else + { + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + k = nbr_k->nbr; + rvec_MakeZero (t_f); + + rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp ); + /*3rd, dBO*/ + rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp ); + /*dDelta*/ + rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp ); + /*4th, dBOpi*/ + rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp ); + /*4th, dBOpi2*/ + + //Store in the temp place + rvec_Add (nbr_k->t_f, t_f); + } + + rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] ); + /*2nd, dBO*/ + + rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp ); + /*1st, dBO*/ + rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]); + /*2nd, dBO*/ + + rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, 
bo_ij->dln_BOp_pi ); + /*1st, dBOpi*/ + rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp ); + /*2nd, dBOpi*/ + rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]); + /*3rd, dBOpi*/ + + rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 ); + /*1st, dBOpi2*/ + rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp ); + /*2nd, dBOpi2*/ + rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]); + /*3rd, dBOpi2*/ + } +} + + +HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds) +{ + int pk; + bond_data *nbr_k, *nbr_k_sym; + + /* + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + rvec_Add (atoms[i].f, nbr_k->t_f); + } + */ + + for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { + nbr_k = &(bonds->select.bond_list[pk]); + nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] ); + + rvec_Add (atoms[i].f, nbr_k_sym->t_f); + } +} + + +GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, + static_storage workspace, int num_atom_types, int N ) +{ + int i, type_i; + real p_boc1, p_boc2; + single_body_parameters *sbp_i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + /* Calculate Deltaprime, Deltaprime_boc values */ + type_i = atoms[i].type; + sbp_i = &(sbp[type_i]); + workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; + workspace.Deltap_boc[i] = + workspace.total_bond_order[i] - sbp_i->valency_val; + workspace.total_bond_order[i] = 0; +} + + +/* A very important and crucial assumption here is that each segment + belonging to a different atom in nbrhoods->nbr_list is sorted in its own. 
+ This can either be done in the general coordinator function or here */ +GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, + two_body_parameters *tbp, static_storage workspace, list bonds, + list dDeltas, list dBOs, int num_atom_types, int N ) +{ + int i, j, pj, type_i, type_j; + int start_i, end_i; + int num_bonds, sym_index; + real p_boc1, p_boc2; + real val_i, Deltap_i, Deltap_boc_i; + real val_j, Deltap_j, Deltap_boc_j; + real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5; + real exp_p1i, exp_p2i, exp_p1j, exp_p2j; + real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji; + real Cf45_ij, Cf45_ji, p_lp1; + real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji; + real explp1; + two_body_parameters *twbp; + bond_order_data *bo_ij, *bo_ji; + single_body_parameters *sbp_i, *sbp_j; + +#if defined(TEST_FORCES) + int k, pk, start_j, end_j; + int top_dbo=0, top_dDelta=0; + dbond_data *pdbo; + dDelta_data *ptop_dDelta; + +#endif + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + num_bonds = 0; + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + /* Calculate Deltaprime, Deltaprime_boc values */ + //for( i = 0; i < system->N; ++i ) { + /* + if (i < N) { + type_i = atoms[i].type; + sbp_i = &(sbp[type_i]); + workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency; + workspace.Deltap_boc[i] = + workspace.total_bond_order[i] - sbp_i->valency_val; + workspace.total_bond_order[i] = 0; + + } + + __syncthreads (); + */ + + + // fprintf( stderr, "done with uncorrected bond orders\n" ); + + /* Corrected Bond Order calculations */ + //for( i = 0; i < system->N; ++i ) { + type_i = atoms[i].type; + sbp_i = &(sbp[type_i]); + val_i = sbp_i->valency; + Deltap_i = workspace.Deltap[i]; + Deltap_boc_i = workspace.Deltap_boc[i]; + start_i = Start_Index(i, &bonds); + end_i = End_Index(i, &bonds); + //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n", + // i+1, Deltap_i, Deltap_boc_i, start_i, end_i ); + + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds.select.bond_list[pj].nbr; + type_j = atoms[j].type; + bo_ij = &( bonds.select.bond_list[pj].bo_data ); + //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO ); + + if( i < j ) { + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); +#ifdef TEST_FORCES + Set_Start_Index( pj, top_dbo, &dBOs ); + /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", + workspace->reverse_map[i], workspace->reverse_map[j], + twbp->ovc, twbp->v13cor, bo_ij->BO ); */ +#endif + if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) { + /* There is no correction to bond orders nor to derivatives of + bond order prime! So we leave bond orders unchanged and + set derivative of bond order coefficients s.t. 
+                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
+                    bo_ij->C1dbo = 1.000000;
+                    bo_ij->C2dbo = 0.000000;
+                    bo_ij->C3dbo = 0.000000;
+
+                    bo_ij->C1dbopi = bo_ij->BO_pi;
+                    bo_ij->C2dbopi = 0.000000;
+                    bo_ij->C3dbopi = 0.000000;
+                    bo_ij->C4dbopi = 0.000000;
+
+                    bo_ij->C1dbopi2 = bo_ij->BO_pi2;
+                    bo_ij->C2dbopi2 = 0.000000;
+                    bo_ij->C3dbopi2 = 0.000000;
+                    bo_ij->C4dbopi2 = 0.000000;
+
+#ifdef TEST_FORCES
+                    pdbo = &(dBOs.select.dbo_list[ top_dbo ]);
+
+                    // compute dBO_ij/dr_i
+                    pdbo->wrt = i;
+                    rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                    rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                    rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
+
+                    // compute dBO_ij/dr_j
+                    pdbo++;
+                    pdbo->wrt = j;
+                    rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
+                    rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
+                    rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
+
+                    top_dbo += 2;
+#endif
+                }
+                else {
+                    val_j = sbp[type_j].valency;
+                    Deltap_j = workspace.Deltap[j];
+                    Deltap_boc_j = workspace.Deltap_boc[j];
+
+                    /* on page 1 */
+                    if( twbp->ovc >= 0.001 ) {
+                        /* Correction for overcoordination */
+                        exp_p1i = EXP( -p_boc1 * Deltap_i );
+                        exp_p2i = EXP( -p_boc2 * Deltap_i );
+                        exp_p1j = EXP( -p_boc1 * Deltap_j );
+                        exp_p2j = EXP( -p_boc2 * Deltap_j );
+
+                        f2 = exp_p1i + exp_p1j;
+                        f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i + exp_p2j ) );
+                        f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) +
+                                ( val_j + f2 )/( val_j + f2 + f3 ) );
+
+                        /*fprintf( stderr,"%6d%6d\t%g %g j:%g %g p_boc:%g %g\n",
+                          i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
+                          fprintf( stderr,"\tf:%g %g %g, exp:%g %g %g %g\n",
+                          f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
+
+                        /* Now come the derivatives */
+                        /* Bond Order pages 5-7, derivative of f1 */
+                        temp = f2 + f3;
+                        u1_ij = val_i + temp;
+                        u1_ji = val_j + temp;
+                        Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
+                        Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
+                                ( u1_ji - f3 ) / SQR( u1_ji ));
+
+                        //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i +
+                        //  Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                        Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
+                                ((val_i+f2) / SQR(u1_ij)) *
+                                ( -p_boc1 * exp_p1i +
+                                  exp_p2i / ( exp_p2i + exp_p2j ) ) +
+                                -p_boc1 * exp_p1i / u1_ji -
+                                ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +
+                                    exp_p2i / ( exp_p2i + exp_p2j ) ));
+
+                        Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
+                            Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
+                        //fprintf( stderr, "\tCf1:%g %g\n", Cf1_ij, Cf1_ji );
+                    }
+                    else {
+                        /* No overcoordination correction! */
+                        f1 = 1.0;
+                        Cf1_ij = Cf1_ji = 0.0;
+                    }
+
+                    if( twbp->v13cor >= 0.001 ) {
+                        /* Correction for 1-3 bond orders */
+                        exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                    Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                        exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                    Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+
+                        f4 = 1. / (1. + exp_f4);
+                        f5 = 1. / (1. + exp_f5);
+                        f4f5 = f4 * f5;
+
+                        /* Bond Order pages 8-9, derivative of f4 and f5 */
+                        /*temp = twbp->p_boc5 -
+                          twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
+                          u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
+                          u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
+                          Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
+                          Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
+                        Cf45_ij = -f4 * exp_f4;
+                        Cf45_ji = -f5 * exp_f5;
+                    }
+                    else {
+                        f4 = f5 = f4f5 = 1.0;
+                        Cf45_ij = Cf45_ji = 0.0;
+                    }
+
+                    /* Bond Order page 10, derivative of total bond order */
+                    A0_ij = f1 * f4f5;
+                    A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
+                        (Cf45_ij + Cf45_ji);
+                    A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                    A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                    A3_ij = A2_ij + Cf1_ij / f1;
+                    A3_ji = A2_ji + Cf1_ji / f1;
+
+                    /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f
+A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
+bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
+
+                    /* find corrected bond order values and their deriv coefs */
+                    bo_ij->BO = bo_ij->BO * A0_ij;
+                    bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
+                    bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
+                    bo_ij->BO_s = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+
+                    bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                    bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                    bo_ij->C3dbo = bo_ij->BO * A2_ji;
+
+                    bo_ij->C1dbopi = f1*f1*f4*f5;
+                    bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                    bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                    bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+
+                    bo_ij->C1dbopi2 = f1*f1*f4*f5;
+                    bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                    bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                    /* restored to mirror C4dbopi: this coefficient is read as
+                       coef.C4dbopi2 in the Add_dBond_to_Forces routines */
+                    bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+
+#ifdef TEST_FORCES
+                    /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                      i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
+
+                    /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                    //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
+                    workspace->orig_id[i], workspace->orig_id[j]
+                    A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
+                    bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
+                    bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
+                    bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
+                    bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
+                    ); */
+
+                    Calculate_dBO( i, pj, workspace, lists, &top_dbo );
+#endif
+                }
+
+                /* neglect bonds that are < 1e-10 */
+                if( bo_ij->BO < 1e-10 )
+                    bo_ij->BO = 0.0;
+                if( bo_ij->BO_s < 1e-10 )
+                    bo_ij->BO_s = 0.0;
+                if( bo_ij->BO_pi < 1e-10 )
+                    bo_ij->BO_pi = 0.0;
+                if( bo_ij->BO_pi2 < 1e-10 )
+                    bo_ij->BO_pi2 = 0.0;
+
+                workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+
+
+                /* fprintf( stderr, "%d %d\t%g %g %g %g\n
+Cdbo:\t%g %g %g\n
+Cdbopi:\t%g %g %g %g\n
+Cdbopi2:%g %g %g %g\n\n",
+i+1, j+1, bonds->select.bond_list[ pj ].d,
+bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2,
+bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
+bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi,
+bo_ij->C1dbopi2, bo_ij->C2dbopi2,
+bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
+
+                /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
+                   i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
+
+#ifdef TEST_FORCES
+                Set_End_Index( pj, top_dbo, &dBOs );
+                //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+#endif
+            }
+            /*
+               else {
+            // We only need to update bond orders from bo_ji
+            //   everything else is set in uncorrected_bo calculations
+            sym_index = bonds.select.bond_list[pj].sym_index;
+            bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
+            bo_ij->BO = bo_ji->BO;
+            bo_ij->BO_s = bo_ji->BO_s;
+            bo_ij->BO_pi = bo_ji->BO_pi; 
+ bo_ij->BO_pi2 = bo_ji->BO_pi2; + + workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO + +#ifdef TEST_FORCES + //Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta ); +#endif +} + */ + } + +#ifdef TEST_FORCES + // fprintf( stderr, "dDelta computations\nj:" ); + Set_Start_Index( i, top_dDelta, &dDeltas ); + ptop_dDelta = &( dDeltas.select.dDelta_list[top_dDelta] ); + + for( pj = start_i; pj < end_i; ++pj ) { + j = bonds.select.bond_list[pj].nbr; + // fprintf( stderr, "%d ", j ); + + if( !rvec_isZero( workspace.dDelta[j] ) ) { + ptop_dDelta->wrt = j; + rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] ); + rvec_MakeZero( workspace.dDelta[j] ); + ++top_dDelta, ++ptop_dDelta; + } + + start_j = Start_Index(j, &bonds); + end_j = End_Index(j, &bonds); + for( pk = start_j; pk < end_j; ++pk ) { + k = bonds.select.bond_list[pk].nbr; + if( !rvec_isZero( workspace.dDelta[k] ) ) { + ptop_dDelta->wrt = k; + rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] ); + rvec_MakeZero( workspace.dDelta[k] ); + ++top_dDelta, ++ptop_dDelta; + } + } + } + + Set_End_Index( i, top_dDelta, &dDeltas ); + + /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj ) + fprintf( stdout, "dDel: %d %d [%g %g %g]\n", + i+1, dDeltas->select.dDelta_list[pj].wrt+1, + dDeltas->select.dDelta_list[pj].dVal[0], + dDeltas->select.dDelta_list[pj].dVal[1], + dDeltas->select.dDelta_list[pj].dVal[2] );*/ +#endif + //} + + /*fprintf(stderr,"\tCalculated actual bond orders ...\n" ); + fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", + "atom", "Delta", "Delta_e", "Delta_boc", "nlp", + "Delta_lp", "Clp", "dDelta_lp" );*/ + + /* + p_lp1 = g_params.l[15]; + + //get the kernel ID for the following computation + j = i; + + // Calculate some helper variables that are used at many places + // throughout force calculations + //for( j = 0; j < system->N; ++j ) { + type_j = atoms[j].type; + sbp_j = &(sbp[ type_j ]); + + workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency; + workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e; + workspace.Delta_boc[j] = workspace.total_bond_order[j] - + sbp_j->valency_boc; + + workspace.vlpex[j] = workspace.Delta_e[j] - + 2.0 * (int)(workspace.Delta_e[j]/2.0); + explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j])); + workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0); + workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j]; + workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]); + // Adri uses different dDelta_lp values than the ones in notes... // + workspace.dDelta_lp[j] = workspace.Clp[j]; + //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * + //((fabs(workspace->Delta_e[j]/2.0 - + // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 
1 : 0 ); + + if( sbp_j->mass > 21.0 ) { + workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); + workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; + workspace.dDelta_lp_temp[j] = 0.; + } + else { + workspace.nlp_temp[j] = workspace.nlp[j]; + workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; + workspace.dDelta_lp_temp[j] = workspace.Clp[j]; + } + + //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", + //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], + //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, + //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); + //} + */ + + //Print_Bonds( system, bonds, "sbonds.out" ); + +#if defined(DEBUG) + //fprintf( stderr, "Number of bonds: %d\n", num_bonds ); + //Print_Bond_Orders( system, control, data, workspace, lists, out_control ); +#endif +} + + +GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage workspace, list bonds, int N ) +{ + int i, j, pj; + int start_i, end_i; + int sym_index; + + bond_order_data *bo_ij, *bo_ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + start_i = Start_Index(i, &bonds); + end_i = End_Index(i, &bonds); + + for( pj = start_i; pj < end_i; ++pj ) { + + j = bonds.select.bond_list[pj].nbr; + bo_ij = &( bonds.select.bond_list[pj].bo_data ); + + if( i >= j ) { + // We only need to update bond orders from bo_ji + // everything else is set in uncorrected_bo calculations + sym_index = bonds.select.bond_list[pj].sym_index; + bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data); + bo_ij->BO = bo_ji->BO; + bo_ij->BO_s = bo_ji->BO_s; + bo_ij->BO_pi = bo_ji->BO_pi; + bo_ij->BO_pi2 = bo_ji->BO_pi2; + + workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO + } + } +} + + +GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, + static_storage workspace, int N ) +{ + int j, type_j; + real explp1; + real p_lp1; + single_body_parameters *sbp_i, *sbp_j; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + p_lp1 = g_params.l[15]; + + /* Calculate some helper variables that are used at many places + throughout force calculations */ + //for( j = 0; j < system->N; ++j ) { + type_j = atoms[j].type; + sbp_j = &(sbp[ type_j ]); + + workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency; + workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e; + workspace.Delta_boc[j] = workspace.total_bond_order[j] - + sbp_j->valency_boc; + + workspace.vlpex[j] = workspace.Delta_e[j] - + 2.0 * (int)(workspace.Delta_e[j]/2.0); + explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j])); + workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0); + workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j]; + workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]); + /* Adri uses different dDelta_lp values than the ones in notes... */ + workspace.dDelta_lp[j] = workspace.Clp[j]; + //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) * + //((fabs(workspace->Delta_e[j]/2.0 - + // (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 
1 : 0 ); + + if( sbp_j->mass > 21.0 ) { + workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency); + workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; + workspace.dDelta_lp_temp[j] = 0.; + } + else { + workspace.nlp_temp[j] = workspace.nlp[j]; + workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j]; + workspace.dDelta_lp_temp[j] = workspace.Clp[j]; + } + + //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n", + //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], + //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt, + //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] ); + //} + +} + + +//Import from the forces file. +GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, + static_storage workspace, list p_bonds, int ensemble, int N) +{ + int i, pj; + list *bonds = &p_bonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) + { + for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj) + { + //int j = bonds->select.bond_list[pj].nbr; + if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) + Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds ); + else + Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds ); + } + } +} + + +GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, + static_storage workspace, list p_bonds, int ensemble, int N) +{ + int i, pj; + list *bonds = &p_bonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) + { + if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) + Cuda_dbond_to_Forces_postprocess (i, atoms, bonds ); + } +} diff --git a/PuReMD-GPU/src/cuda_bond_orders.h b/PuReMD-GPU/src/cuda_bond_orders.h new file mode 100644 index 0000000000000000000000000000000000000000..4015b9fa34aa1e66843efc8dc1a49f1c75da749c --- /dev/null +++ b/PuReMD-GPU/src/cuda_bond_orders.h @@ -0,0 +1,48 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_BOND_ORDERS_H_ +#define __CUDA_BOND_ORDERS_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *, global_parameters , single_body_parameters *, + static_storage , int , int ); +GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *, global_parameters , single_body_parameters *, + two_body_parameters *, static_storage , list , list , list , int , int ); +GLOBAL void Cuda_Update_Uncorrected_BO ( static_storage , list , int ); +GLOBAL void Cuda_Update_Workspace_After_Bond_Orders( reax_atom *, global_parameters , single_body_parameters *, + static_storage , int ); +GLOBAL void Cuda_Compute_Total_Force (reax_atom *, simulation_data *, static_storage , list , int , int ); +GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *, simulation_data *, static_storage , list , int , int ); +//HOST_DEVICE void Cuda_Add_dBond_to_Forces( int, int, reax_atom *, static_storage*, list* ); +//HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int, int, reax_atom *, simulation_data*, static_storage*, list* ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/helpers.cu b/PuReMD-GPU/src/cuda_box.cu similarity index 87% rename from PuReMD-GPU/src/helpers.cu rename to PuReMD-GPU/src/cuda_box.cu index 29ae31e305598009053100ae239ac6dbeed59d44..a9eb16faac9970f6bc89eeec4a42241cd30f0724 100644 --- a/PuReMD-GPU/src/helpers.cu +++ b/PuReMD-GPU/src/cuda_box.cu @@ -18,11 +18,13 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "cuda_helpers.h" -#include "helpers.h" #include "box.h" -GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box *box, real d1, real d2, real d3) + +GLOBAL void k_compute_Inc_on_T3(reax_atom *atoms, unsigned int N, + simulation_box *box, real d1, real d2, real d3) { int index = blockIdx.x * blockDim.x + threadIdx.x; rvec dx; @@ -31,5 +33,7 @@ GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box dx[2] = d3; if (index < N ) + { Inc_on_T3( atoms[index].x, dx, box ); + } } diff --git a/PuReMD-GPU/src/matvec.h b/PuReMD-GPU/src/cuda_box.h similarity index 83% rename from PuReMD-GPU/src/matvec.h rename to PuReMD-GPU/src/cuda_box.h index e032febd4ef79968a7334e43c07bfb1db5e6f49c..913abc15867cf5ec9d8e87dad0fa56198de9b1ff 100644 --- a/PuReMD-GPU/src/matvec.h +++ b/PuReMD-GPU/src/cuda_box.h @@ -18,13 +18,14 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ - -#ifndef __MATVEC__H_ -#define __MATVEC__H_ +#ifndef __CUDA_BOX_H__ +#define __CUDA_BOX_H__ #include "mytypes.h" -GLOBAL void Cuda_Matvec (sparse_matrix , real *, real *, int ); -GLOBAL void Cuda_Matvec_csr (sparse_matrix , real *, real *, int ); + +GLOBAL void k_compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, + simulation_box *box, real d1, real d2, real d3); + #endif diff --git a/PuReMD-GPU/src/center_mass.cu b/PuReMD-GPU/src/cuda_center_mass.cu similarity index 91% rename from PuReMD-GPU/src/center_mass.cu rename to PuReMD-GPU/src/cuda_center_mass.cu index ea8f799846b0d8c794007762ba18869bc9686787..158d3a16489f20362bd854312eb39aa0dcec57e8 100644 --- a/PuReMD-GPU/src/center_mass.cu +++ b/PuReMD-GPU/src/cuda_center_mass.cu @@ -18,17 +18,13 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ +#include "cuda_center_mass.h" - - -#include "center_mass.h" #include "vector.h" -GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) + +GLOBAL void k_center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms, + rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ) { extern __shared__ rvec xcm[]; extern __shared__ rvec vcm[]; @@ -76,13 +72,9 @@ GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms } } -GLOBAL void center_of_mass (rvec *xcm, - rvec *vcm, - rvec *amcm, - rvec *res_xcm, - rvec *res_vcm, - rvec *res_amcm, - size_t n) + +GLOBAL void k_center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm, + rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ) { extern __shared__ rvec sh_xcm[]; extern __shared__ rvec sh_vcm[]; @@ -131,11 +123,9 @@ GLOBAL void center_of_mass (rvec *xcm, } } -GLOBAL void compute_center_mass (single_body_parameters *sbp, - reax_atom *atoms, - real *results, - real xcm0, real xcm1, real xcm2, - size_t n) + +GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp, reax_atom *atoms, + real *results, real xcm0, real xcm1, real xcm2, size_t n ) { extern __shared__ real xx[]; extern __shared__ real xy[]; @@ -160,11 +150,11 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp, xcm[1] = xcm1; xcm[2] = xcm2; - xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0; - if (i < n){ + if (i < n) + { m = sbp[ atoms[i].type ].mass; rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm ); xx[ xx_i ] = diff[0] * diff[0] * m; @@ -176,8 +166,10 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp, } __syncthreads (); - for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){ - if (threadIdx.x < offset){ + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if (threadIdx.x < offset) + { index = threadIdx.x + offset; xx[ threadIdx.x ] += xx[ index ]; xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ]; @@ -189,7 +181,8 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp, __syncthreads (); } - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { results [ blockIdx.x*6 ] = xx [ 0 ]; results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ]; results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ]; @@ -199,7 +192,8 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp, } } -GLOBAL void compute_center_mass (real *input, real *output, size_t n) + +GLOBAL void k_compute_center_mass( real *input, real *output, size_t n ) { extern __shared__ real xx[]; extern __shared__ real xy[]; diff --git a/PuReMD-GPU/src/cuda_center_mass.h b/PuReMD-GPU/src/cuda_center_mass.h new file mode 100644 index 0000000000000000000000000000000000000000..0c1d76ec63defe3cb6a0738a2843b10d02104bb5 --- /dev/null +++ b/PuReMD-GPU/src/cuda_center_mass.h @@ -0,0 +1,44 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_CENTER_MASS_H__ +#define __CUDA_CENTER_MASS_H__ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void k_center_of_mass_blocks( single_body_parameters *, reax_atom *, + rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ); +GLOBAL void k_center_of_mass( rvec *xcm, + rvec *vcm, rvec *amcm, rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ); +GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp, + reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, size_t n ); +GLOBAL void k_compute_center_mass( real *input, real *output, size_t n ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu index 2db79e3718e0c5ea3c54913e29147c71735552f9..1f50dbf3c74fae4d9ab73149d02c51dbec8ca10e 100644 --- a/PuReMD-GPU/src/cuda_copy.cu +++ b/PuReMD-GPU/src/cuda_copy.cu @@ -18,91 +18,96 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ +#include "cuda_copy.h" +#include "cuda_list.h" - -#include "cuda_copy.h" #include "vector.h" -void Sync_Host_Device (grid *host, grid *dev, enum cudaMemcpyKind dir) + +void Sync_Host_Device_Grid( grid *host, grid *dev, enum cudaMemcpyKind dir ) { - copy_host_device (host->top, dev->top, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP); + copy_host_device( host->top, dev->top, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP ); - copy_host_device (host->mark, dev->mark, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK); + copy_host_device( host->mark, dev->mark, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK ); - copy_host_device (host->start, dev->start, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START); + copy_host_device( host->start, dev->start, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START ); - copy_host_device (host->end, dev->end, - INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END); + copy_host_device( host->end, dev->end, + INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END ); - copy_host_device (host->atoms, dev->atoms, - INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS); + copy_host_device( host->atoms, dev->atoms, + INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS ); - copy_host_device (host->nbrs, dev->nbrs, - IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS); + copy_host_device( host->nbrs, dev->nbrs, + IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS ); - copy_host_device (host->nbrs_cp, dev->nbrs_cp, - RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP); + copy_host_device( host->nbrs_cp, dev->nbrs_cp, + RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP ); } -void Sync_Host_Device (reax_system *sys, enum cudaMemcpyKind dir) +void 
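+/* renamed from the Sync_Host_Device overloads: cuda_copy.h now declares
+   these functions under extern "C", which does not permit C++ overloading,
+   so each variant gets a distinct suffix */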
Sync_Host_Device_Sys( reax_system *sys, enum cudaMemcpyKind dir ) { - copy_host_device (sys->atoms, sys->d_atoms, - REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS); + copy_host_device( sys->atoms, sys->d_atoms, + REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS ); - copy_host_device (&(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX ); + copy_host_device( &(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX ); //synch bonds here. - copy_host_device (sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, + copy_host_device( sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, dir, RES_REAX_INT_SBP ); - copy_host_device (sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), + copy_host_device( sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), dir, RES_REAX_INT_TBP ); - copy_host_device (sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), + copy_host_device( sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), dir, RES_REAX_INT_THBP ); - copy_host_device (sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), + copy_host_device( sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), dir, RES_REAX_INT_HBP ); - copy_host_device (sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4), + copy_host_device( sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4), dir, RES_REAX_INT_FBP ); - copy_host_device (sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, + copy_host_device( sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, dir, RES_GLOBAL_PARAMS ); sys->reaxprm.d_gp.n_global = sys->reaxprm.gp.n_global; sys->reaxprm.d_gp.vdw_type = sys->reaxprm.gp.vdw_type; } -void Sync_Host_Device (simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir) + +void Sync_Host_Device_Data( simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir ) { - copy_host_device (host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA ); + copy_host_device( host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA ); } -void Sync_Host_Device (sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir ) + +void Sync_Host_Device_Mat( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir ) { - copy_host_device ( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY ); + copy_host_device( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY ); - copy_host_device ( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); - copy_host_device ( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, 
RES_SPARSE_MATRIX_ENTRY ); + copy_host_device( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX ); + copy_host_device( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY ); } -void Sync_Host_Device (output_controls *, control_params *, enum cudaMemcpyKind) + +void Sync_Host_Device_Control( output_controls *, control_params *, enum cudaMemcpyKind ) { } -void Sync_Host_Device (control_params *host, control_params *device, enum cudaMemcpyKind) + +void Sync_Host_Device_Params( control_params *host, control_params *device, enum cudaMemcpyKind ) { - copy_host_device (host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); + copy_host_device( host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); } -void Prep_Device_For_Output (reax_system *system, simulation_data *data ) +void Prep_Device_For_Output( reax_system *system, simulation_data *data ) { //int size = sizeof (simulation_data) - (2*sizeof (reax_timing) + sizeof (void *)); //unsigned long start_address = (unsigned long)data->d_simulation_data + (unsigned long) (2 * INT_SIZE + REAL_SIZE); @@ -112,7 +117,7 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data ) //fprintf (stderr, "size to copy --> %d \n", size ); //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost ); + //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost ); /* copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); @@ -126,7 +131,7 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data ) */ simulation_data local_data; - copy_host_device (&local_data, (simulation_data *)data->d_simulation_data, + copy_host_device( &local_data, (simulation_data *)data->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); data->E_BE = local_data.E_BE; data->E_Ov = local_data.E_Ov; @@ -141,43 +146,46 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data ) data->E_vdW = local_data.E_vdW; data->E_Ele = local_data.E_Ele; data->E_Kin = local_data.E_Kin; - rvec_Copy (data->int_press, local_data.int_press); - rvec_Copy (data->ext_press, local_data.ext_press); + rvec_Copy( data->int_press, local_data.int_press); + rvec_Copy( data->ext_press, local_data.ext_press); data->kin_press = local_data.kin_press; data->therm.T = local_data.therm.T; - //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost ); - Sync_Host_Device (system, cudaMemcpyDeviceToHost ); + //Sync_Host_Device_Sys( &system.g, &system.d_g, cudaMemcpyDeviceToHost ); + Sync_Host_Device_Sys( system, cudaMemcpyDeviceToHost ); } -void Sync_Host_Device (list *host, list *device, int type) + +void Sync_Host_Device_List( list *host, list *device, int type ) { //list is already allocated -- discard it first if (host->n > 0) - Delete_List (host, TYP_HOST); + { + Cuda_Delete_List( host ); + } //memory is allocated on the host - Make_List(device->n, device->num_intrs, type, host, TYP_HOST ); + Cuda_Make_List( device->n, device->num_intrs, type, host ); //memcpy the entries from device to host - copy_host_device 
(host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX ); - copy_host_device (host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX ); + copy_host_device( host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX ); + copy_host_device( host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX ); switch (type) { case TYP_BOND: - copy_host_device (host->select.bond_list, device->select.bond_list, + copy_host_device( host->select.bond_list, device->select.bond_list, BOND_DATA_SIZE * device->num_intrs, cudaMemcpyDeviceToHost, LIST_BOND_DATA ); break; case TYP_THREE_BODY: - copy_host_device (host->select.three_body_list, device->select.three_body_list, - sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA ); + copy_host_device( host->select.three_body_list, device->select.three_body_list, + sizeof( three_body_interaction_data ) * device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA ); break; default: - fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type ); - exit (1); + fprintf( stderr, "Unknown list synching from device to host ---- > %d \n", type ); + exit( 1 ); break; } } diff --git a/PuReMD-GPU/src/cuda_copy.h b/PuReMD-GPU/src/cuda_copy.h index 561a49fbcce0fbb7b4ff03a38874b0fcadb5d7e8..6b6f38ae3d12d7d00d8cd38eef996d61cb765bd5 100644 --- a/PuReMD-GPU/src/cuda_copy.h +++ b/PuReMD-GPU/src/cuda_copy.h @@ -18,24 +18,32 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ - - #ifndef __CUDA_COPY_H_ #define __CUDA_COPY_H_ #include "cuda_utils.h" -#include "cuda.h" + #include "mytypes.h" #include "list.h" -void Sync_Host_Device (grid *, grid *, enum cudaMemcpyKind); -void Sync_Host_Device (reax_system *, enum cudaMemcpyKind); -void Sync_Host_Device (control_params *, control_params *, enum cudaMemcpyKind); -void Sync_Host_Device (simulation_data *, simulation_data *, enum cudaMemcpyKind); -void Sync_Host_Device (sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind); -void Sync_Host_Device (output_controls *, enum cudaMemcpyKind); -void Prep_Device_For_Output (reax_system *, simulation_data *); -void Sync_Host_Device (list *host, list *device, int type); +#ifdef __cplusplus +extern "C" { +#endif + +void Sync_Host_Device_Grid( grid *, grid *, enum cudaMemcpyKind ); +void Sync_Host_Device_Sys( reax_system *, enum cudaMemcpyKind ); +void Sync_Host_Device_Params( control_params *, control_params *, enum cudaMemcpyKind ); +void Sync_Host_Device_Data( simulation_data *, simulation_data *, enum cudaMemcpyKind ); +void Sync_Host_Device_Mat( sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind ); +void Sync_Host_Device_Control( output_controls *, enum cudaMemcpyKind ); + +void Prep_Device_For_Output( reax_system *, simulation_data * ); +void Sync_Host_Device_List( list *host, list *device, int type ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/cuda_environment.cu b/PuReMD-GPU/src/cuda_environment.cu new file mode 100644 index 0000000000000000000000000000000000000000..cd6ae50d82b4716aef1d16f2f957dc5f4976cca7 --- /dev/null +++ b/PuReMD-GPU/src/cuda_environment.cu @@ -0,0 +1,71 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, 
haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_environment.h" + +#include "cuda_utils.h" + + +void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node ) +{ + + int deviceCount = 0; + cudaError_t flag; + cublasHandle_t cublasHandle; + cusparseHandle_t cusparseHandle; + cusparseMatDescr_t matdescriptor; + + flag = cudaGetDeviceCount( &deviceCount ); + + if ( flag != cudaSuccess || deviceCount < 1 ) + { + fprintf( stderr, "ERROR: no CUDA capable device(s) found. Terminating...\n" ); + exit( 1 ); + } + + //Calculate the # of GPUs per processor + //and assign the GPU for each process + //TODO: handle condition where # CPU procs > # GPUs + cudaSetDevice( rank % deviceCount ); + +#if defined(__CUDA_DEBUG__) + fprintf( stderr, "p:%d is using GPU: %d \n", rank, rank % deviceCount ); +#endif + + //CHANGE ORIGINAL + //cudaDeviceSetLimit( cudaLimitStackSize, 8192 ); + //cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 ); + //cudaCheckError( ); + + cublasCheckError( cublasCreate(&cublasHandle) ); + + cusparseCheckError( cusparseCreate(&cusparseHandle) ); + cusparseCheckError( cusparseCreateMatDescr(&matdescriptor) ); + cusparseSetMatType( matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL ); + cusparseSetMatIndexBase( matdescriptor, CUSPARSE_INDEX_BASE_ZERO ); + +} + + +void Cleanup_Cuda_Environment( ) +{ + cudaDeviceReset( ); + cudaDeviceSynchronize( ); +} diff --git a/PuReMD-GPU/src/cuda_environment.h b/PuReMD-GPU/src/cuda_environment.h new file mode 100644 index 0000000000000000000000000000000000000000..61f811db3354628a3067a3423442f875f16e6871 --- /dev/null +++ b/PuReMD-GPU/src/cuda_environment.h @@ -0,0 +1,39 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_ENVIRONMENT_H__ +#define __CUDA_ENVIRONMENT_H__ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Setup_Cuda_Environment( int, int, int ); +void Cleanup_Cuda_Environment( ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_forces.cu b/PuReMD-GPU/src/cuda_forces.cu new file mode 100644 index 0000000000000000000000000000000000000000..bf277b391ce0df0c5336ea0a0653b6863ca14fec --- /dev/null +++ b/PuReMD-GPU/src/cuda_forces.cu @@ -0,0 +1,2002 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_forces.h" + +#include "box.h" +#include "forces.h" +#include "index_utils.h" +#include "list.h" +#include "print_utils.h" +#include "system_props.h" +#include "vector.h" + +#include "cuda_utils.h" +#include "cuda_init.h" +#include "cuda_bond_orders.h" +#include "cuda_single_body_interactions.h" +#include "cuda_two_body_interactions.h" +#include "cuda_three_body_interactions.h" +#include "cuda_four_body_interactions.h" +#include "cuda_list.h" +#include "cuda_QEq.h" +#include "cuda_reduction.h" +#include "cuda_system_props.h" +#include "validation.h" + +#include "cudaProfiler.h" + + +void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + real t_start, t_elapsed; + real *spad = (real *)scratch; + rvec *rvec_spad; + + //Compute the bonded for interaction here. + //Step 1. +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); + fprintf (stderr, " Begin Bonded Forces ... 
%d x %d\n", BLOCKS, BLOCK_SIZE); +#endif + + Cuda_Calculate_Bond_Orders_Init<<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, + *dev_workspace, system->reaxprm.num_atom_types, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_Calculate_Bond_Orders<<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, + system->reaxprm.d_tbp, *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), + system->reaxprm.num_atom_types, system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_Update_Uncorrected_BO<<<BLOCKS, BLOCK_SIZE>>> + (*dev_workspace, *(dev_lists + BONDS), system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_Update_Workspace_After_Bond_Orders<<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, + *dev_workspace, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" ); +#endif + + //Step 2. +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + //cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH ); + cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH ); + + Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp, + (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), + system->N, system->reaxprm.num_atom_types, spad ); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_BE + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + //(spad + system->N, spad + system->N + 16, 16); + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "Cuda_Bond_Energy Done... \n"); +#endif + + //Step 3. 
+#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH ); + + test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + *dev_workspace, (simulation_data *)data->d_simulation_data, + *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, + spad, spad + 2 * system->N, spad + 4*system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + *dev_workspace, (simulation_data *)data->d_simulation_data, + *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, + spad, spad + 2 * system->N, spad + 4*system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + test_LonePair_Postprocess <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + *dev_workspace, (simulation_data *)data->d_simulation_data, + *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types); + cudaThreadSynchronize (); + cudaCheckError (); + + + //Reduction for E_Lp + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Ov + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Un + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 4*system->N, spad + 5*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "test_LonePair_postprocess Done... \n"); +#endif + + //Step 4. +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + + cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH); + k_Three_Body_Estimate<<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, + (control_params *)control->d_control, + *(dev_lists + BONDS), + system->N, (int *)spad); + cudaThreadSynchronize (); + cudaCheckError (); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Three_Body_Estimate... 
return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); +#endif + + int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs); + memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs); + copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH); + + int total_3body = thbody [0] * SAFE_ZONE; + for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) { + total_3body += thbody [x]*SAFE_ZONE; + thbody [x] += thbody [x-1]; + } + system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1]; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs); +#endif + + if (!system->init_thblist) + { + system->init_thblist = TRUE; + if(!Cuda_Make_List( (dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES )) { + fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); + exit( INIT_ERR ); + } +#ifdef __CUDA_MEM__ + fprintf (stderr, "Device memory allocated: three body list = %d MB\n", + sizeof (three_body_interaction_data) * total_3body / (1024*1024)); +#endif + } else { + if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) { + int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs); + + /*Delete Three-body list*/ + Cuda_Delete_List( dev_lists + THREE_BODIES ); + +#ifdef __CUDA_MEM__ + fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", + data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies); +#endif + /*Recreate Three-body list */ + if(!Cuda_Make_List( size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES )) { + fprintf( stderr, "Problem in initializing three-body list. 
Terminating!\n" ); + exit( INIT_ERR ); + } + } + } + + //copy the indexes into the thb list; + copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), + cudaMemcpyHostToDevice, LIST_INDEX); + copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), + cudaMemcpyHostToDevice, LIST_END_INDEX); + + free (thbody ); + +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + + cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); + + k_Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, + system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), + system->N, system->reaxprm.num_atom_types, + spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); + cudaThreadSynchronize (); + cudaCheckError (); + + //Not necessary to validate three-body list anymore, + // Estimate is already done at the beginning which makes sure that + // we have sufficient size for this list + //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step ); + + //Reduction for E_Ang + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Pen + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Coa + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 4*system->N, spad + 5*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 6*system->N); + Cuda_reduction_rvec<<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + Cuda_reduction_rvec<<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize( ); + cudaCheckError( ); + + real t_1, t_2; + t_1 = Get_Time( ); + //Sum up the f vector for each atom and collect the CdDelta from all the bonds + k_Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, (control_params *)control->d_control, + *dev_workspace, *(dev_lists + BONDS), system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); + t_2 = Get_Timing_Info( t_1 ); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf( stderr, "Three_Body_Interactions post process Timing %lf \n", t_2 ); + fprintf( 
stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed ); + fprintf( stderr, "Three_Body_Interactions Done... \n" ); +#endif + + //Step 5. +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + + cuda_memset( spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); + //k_Four_Body_Interactions<<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>> + k_Four_Body_Interactions<<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_fbp, + (control_params *)control->d_control, *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), + (simulation_box *)system->d_box, (simulation_data *)data->d_simulation_data, + *dev_workspace, system->N, system->reaxprm.num_atom_types, + spad, spad + 2*system->N, (rvec *) (spad + 4*system->N) ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + //Reduction for E_Tor + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for E_Con + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 4*system->N); + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Post process here + k_Four_Body_Postprocess<<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, *dev_workspace, *(dev_lists + BONDS), + system->N ); + cudaThreadSynchronize( ); + cudaCheckError( ); + +#ifdef __DEBUG_CUDA__ + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, " Four_Body_ Done... \n"); +#endif + + //Step 6. + if (control->hb_cut > 0) { +#ifdef __DEBUG_CUDA__ + t_start = Get_Time( ); +#endif + cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH ); + + /* + k_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>> + ( system->d_atoms, + system->reaxprm.d_sbp, + system->reaxprm.d_hbp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->reaxprm.num_atom_types, + spad, (rvec *) (spad + 2*system->N), NULL); + cudaThreadSynchronize (); + cudaCheckError (); + */ + +#ifdef __DEBUG_CUDA__ + real test1,test2; + test1 = Get_Time (); +#endif + + int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + + (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 
0 : 1); + k_Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE ) >>> + ( system->d_atoms, + system->reaxprm.d_sbp, + system->reaxprm.d_hbp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *dev_workspace, + *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->reaxprm.num_atom_types, + spad, (rvec *) (spad + 2*system->N), NULL); + cudaThreadSynchronize (); + cudaCheckError (); + +#ifdef __DEBUG_CUDA__ + test2 = Get_Timing_Info (test1); + fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2); +#endif + + //Reduction for E_HB + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + + //Reduction for ext_pres + rvec_spad = (rvec *) (spad + 2*system->N); + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //Post process here +#ifdef __DEBUG_CUDA__ + real t_1, t_2; + t_1 = Get_Time (); +#endif + + k_Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>> + ( system->d_atoms, + system->reaxprm.d_sbp, + *dev_workspace, + *(dev_lists + BONDS), + *(dev_lists + HBONDS), + *(dev_lists + FAR_NBRS), + system->N, + spad); //this is for the fix to use the shared memory + cudaThreadSynchronize (); + cudaCheckError (); + +#ifdef __DEBUG_CUDA__ + t_2 = Get_Timing_Info ( t_1 ); + fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); + t_1 = Get_Time (); +#endif + + //k_Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>> + k_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>> + ( system->d_atoms, + system->reaxprm.d_sbp, + *dev_workspace, + *(dev_lists + BONDS), + *(dev_lists + HBONDS), + *(dev_lists + FAR_NBRS), + system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + t_2 = Get_Timing_Info ( t_1 ); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); + t_elapsed = Get_Timing_Info( t_start ); + fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed ); + fprintf (stderr, "Hydrogen_Bond Done... 
\n"); +#endif + } + return; +} + + +void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, + simulation_data *data,static_storage *workspace, + list** lists, output_controls *out_control ) +{ + real t_start, t_elapsed; + real t1 = 0, t2 = 0; + real *spad = (real *) scratch; + rvec *rvec_spad; + int cblks; + + t_start = Get_Time( ); + Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); + t_elapsed = Get_Timing_Info( t_start ); + d_timing.QEq += t_elapsed; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed ); +#endif + + cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH ); + + t_start = Get_Time (); + if ( control->tabulate == 0) + { + cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + + ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1); + Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>> + ( system->d_atoms, + system->reaxprm.d_tbp, + system->reaxprm.d_gp, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *(dev_lists + FAR_NBRS), + spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), + system->reaxprm.num_atom_types, + system->N ) ; + cudaThreadSynchronize (); + cudaCheckError (); + } + else + { + cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + + ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1); + Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>> + ( (reax_atom *)system->d_atoms, + (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, + *(dev_lists + FAR_NBRS), + spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), + d_LR, + system->reaxprm.num_atom_types, + out_control->energy_update_freq, + system->N ) ; + + cudaThreadSynchronize (); + cudaCheckError (); + } + + t_elapsed = Get_Timing_Info (t_start ); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2)); + fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... 
%lf \n", (t_elapsed)); +#endif + + //Reduction on E_vdW + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (spad, spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + //reduction on E_Ele + Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (spad + 2*system->N, spad + 3*system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + rvec_spad = (rvec *) (spad + 4*system->N); + + //reduction on ext_press + Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> + (rvec_spad, rvec_spad + system->N, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> + (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); +} + + +void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n, + int num_bonds, int num_hbonds ) +{ + int i, flag; + list *bonds, *hbonds, *thblist; + int *bonds_start, *bonds_end; + int *hbonds_start, *hbonds_end; + int *mat_start, *mat_end; + int max_sparse_entries = 0; + + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + bonds_start = (int *) calloc (bonds->n, INT_SIZE); + bonds_end = (int *) calloc (bonds->n, INT_SIZE); + + hbonds_start = (int *) calloc (hbonds->n, INT_SIZE ); + hbonds_end = (int *) calloc (hbonds->n, INT_SIZE ); + + mat_start = (int *) calloc (workspace->H.n, INT_SIZE ); + mat_end = (int *) calloc (workspace->H.n, INT_SIZE ); + + copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + /* Sparse Matrix entries */ + +#ifdef __CUDA_TEST__ + /* + workspace->realloc.Htop = 0; + for (i = 0; i < workspace->H.n-1; i++) { + if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){ + workspace->realloc.Htop = mat_end[i] - mat_start[i]; + } + } + */ +#endif + + flag = -1; + workspace->realloc.Htop = 0; + for ( i = 0; i < n-1; i ++){ + + if( (mat_end[i] - mat_start[i]) > + (system->max_sparse_matrix_entries * DANGER_ZONE )) { + //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", + // step, i, mat_start[i], mat_end[i]); + if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) + workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; + } + + if ( (mat_end[i] > mat_start[i+1]) ){ + fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n", + step, flag, mat_end[i], mat_start[i+1]); + 
exit(INSUFFICIENT_SPACE); + } + } + + if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) { + if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) + workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; + //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d) -- %d \n", + // step, i, mat_start[i], mat_end[i], + // (int) (system->max_sparse_matrix_entries * DANGER_ZONE)); + + if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) { + fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n", + step, flag, mat_end[i], system->N * system->max_sparse_matrix_entries); + exit(INSUFFICIENT_SPACE); + } + } + + /* bond list */ +#ifdef __CUDA_TEST__ + //workspace->realloc.bonds = 1; +#endif + flag = -1; + workspace->realloc.num_bonds = 0; + for( i = 0; i < n-1; ++i ) { + workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); + if( bonds_end[i] >= bonds_start[i+1]-2 ) { + workspace->realloc.bonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", + // step, i, bonds_start [i], bonds_end[i]); + if( bonds_end[i] > bonds_start[i+1] ) + flag = i; + } + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", + step, flag, bonds_end[flag], bonds_start[flag+1] ); + exit(INSUFFICIENT_SPACE); + } + + workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); + if( bonds_end[i] >= bonds->num_intrs-2 ) { + workspace->realloc.bonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", + // step, i, bonds_start [i], bonds_end[i]); + + if( bonds_end[i] > bonds->num_intrs ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", + step, flag, bonds_end[i], bonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + + //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds); + + /* hbonds list */ + if( workspace->num_H > 0 ) { +#ifdef __CUDA_TEST__ + //workspace->realloc.hbonds = 1; +#endif + flag = -1; + workspace->realloc.num_hbonds = 0; + for( i = 0; i < workspace->num_H-1; ++i ) { + workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); + + if( (hbonds_end[i] - hbonds_start[i]) >= + (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", + // step, i, hbonds_start [i], hbonds_end[i]); + if( hbonds_end[i] > hbonds_start[i+1] ) + flag = i; + } + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n", + step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] ); + exit(INSUFFICIENT_SPACE); + } + + workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); + if( (hbonds_end[i] - hbonds_start[i]) >= + (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", + // step, i, hbonds_start [i], hbonds_end[i]); + + if( hbonds_end[i] > hbonds->num_intrs ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", + step, flag, hbonds_end[i], hbonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + } + + //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, 
workspace->realloc.num_hbonds); + + free (bonds_start); + free (bonds_end ); + + free (hbonds_start ); + free (hbonds_end ); + + free (mat_start ); + free (mat_end ); +} + + +void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step ) +{ + int *thb_start, *thb_end; + int i, flag; + + thb_start = (int *) calloc (thblist->n, INT_SIZE); + thb_end = (int *) calloc (thblist->n, INT_SIZE ); + + copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + /*three_body list*/ + flag = -1; + workspace->realloc.num_3body = 0; + for( i = 0; i < thblist->n-1; ++i ){ + if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) { + workspace->realloc.thbody = 1; + if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) { + flag = i; + break; + } + } + } + + if( flag > -1 ) { + //fprintf( stderr, "step%d-thbchk failed: i=%d end(i)=%d str(i+1)=%d\n", + // step, flag, thb_end[flag], thb_start[flag+1] ); + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs ); + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + + if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) { + workspace->realloc.thbody = 1; + + if( thb_end[i] > thblist->num_intrs ) { + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs ); + fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", + step, i, thb_start[i], thb_end[i], thblist->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + + free (thb_start); + free (thb_end); +} + + +GLOBAL void k_Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, + simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) { + + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop; + int flag; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + int temp; + + Htop = 0; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + indices[i] = Htop; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(atoms[j]); + + //CHANGE ORIGINAL + //if (i < j) continue; + //CHANGE ORIGINAL + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if( nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec)) <= + SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + + if( flag ){ + ++Htop; + } + } + + ++Htop; + + // mark the end of j list + indices[i] = Htop; +} + + +GLOBAL void k_Init_Forces( reax_atom *atoms, global_parameters g_params, control_params *control, + single_body_parameters *sbp, two_body_parameters *tbp, + simulation_data *data, simulation_box *box, static_storage workspace, + list far_nbrs, list bonds, list hbonds, + int N, int max_sparse_entries, int num_atom_types ) +{ + + int i, j, pj; + int start_i, end_i; + int 
type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real dr3gamij_1, dr3gamij_3, Tap; + //real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + //LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + H = &( workspace.H ); + //CHANGE ORIGINAL + //Htop = 0; + Htop = i * max_sparse_entries; + //CHANGE ORIGINAL + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + //for( i = 0; i < system->N; ++i ) + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + + H->start[i] = Htop; + H->end[i] = Htop; + + btop_i = End_Index( i, &bonds ); + sbp_i = &(sbp[type_i]); + ihb = ihb_top = -1; + + ihb = sbp_i->p_hbond; + + if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) + ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if( nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if (i > j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } else if (i < j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } + + if( flag ){ + + type_j = atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(sbp[type_j]); + twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]); + self_coef = (i == j) ? 
0.5 : 1.0; + + /* H matrix entry */ + + //CHANGE ORIGINAL + //if (i > j) { + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; + + ++Htop; + //} + //CHANGE ORIGINAL + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + + if (ihb == 1 && jhb == 2) { + if (i > j) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } else { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } else if (ihb == 2 && jhb == 1) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + //TODO + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + + if( BO >= control->bo_cut ) { + //CHANGE ORIGINAL + num_bonds += 1; + //CHANGE ORIGINAL + + /****** bonds i-j and j-i ******/ + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + + if (i > j) + { + ibond = &( bonds.select.bond_list[btop_i] ); + ibond->nbr = j; + ibond->d = r_ij; + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + + //ibond->dbond_index = btop_i; + //ibond->sym_index = btop_j; + ++btop_i; + + bo_ij = &( ibond->bo_data ); + bo_ij->BO = BO; + bo_ij->BO_s = BO_s; + bo_ij->BO_pi = BO_pi; + bo_ij->BO_pi2 = BO_pi2; + + //Auxilary data structures + ibond->scratch = 0; + ibond->CdDelta_ij = 0; + rvec_MakeZero (ibond->f); + + ibond->l = -1; + ibond->CdDelta_jk = 0; + ibond->Cdbo_kl = 0; + rvec_MakeZero (ibond->i_f); + rvec_MakeZero (ibond->k_f); + + rvec_MakeZero (ibond->h_f); + + rvec_MakeZero (ibond->t_f); + + // Only dln_BOp_xx wrt. 
dr_i is stored here, note that + // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + + // Only dBOp wrt. dr_i is stored here, note that + // dBOp/dr_i = -dBOp/dr_j and all others are 0 + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + + rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp + + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + + + } else if ( i < j ) + { + rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; + rvec dBOp; + + btop_j = btop_i; + + jbond = &(bonds.select.bond_list[btop_j]); + jbond->nbr = j; + jbond->d = r_ij; + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + + btop_i ++; + //jbond->dbond_index = btop_i; + //jbond->sym_index = btop_i; + + bo_ji = &( jbond->bo_data ); + bo_ji->BO = BO; + bo_ji->BO_s = BO_s; + bo_ji->BO_pi = BO_pi; + bo_ji->BO_pi2 = BO_pi2; + + //Auxilary data structures + jbond->scratch = 0; + jbond->CdDelta_ij = 0; + rvec_MakeZero (jbond->f); + + jbond->l = -1; + jbond->CdDelta_jk = 0; + jbond->Cdbo_kl = 0; + rvec_MakeZero (jbond->i_f); + rvec_MakeZero (jbond->k_f); + + rvec_MakeZero (jbond->h_f); + + rvec_MakeZero (jbond->t_f); + + // Only dln_BOp_xx wrt. dr_i is stored here, note that + // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 + rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi2, + -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); + + rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); + + // Only dBOp wrt. dr_i is stored here, note that + // dBOp/dr_i = -dBOp/dr_j and all others are 0 + rvec_Scale( dBOp, + -(BO_s * Cln_BOp_s + + BO_pi * Cln_BOp_pi + + BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec ); + rvec_Scale( bo_ji->dBOp, -1., dBOp ); + + rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp ); + + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp + + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + + } + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = sbp[type_i].eta; + ++Htop; + + H->end[i] = Htop; + + Set_End_Index( i, btop_i, &bonds ); + if( ihb == 1 || ihb == 2) + Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); + + //fprintf( stderr, "%d bonds start: %d, end: %d\n", + // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); + //} + + // mark the end of j list + //H->start[i] = Htop; + /* validate lists - decide if reallocation is required! 
*/ + //Validate_Lists( workspace, lists, + // data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); +} + + +GLOBAL void k_Init_Forces_Tab ( reax_atom *atoms, global_parameters g_params, control_params *control, + single_body_parameters *sbp, two_body_parameters *tbp, + simulation_data *data, simulation_box *box, static_storage workspace, + list far_nbrs, list bonds, list hbonds, + int N, int max_sparse_entries, int num_atom_types, + LR_lookup_table *d_LR) +{ + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int tmin, tmax, r; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + H = &(workspace.H); + //CHANGE ORIGINAL + Htop = i * max_sparse_entries; + //CHANGE ORIGINAL + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = g_params.l[0]; + p_boc2 = g_params.l[1]; + + //for( i = 0; i < system->N; ++i ) + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + H->start[i] = Htop; + H->end[i] = Htop; + btop_i = End_Index( i, &bonds ); + sbp_i = &(sbp[type_i]); + ihb = ihb_top = -1; + + ihb = sbp_i->p_hbond; + + if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) + ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if(nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if (i > j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } + else if ( i < j) { + if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + } + + if( flag ){ + type_j = atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(sbp[type_j]); + twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + self_coef = (i == j) ? 
0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]); + + /* cubic spline interpolation */ + //CHANGE ORIGINAL + //if (i > j) { + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + val *= EV_to_KCALpMOL / C_ele; + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * val; + //H->j [Htop] = j; + //H->val [Htop] = self_coef * val; + ++Htop; + //} + //CHANGE ORIGINAL + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + + if ( ihb == 1 && jhb == 2 ) { + if (i > j) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } else { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = -1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } else if (ihb == 2 && jhb == 1) { + hbonds.select.hbond_list[ihb_top].nbr = j; + hbonds.select.hbond_list[ihb_top].scl = 1; + hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; + + //Auxilary data structures + rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); + hbonds.select.hbond_list[ihb_top].sym_index= -1; + ++ihb_top; + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + + //CHANGE ORIGINAL + num_bonds += 1; + //CHANGE ORIGINAL + + /****** bonds i-j and j-i ******/ + if ( i > j ) + { + ibond = &( bonds.select.bond_list[btop_i] ); + ibond->nbr = j; + ibond->d = r_ij; + + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + + //ibond->dbond_index = btop_i; + //ibond->sym_index = btop_j; + + ++btop_i; + + bo_ij = &( ibond->bo_data ); + bo_ij->BO = BO; + bo_ij->BO_s = BO_s; + bo_ij->BO_pi = BO_pi; + bo_ij->BO_pi2 = BO_pi2; + + //Auxilary data strucutres to resolve dependencies + ibond->scratch = 0; + ibond->CdDelta_ij = 0; + rvec_MakeZero (ibond->f); + + ibond->l = -1; + ibond->CdDelta_jk = 0; + ibond->Cdbo_kl = 0; + rvec_MakeZero (ibond->i_f); + rvec_MakeZero (ibond->k_f); + + rvec_MakeZero (ibond->h_f); + + rvec_MakeZero (ibond->t_f); + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = 
twbp->p_bo6 * C56 / r2; + + /* Only dln_BOp_xx wrt. dr_i is stored here, note that + dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + + /* Only dBOp wrt. dr_i is stored here, note that + dBOp/dr_i = -dBOp/dr_j and all others are 0 */ + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + + rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + + workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp + + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + } + else { + rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; + rvec dBOp; + + btop_j = btop_i; + + jbond = &( bonds.select.bond_list[btop_j] ); + jbond->nbr = j; + jbond->d = r_ij; + + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + + //jbond->dbond_index = btop_i; + //jbond->sym_index = btop_i; + + ++btop_i; + + bo_ji = &( jbond->bo_data ); + + bo_ji->BO = BO; + bo_ji->BO_s = BO_s; + bo_ji->BO_pi = BO_pi; + bo_ji->BO_pi2 = BO_pi2; + + // Auxilary data structures to resolve dependencies + jbond->scratch = 0; + jbond->CdDelta_ij = 0; + rvec_MakeZero (jbond->f); + + jbond->l = -1; + jbond->CdDelta_jk = 0; + jbond->Cdbo_kl = 0; + rvec_MakeZero (jbond->i_f); + rvec_MakeZero (jbond->k_f); + + rvec_MakeZero (jbond->h_f); + + rvec_MakeZero (jbond->t_f); + + // Bond Order page2-3, derivative of total bond order prime + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + // Only dln_BOp_xx wrt. dr_i is stored here, note that + // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 + + rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); + rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); + + rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); + + // Only dBOp wrt. 
dr_i is stored here, note that + // dBOp/dr_i = -dBOp/dr_j and all others are 0 + //CHANGE ORIGINAL + rvec_Scale( dBOp, + -(BO_s * Cln_BOp_s + + BO_pi * Cln_BOp_pi + + BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec); + rvec_Scale( bo_ji->dBOp, -1., dBOp); + //CHANGE ORIGINAL + + rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp ); + + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + + workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp + + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + } + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = sbp[type_i].eta; + + //H->j [Htop] = i; + //H->val [Htop] = sbp[type_i].eta; + + ++Htop; + + H->end[i] = Htop; + Set_End_Index( i, btop_i, &bonds ); + if( ihb == 1 || ihb == 2) + Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); +} + + +GLOBAL void k_fix_sym_dbond_indices (list pbonds, int N) +{ + int i, nbr; + bond_data *ibond, *jbond; + int atom_j; + + list *bonds = &pbonds; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) + { + ibond = &( bonds->select.bond_list [j] ); + nbr = ibond->nbr; + + for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++) + { + jbond = &( bonds->select.bond_list[ k ] ); + atom_j = jbond->nbr; + + if ( (atom_j == i) ) + { + if (i > nbr) { + ibond->dbond_index = j; + jbond->dbond_index = j; + + ibond->sym_index = k; + jbond->sym_index = j; + } + } + } + } +} + + +GLOBAL void k_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N) +{ + static_storage *workspace = &p_workspace; + hbond_data *ihbond, *jhbond; + int nbr; + + //int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4; + int i = (blockIdx.x); + int start = Start_Index (workspace->hbond_index[i], &hbonds); + int end = End_Index (workspace->hbond_index[i], &hbonds); + //int j = start + threadIdx.x; + //int j = start + (threadIdx.x % 16); + + //for (int j = Start_Index (workspace->hbond_index[i], &hbonds); + // j < End_Index (workspace->hbond_index[i], &hbonds); j++) + int j = start + threadIdx.x; + while (j < end) + //for (int j = start; j < end; j++) + { + ihbond = &( hbonds.select.hbond_list [j] ); + nbr = ihbond->nbr; + + int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); + int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); + + for (int k = nbrstart; k < nbrend; k++) + //k = nbrstart + threadIdx.x; + //while (k < nbrend) + { + jhbond = &( hbonds.select.hbond_list [k] ); + + if (jhbond->nbr == i){ + ihbond->sym_index = k; + jhbond->sym_index = j; + break; + } + + //k += blockDim.x; + } + + j += 32; + } +} + + +GLOBAL void k_New_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N ) +{ + + static_storage *workspace = &p_workspace; + hbond_data *ihbond, *jhbond; + + int __THREADS_PER_ATOM__ = HBONDS_SYM_THREADS_PER_ATOM; + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N) return; + + int i = warp_id; + int nbr; + int k; + int start = Start_Index (workspace->hbond_index[i], &hbonds); + int end = End_Index (workspace->hbond_index[i], &hbonds); + int j = start + lane_id; + //for (int j = start; j < end; j++) + while (j < end) + { + ihbond = &( hbonds.select.hbond_list [j] ); + nbr = ihbond->nbr; + + int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); + int 
nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); + + //k = nbrstart + lane_id; + //if (lane_id == 0) found [my_bucket] = 0; + //while (k < nbrend) + for (k = nbrstart; k < nbrend; k++) + { + jhbond = &( hbonds.select.hbond_list [k] ); + + if (jhbond->nbr == i){ + ihbond->sym_index = k; + jhbond->sym_index = j; + break; + } + } + + j += __THREADS_PER_ATOM__; + } +} + + +GLOBAL void k_Estimate_Storage_Sizes(reax_atom *atoms, + int N, single_body_parameters *sbp, + two_body_parameters *tbp, + global_parameters gp, + control_params *control, + list far_nbrs, + int num_atom_types, int *results) +{ + int *Htop = &results[0]; + int *num_3body = &results[1]; + int *hb_top = &results [ 2 ]; + int *bond_top = &results [ 2 + N ]; + + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int ihb, jhb; + real r_ij, r2; + real C12, C34, C56; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + p_boc1 = gp.l[0]; + p_boc2 = gp.l[1]; + + //for( i = 0; i < N; ++i ) { + i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= N ) return ; + + atom_i = &(atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, &far_nbrs); + end_i = End_Index(i, &far_nbrs); + sbp_i = &(sbp[type_i]); + ihb = sbp_i->p_hbond; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &( atoms[j] ); + type_j = atom_j->type; + sbp_j = &( sbp[type_j] ); + twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); + + + if( nbr_pj->d <= control->r_cut ) { + //++(*Htop); + atomicAdd(Htop, 1); + + /* hydrogen bond lists */ + //TODO - CHANGE ORIGINAL + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) + //++hb_top[i]; + atomicAdd(&hb_top[i], 1); + else if( ihb == 2 && jhb == 1 ) + //++hb_top[j]; + //atomicAdd(&hb_top[j], 1); + atomicAdd(&hb_top[i], 1); + } + //TODO -- CHANGE ORIGINAL + + //CHANGE ORIGINAL + if (i < j) continue; + //CHANGE ORIGINAL + + + /* uncorrected bond orders */ + if( nbr_pj->d <= control->nbr_cut ) { + r_ij = nbr_pj->d; + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + //++bond_top[i]; + //++bond_top[j]; + atomicAdd(&bond_top[i], 1); + atomicAdd(&bond_top[j], 1); + } + } + } + } + //} +} + + +void Cuda_Estimate_Storage_Sizes (reax_system *system, control_params *control, int *output) +{ + int *Htop, *num_3body, input_size; + int *hb_top, *bond_top; + int *input = (int *) scratch; + int max_3body = 0; + + Htop = 0; + num_3body = 0; + input_size = INT_SIZE * (2 * system->N + 1 + 1); + + //cuda_malloc ((void **) &input, input_size, 1, __LINE__); + cuda_memset (input, 0, input_size, RES_SCRATCH ); + + k_Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>> + (system->d_atoms, system->N, system->reaxprm.d_sbp, 
system->reaxprm.d_tbp, + system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), + system->reaxprm.num_atom_types, input); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ ); + + Htop = &output[0]; + num_3body = &output[1]; + hb_top = &output[ 2 ]; + bond_top = &output[ 2 + system->N ]; + + *Htop += system->N; + *Htop *= SAFE_ZONE; + + for( int i = 0; i < system->N; ++i ) { + hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); + + if (max_3body <= SQR (bond_top[i])) + max_3body = SQR (bond_top[i]); + + *num_3body += SQR(bond_top[i]); + bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); + } + + *num_3body = max_3body * SAFE_ZONE; +} + + +void Cuda_Compute_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list** lists, output_controls *out_control ) +{ + real t_start, t_elapsed; + real t_1, t_2; + int *indices; + int *Htop; + int max_sparse_entries = 0; + list *far_nbrs = dev_lists + FAR_NBRS; + int hblocks; + + t_start = Get_Time (); + if ( !control->tabulate ) { + k_Init_Forces <<<BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, + *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); + cudaThreadSynchronize (); + cudaCheckError (); + } + else + { + k_Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>> + ( system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, + system->reaxprm.d_sbp, system->reaxprm.d_tbp, + (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, + *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), + system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, + d_LR ); + cudaThreadSynchronize (); + cudaCheckError (); + } + + /*This is for bonds processing to fix dbond and sym_indexes */ + t_1 = Get_Time (); + k_fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + t_2 = Get_Timing_Info ( t_1 ); + + //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds. + if ((control->hb_cut > 0) && (dev_workspace->num_H > 0)) + { + + hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + + ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1); + t_1 = Get_Time (); + /* + int bs = system->N; + int ss = 32; + fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); + */ + k_New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); + cudaThreadSynchronize (); + cudaCheckError (); + } + t_2 = Get_Timing_Info ( t_1 ); + + t_elapsed = Get_Timing_Info (t_start); + d_timing.init_forces+= t_elapsed; + + Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N, + system->num_bonds, system->num_hbonds ); +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Done with Cuda List Validation \n"); +#endif + + //Bonded Force Calculations here. 
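+
+    /* Editor's note -- the hblocks expression above is the usual integer
+     * ceiling-division idiom for sizing a grid in which
+     * HBONDS_SYM_THREADS_PER_ATOM threads cooperate on each atom. A
+     * hypothetical helper (not part of the original sources) that states
+     * the intent directly:
+     *
+     *     static int calc_blocks( int units, int block_size )
+     *     {
+     *         //smallest b such that b * block_size >= units
+     *         return units / block_size + (units % block_size == 0 ? 0 : 1);
+     *     }
+     *
+     * with which the launch above would read
+     *
+     *     hblocks = calc_blocks( system->N * HBONDS_SYM_THREADS_PER_ATOM,
+     *             HBONDS_SYM_BLOCK_SIZE );
+     */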
+    t_start = Get_Time ();
+    Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.bonded += t_elapsed;
+
+    //Compute the non-bonded forces here.
+    t_start = Get_Time ();
+    Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.nonb += t_elapsed;
+
+    //Compute the total forces here.
+    Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace,
+         *(dev_lists + BONDS), control->ensemble, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace,
+         *(dev_lists + BONDS), control->ensemble, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+}
+
+
+int validate_device (reax_system *system, simulation_data *data, static_storage *workspace, list **lists )
+{
+    int retval = FALSE;
+
+#ifdef __BUILD_DEBUG__
+
+    retval |= validate_neighbors (system, lists);
+    retval |= validate_sym_dbond_indices (system, workspace, lists);
+    retval |= validate_bonds (system, workspace, lists);
+    retval |= validate_sparse_matrix (system, workspace);
+    retval |= validate_three_bodies (system, workspace, lists );
+    retval |= validate_hbonds (system, workspace, lists);
+    retval |= validate_workspace (system, workspace, lists);
+    retval |= validate_data (system, data);
+    retval |= validate_atoms (system, lists);
+    //analyze_hbonds (system, workspace, lists);
+
+    if (!retval) {
+        fprintf (stderr, "Results *DO NOT* match between device and host \n");
+    }
+#endif
+
+    return retval;
+}
diff --git a/PuReMD-GPU/src/cuda_forces.h b/PuReMD-GPU/src/cuda_forces.h
new file mode 100644
index 0000000000000000000000000000000000000000..b017e63ebeee45c03d8926a79f1a9a0dd7a4771d
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_forces.h
@@ -0,0 +1,48 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_FORCES_H_ +#define __CUDA_FORCES_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void k_Estimate_Sparse_Matrix_Entries ( reax_atom *, control_params *, + simulation_data *, simulation_box *, list, int, int * ); + +void Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*, + static_storage*, list**, output_controls* ); + +void Cuda_Estimate_Storage_Sizes (reax_system *, control_params *, int *); + +void Cuda_Threebody_List( reax_system *, static_storage *, list *, int ); + +int validate_device (reax_system *, simulation_data *, static_storage *, list **); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/cuda_four_body_interactions.cu similarity index 53% rename from PuReMD-GPU/src/four_body_interactions.cu rename to PuReMD-GPU/src/cuda_four_body_interactions.cu index d7bf757eff65253989cfe58d1ac4dfabd63d602a..60d9973482ddd61a34d874301f1ed360443625b9 100644 --- a/PuReMD-GPU/src/four_body_interactions.cu +++ b/PuReMD-GPU/src/cuda_four_body_interactions.cu @@ -18,20 +18,19 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "four_body_interactions.h" -#include "bond_orders.h" +#include "cuda_four_body_interactions.h" + #include "box.h" +#include "index_utils.h" #include "list.h" -#include "lookup.h" #include "vector.h" -#include "math.h" -#include "index_utils.h" #include "cuda_helpers.h" #define MIN_SINE 1e-10 -HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk, + +DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk, rvec dvec_kl, real r_kl, rvec dvec_li, real r_li, three_body_interaction_data *p_ijk, three_body_interaction_data *p_jkl, @@ -72,7 +71,6 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_ hnhd = r_ij * r_kl * cos_ijk * sin_jkl; hnhe = r_ij * r_kl * sin_ijk * cos_jkl; - poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl; if( poem < 1e-20 ) poem = 1e-20; @@ -81,9 +79,14 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_ r_jk * r_kl * cos_jkl ); arg = tel / poem; - if( arg > 1.0 ) arg = 1.0; - if( arg < -1.0 ) arg = -1.0; - + if( arg > 1.0 ) + { + arg = 1.0; + } + if( arg < -1.0 ) + { + arg = -1.0; + } /*fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", @@ -111,10 +114,22 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_ -p_jkl->dcos_dk[1]/sin_jkl, -p_jkl->dcos_dk[2]/sin_jkl );*/ - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE; - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE; + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) + { + sin_ijk = MIN_SINE; + } + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) + { + sin_ijk = -MIN_SINE; + } + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) + { + sin_jkl = MIN_SINE; + } + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) + { + sin_jkl = -MIN_SINE; + } // dcos_omega_di rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li ); @@ -145,532 +160,7 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_ } - - - -void Four_Body_Interactions( reax_system *system, control_params 
*control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, j, k, l, pi, pj, pk, pl, pij, plk; - int type_i, type_j, type_k, type_l; - int start_j, end_j, start_k, end_k; - int start_pj, end_pj, start_pk, end_pk; - int num_frb_intrs = 0; - - real Delta_j, Delta_k; - real r_ij, r_jk, r_kl, r_li; - real BOA_ij, BOA_jk, BOA_kl; - - real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; - real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; - real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; - real fn10, f11_DjDk, dfn11, fn12; - - real theta_ijk, theta_jkl; - real sin_ijk, sin_jkl; - real cos_ijk, cos_jkl; - real tan_ijk_i, tan_jkl_i; - - real omega, cos_omega, cos2omega, cos3omega; - rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; - - real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; - real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; - real Cconj, CEconj1, CEconj2, CEconj3; - real CEconj4, CEconj5, CEconj6; - - real e_tor, e_con; - rvec dvec_li; - rvec force, ext_press; - ivec rel_box_jl; - // rtensor total_rtensor, temp_rtensor; - - four_body_header *fbh; - four_body_parameters *fbp; - bond_data *pbond_ij, *pbond_jk, *pbond_kl; - bond_order_data *bo_ij, *bo_jk, *bo_kl; - three_body_interaction_data *p_ijk, *p_jkl; - - real p_tor2 = system->reaxprm.gp.l[23]; - real p_tor3 = system->reaxprm.gp.l[24]; - real p_tor4 = system->reaxprm.gp.l[25]; - real p_cot2 = system->reaxprm.gp.l[27]; - - list *bonds = (*lists) + BONDS; - list *thb_intrs = (*lists) + THREE_BODIES; - - - for( j = 0; j < system->N; ++j ) { - type_j = system->atoms[j].type; - Delta_j = workspace->Delta_boc[j]; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - - for( pk = start_j; pk < end_j; ++pk ) { - pbond_jk = &( bonds->select.bond_list[pk] ); - k = pbond_jk->nbr; - bo_jk = &( pbond_jk->bo_data ); - BOA_jk = bo_jk->BO - control->thb_cut; - - /* see if there are any 3-body interactions involving j&k - where j is the central atom. 
Otherwise there is no point in - trying to form a 4-body interaction out of this neighborhood */ - if( j < k && bo_jk->BO > control->thb_cut/*0*/ && - Num_Entries(pk, thb_intrs) ) { - start_k = Start_Index(k, bonds); - end_k = End_Index(k, bonds); - pj = pbond_jk->sym_index; // pj points to j on k's list - - /* do the same check as above: are there any 3-body interactions - involving k&j where k is the central atom */ - if( Num_Entries(pj, thb_intrs) ) { - type_k = system->atoms[k].type; - Delta_k = workspace->Delta_boc[k]; - r_jk = pbond_jk->d; - - start_pk = Start_Index(pk, thb_intrs ); - end_pk = End_Index(pk, thb_intrs ); - start_pj = Start_Index(pj, thb_intrs ); - end_pj = End_Index(pj, thb_intrs ); - - exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); - exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); - exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); - exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); - exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); - f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; - - - /* pick i up from j-k interaction where j is the centre atom */ - for( pi = start_pk; pi < end_pk; ++pi ) { - p_ijk = &( thb_intrs->select.three_body_list[pi] ); - pij = p_ijk->pthb; // pij is pointer to i on j's bond_list - pbond_ij = &( bonds->select.bond_list[pij] ); - bo_ij = &( pbond_ij->bo_data ); - - - if( bo_ij->BO > control->thb_cut/*0*/ ) { - i = p_ijk->thb; - type_i = system->atoms[i].type; - r_ij = pbond_ij->d; - BOA_ij = bo_ij->BO - control->thb_cut; - - theta_ijk = p_ijk->theta; - sin_ijk = SIN( theta_ijk ); - cos_ijk = COS( theta_ijk ); - //tan_ijk_i = 1. / TAN( theta_ijk ); - if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) - tan_ijk_i = cos_ijk / MIN_SINE; - else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) - tan_ijk_i = cos_ijk / -MIN_SINE; - else tan_ijk_i = cos_ijk / sin_ijk; - - exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); - exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); - - /* pick l up from j-k intr. where k is the centre */ - for( pl = start_pj; pl < end_pj; ++pl ) { - p_jkl = &( thb_intrs->select.three_body_list[pl] ); - l = p_jkl->thb; - plk = p_jkl->pthb; //pointer to l on k's bond_list! - pbond_kl = &( bonds->select.bond_list[plk] ); - bo_kl = &( pbond_kl->bo_data ); - type_l = system->atoms[l].type; - fbh = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm ) ]); - fbp = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm )].prm[0]); - - if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && - bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ - ++num_frb_intrs; - r_kl = pbond_kl->d; - BOA_kl = bo_kl->BO - control->thb_cut; - - theta_jkl = p_jkl->theta; - sin_jkl = SIN( theta_jkl ); - cos_jkl = COS( theta_jkl ); - //tan_jkl_i = 1. / TAN( theta_jkl ); - if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) - tan_jkl_i = cos_jkl / MIN_SINE; - else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) - tan_jkl_i = cos_jkl / -MIN_SINE; - else tan_jkl_i = cos_jkl /sin_jkl; - - Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, - &(system->box), dvec_li ); - r_li = rvec_Norm( dvec_li ); - - - /* omega and its derivative */ - //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, - omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, - r_jk, pbond_kl->dvec, r_kl, - dvec_li, r_li, p_ijk, p_jkl, - dcos_omega_di, dcos_omega_dj, - dcos_omega_dk, dcos_omega_dl, - out_control); - cos_omega = COS( omega ); - cos2omega = COS( 2. * omega ); - cos3omega = COS( 3. 
* omega ); - /* end omega calculations */ - - /* torsion energy */ - exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk)); - exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); - exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) ); - fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * - (1.0 - exp_tor2_kl); - - CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + - fbp->V2 * exp_tor1 * (1.0 - cos2omega) + - fbp->V3 * (1.0 + cos3omega) ); - //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + - // fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) + - // fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega); - - data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV; - - dfn11 = (-p_tor3 * exp_tor3_DjDk + - (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * - (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv; - - CEtors1 = sin_ijk * sin_jkl * CV; - - CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * - (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * - sin_ijk * sin_jkl; - - CEtors3 = CEtors2 * dfn11; - - CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * - (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); - - CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl); - - CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl * - (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk); - - cmn = -fn10 * CV; - CEtors7 = cmn * sin_jkl * tan_ijk_i; - CEtors8 = cmn * sin_ijk * tan_jkl_i; - CEtors9 = fn10 * sin_ijk * sin_jkl * - (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - 1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega))); - //cmn = -fn10 * CV; - //CEtors7 = cmn * sin_jkl * cos_ijk; - //CEtors8 = cmn * sin_ijk * cos_jkl; - //CEtors9 = fn10 * sin_ijk * sin_jkl * - // (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + - // fbp->V3 * (6*SQR(cos_omega) - 1.50)); - /* end of torsion energy */ - - - /* 4-body conjugation energy */ - fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; - data->E_Con += e_con = fbp->p_cot1 * fn12 * - (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); - - Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * - (1. + (SQR(cos_omega)-1.) 
* sin_ijk*sin_jkl); - - CEconj1 = Cconj * (BOA_ij - 1.5e0); - CEconj2 = Cconj * (BOA_jk - 1.5e0); - CEconj3 = Cconj * (BOA_kl - 1.5e0); - - CEconj4 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; - CEconj5 = -fbp->p_cot1 * fn12 * - (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; - //CEconj4 = -fbp->p_cot1 * fn12 * - // (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk; - //CEconj5 = -fbp->p_cot1 * fn12 * - // (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl; - CEconj6 = 2.0 * fbp->p_cot1 * fn12 * - cos_omega * sin_ijk * sin_jkl; - /* end 4-body conjugation energy */ - - //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ", - // workspace->orig_id[i], workspace->orig_id[j], - // workspace->orig_id[k], workspace->orig_id[l], - // omega, cos_omega, cos2omega, cos3omega ); - //fprintf(stdout, - // "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - // CEtors2, CEtors3, CEtors4, CEtors5, - // CEtors6, CEtors7, CEtors8, CEtors9 ); - //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - // theta_ijk, theta_jkl, sin_ijk, - // sin_jkl, cos_jkl, tan_jkl_i ); - - /* forces */ - bo_jk->Cdbopi += CEtors2; - workspace->CdDelta[j] += CEtors3; - workspace->CdDelta[k] += CEtors3; - bo_ij->Cdbo += (CEtors4 + CEconj1); - bo_jk->Cdbo += (CEtors5 + CEconj2); - - bo_kl->Cdbo += (CEtors6 + CEconj3); - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - /* dcos_theta_ijk */ - rvec_ScaledAdd( system->atoms[i].f, - CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( system->atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - rvec_ScaledAdd( system->atoms[k].f, - CEtors7 + CEconj4, p_ijk->dcos_di ); - - /* dcos_theta_jkl */ - rvec_ScaledAdd( system->atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( system->atoms[k].f, - CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( system->atoms[l].f, - CEtors8 + CEconj5, p_jkl->dcos_dk ); - - /* dcos_omega */ - rvec_ScaledAdd( system->atoms[i].f, - CEtors9 + CEconj6, dcos_omega_di ); - rvec_ScaledAdd( system->atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( system->atoms[k].f, - CEtors9 + CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( system->atoms[l].f, - CEtors9 + CEconj6, dcos_omega_dl ); - } - else { - ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); - - /* dcos_theta_ijk */ - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_ScaledAdd( system->atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - - rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* dcos_theta_jkl */ - rvec_ScaledAdd( system->atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); - rvec_Add( system->atoms[l].f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* dcos_omega */ - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - 
rvec_ScaledAdd( system->atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); - rvec_Add( system->atoms[l].f, force ); - rvec_iMultiply( ext_press, rel_box_jl, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* This part is intended for a fully-flexible box */ - /* rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_dk, // i - CEtors9 + CEconj6, dcos_omega_di ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[i].x ); - rtensor_Copy( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_dj, // j - CEtors8 + CEconj5, p_jkl->dcos_di ); - rvec_ScaledAdd( temp_rvec, - CEtors9 + CEconj6, dcos_omega_dj ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[j].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors7 + CEconj4, p_ijk->dcos_di, // k - CEtors8 + CEconj5, p_jkl->dcos_dj ); - rvec_ScaledAdd( temp_rvec, - CEtors9 + CEconj6, dcos_omega_dk ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[k].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, - CEtors8 + CEconj5, p_jkl->dcos_dk, // l - CEtors9 + CEconj6, dcos_omega_dl ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[l].x ); - rtensor_Copy( total_rtensor, temp_rtensor ); - - if( pbond_ij->imaginary || pbond_jk->imaginary || - pbond_kl->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } - -#ifdef TEST_ENERGY - /*fprintf( out_control->etor, - //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - //r_ij, r_jk, r_kl, - "%12.8f%12.8f%12.8f%12.8f\n", - cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/ - // fprintf( out_control->etor, "%12.8f\n", dfn11 ); - fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", - fn10, cos_omega, CV ); - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEtors2, CEtors3, CEtors4, CEtors5, - CEtors6, CEtors7, CEtors8, CEtors9 ); - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", - CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); - /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n", - fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ - - fprintf( out_control->etor, - //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", - "%6d%6d%6d%6d%12.8f%12.8f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], workspace->orig_id[l], - e_tor, e_con ); - //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor ); - - fprintf( out_control->econ, - "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], workspace->orig_id[l], - RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, - e_con,data->E_Con ); - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], - (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], - (CEtors7 + CEconj4)*p_ijk->dcos_dk[2], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], - (CEtors7 + CEconj4)*p_ijk->dcos_dj[2], - 
(CEtors7 + CEconj4)*p_ijk->dcos_di[0], - (CEtors7 + CEconj4)*p_ijk->dcos_di[1], - (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */ - - - /* fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - (CEtors8 + CEconj5)*p_jkl->dcos_di[0], - (CEtors8 + CEconj5)*p_jkl->dcos_di[1], - (CEtors8 + CEconj5)*p_jkl->dcos_di[2], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], - (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], - (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */ - - fprintf( out_control->etor, - "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", - dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], - dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], - dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2], - dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] ); -#endif - -#ifdef TEST_FORCES - // Torsion Forces - Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., - workspace->f_tor, workspace->f_tor); - Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); - Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); - Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); - Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); - Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); - - rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk); - rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj); - rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di); - - rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di); - rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj); - rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk); - - rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl ); - - // Conjugation Forces - Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); - Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); - Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); - - rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk); - rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj); - rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di); - - rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di); - rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj); - rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk); - - rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di ); - rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj ); - rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk ); - rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl ); -#endif - } // pl check ends - } // pl loop ends - } // pi check ends - } // pi loop ends - } // k-j neighbor check ends - } // j<k && j-k neighbor check ends - } // pk loop ends - } // j loop - - /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ - -#ifdef TEST_FORCES - fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs ); - fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", - data->E_Tor, data->E_Con ); -#endif -} - - -//////////////////////////////////////////////////////////////////////// -//Cuda Functions 
-//////////////////////////////////////////////////////////////////////// - -GLOBAL void Four_Body_Interactions ( reax_atom *atoms, +GLOBAL void k_Four_Body_Interactions ( reax_atom *atoms, global_parameters g_params, four_body_header *d_fbp, control_params *control, @@ -741,7 +231,6 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, list *thb_intrs = &p_thb_intrs; static_storage *workspace = &p_workspace; - //for( j = 0; j < system->N; ++j ) { type_j = atoms[j].type; Delta_j = workspace->Delta_boc[j]; @@ -836,8 +325,8 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, pbond_kl = &( bonds->select.bond_list[plk] ); bo_kl = &( pbond_kl->bo_data ); type_l = atoms[l].type; - fbh = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types) ]); - fbp = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); + fbh = &(d_fbp[ index_fbp(type_i,type_j,type_k,type_l,num_atom_types) ]); + fbp = &(d_fbp[ index_fbp(type_i,type_j,type_k,type_l,num_atom_types)].prm[0]); if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ @@ -889,7 +378,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, //PERFORMANCE IMPACT e_tor = fn10 * sin_ijk * sin_jkl * CV; - //atomicAdd (&data->E_Tor ,e_tor ); + //MYATOMICADD(&data->E_Tor ,e_tor ); E_Tor [j] += e_tor; //sh_tor [threadIdx.x] += e_tor; @@ -933,7 +422,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; //PERFORMANCE IMPACT e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); - //atomicAdd (&data->E_Con ,e_con ); + //MYATOMICADD(&data->E_Con ,e_con ); E_Con [j] += e_con ; //sh_con [threadIdx.x] += e_con; @@ -971,12 +460,12 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, /* forces */ //PERFORMANCE IMPACT /* - atomicAdd ( &bo_jk->Cdbopi, CEtors2 ); - atomicAdd ( &workspace->CdDelta[j], CEtors3 ); - atomicAdd ( &workspace->CdDelta[k], CEtors3 ); - atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) ); - atomicAdd ( &bo_jk->Cdbo, (CEtors5 + CEconj2) ); - atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) ); + MYATOMICADD( &bo_jk->Cdbopi, CEtors2 ); + MYATOMICADD( &workspace->CdDelta[j], CEtors3 ); + MYATOMICADD( &workspace->CdDelta[k], CEtors3 ); + MYATOMICADD( &bo_ij->Cdbo, (CEtors4 + CEconj1) ); + MYATOMICADD( &bo_jk->Cdbo, (CEtors5 + CEconj2) ); + MYATOMICADD( &bo_kl->Cdbo, (CEtors6 + CEconj3) ); */ //PERFORMANCE IMPACT @@ -987,39 +476,29 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, bo_jk->Cdbo += CEtors5 + CEconj2; //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE - atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 ); + MYATOMICADD(&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 ); //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) { /* dcos_theta_ijk */ //PERFORMANCE IMPACT - atomic_rvecScaledAdd (pbond_ij->i_f, - CEtors7 + CEconj4, p_ijk->dcos_dk ); - rvec_ScaledAdd( atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); - atomic_rvecScaledAdd( pbond_jk->k_f, - CEtors7 + CEconj4, p_ijk->dcos_di ); + atomic_rvecScaledAdd( pbond_ij->i_f, CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_ScaledAdd( atoms[j].f, CEtors7 + CEconj4, p_ijk->dcos_dj ); + atomic_rvecScaledAdd( pbond_jk->k_f, CEtors7 + CEconj4, p_ijk->dcos_di ); /* dcos_theta_jkl */ //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); - atomic_rvecScaledAdd( pbond_jk->i_f, - CEtors8 + CEconj5, p_jkl->dcos_dj ); - 
atomic_rvecScaledAdd( pbond_kl->k_f, - CEtors8 + CEconj5, p_jkl->dcos_dk ); + rvec_ScaledAdd( atoms[j].f, CEtors8 + CEconj5, p_jkl->dcos_di ); + atomic_rvecScaledAdd( pbond_jk->i_f, CEtors8 + CEconj5, p_jkl->dcos_dj ); + atomic_rvecScaledAdd( pbond_kl->k_f, CEtors8 + CEconj5, p_jkl->dcos_dk ); /* dcos_omega */ //PERFORMANCE IMPACT - atomic_rvecScaledAdd( pbond_ij->i_f, - CEtors9 + CEconj6, dcos_omega_di ); - rvec_ScaledAdd( atoms[j].f, - CEtors9 + CEconj6, dcos_omega_dj ); - atomic_rvecScaledAdd( pbond_jk->i_f, - CEtors9 + CEconj6, dcos_omega_dk ); - atomic_rvecScaledAdd( pbond_kl->k_f, - CEtors9 + CEconj6, dcos_omega_dl ); + atomic_rvecScaledAdd( pbond_ij->i_f, CEtors9 + CEconj6, dcos_omega_di ); + rvec_ScaledAdd( atoms[j].f, CEtors9 + CEconj6, dcos_omega_dj ); + atomic_rvecScaledAdd( pbond_jk->i_f, CEtors9 + CEconj6, dcos_omega_dk ); + atomic_rvecScaledAdd( pbond_kl->k_f, CEtors9 + CEconj6, dcos_omega_dl ); } else { ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); @@ -1033,8 +512,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, //rvec_Add (sh_press [threadIdx.x], ext_press); //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors7 + CEconj4, p_ijk->dcos_dj ); + rvec_ScaledAdd( atoms[j].f, CEtors7 + CEconj4, p_ijk->dcos_dj ); rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); //PERFORMANCE IMPACT @@ -1047,8 +525,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, /* dcos_theta_jkl */ //PERFORMANCE IMPACT - rvec_ScaledAdd( atoms[j].f, - CEtors8 + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( atoms[j].f, CEtors8 + CEconj5, p_jkl->dcos_di ); rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); //PERFORMANCE IMPACT @@ -1327,7 +804,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms, } -GLOBAL void Four_Body_Postprocess ( reax_atom *atoms, +GLOBAL void k_Four_Body_Postprocess( reax_atom *atoms, static_storage p_workspace, list p_bonds, int N ) { diff --git a/PuReMD-GPU/src/cuda_four_body_interactions.h b/PuReMD-GPU/src/cuda_four_body_interactions.h new file mode 100644 index 0000000000000000000000000000000000000000..088e24f4f15a1e83405b83427ba56d1e5d4e67ba --- /dev/null +++ b/PuReMD-GPU/src/cuda_four_body_interactions.h @@ -0,0 +1,42 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_FOUR_BODY_INTERACTIONS_H_ +#define __CUDA_FOUR_BODY_INTERACTIONS_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void k_Four_Body_Interactions( reax_atom *, global_parameters , + four_body_header *, control_params *, list , list , simulation_box *, + simulation_data *, static_storage , int , int , real *, real *, rvec * ); + +GLOBAL void k_Four_Body_Postprocess( reax_atom *, static_storage, list , int ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_grid.cu b/PuReMD-GPU/src/cuda_grid.cu new file mode 100644 index 0000000000000000000000000000000000000000..ca9556d6b43ffcac8a828d90f87e743d2647152c --- /dev/null +++ b/PuReMD-GPU/src/cuda_grid.cu @@ -0,0 +1,48 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_grid.h" + +#include "grid.h" +#include "index_utils.h" +#include "vector.h" + +#include "cuda_utils.h" +#include "cuda_reset_utils.h" + + +void Cuda_Bin_Atoms (reax_system *system, static_storage *workspace ) +{ + Cuda_Reset_Grid ( &system->d_g); + + Bin_Atoms ( system, workspace ); + + dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms; +} + + +void Cuda_Bin_Atoms_Sync (reax_system *system) +{ + copy_host_device (system->g.top, system->d_g.top, + INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP); + + copy_host_device (system->g.atoms, system->d_g.atoms, + INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS); +} diff --git a/PuReMD-GPU/src/cuda_grid.h b/PuReMD-GPU/src/cuda_grid.h new file mode 100644 index 0000000000000000000000000000000000000000..28a20797a56bda9682354f967634eeb6301e44ae --- /dev/null +++ b/PuReMD-GPU/src/cuda_grid.h @@ -0,0 +1,39 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_GRID_H_ +#define __CUDA_GRID_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Bin_Atoms( reax_system*, static_storage* ); +void Cuda_Bin_Atoms_Sync (reax_system *); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h index e021cf84a99b22713fa2fdc4a00af81db0ef5672..e306c673f8aec585bbda7b92250e2f0ec91eb2e3 100644 --- a/PuReMD-GPU/src/cuda_helpers.h +++ b/PuReMD-GPU/src/cuda_helpers.h @@ -21,9 +21,15 @@ #ifndef __CUDA_HELPERS__ #define __CUDA_HELPERS__ + #include "mytypes.h" -DEVICE inline int cuda_strcmp (char *a, char *b, int len) + +#ifdef __cplusplus +extern "C" { +#endif + +static inline DEVICE int cuda_strcmp(char *a, char *b, int len) { char *src, *dst; @@ -32,20 +38,25 @@ DEVICE inline int cuda_strcmp (char *a, char *b, int len) for (int i = 0; i < len; i++) { - if (*dst == '\0') + { return 0; + } - if (*src != *dst) return 1; + if (*src != *dst) + { + return 1; + } - src ++; - dst ++; + src++; + dst++; } return 0; } -DEVICE inline real atomicAdd(real* address, real val) + +static inline DEVICE double myAtomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -54,24 +65,31 @@ DEVICE inline real atomicAdd(real* address, real val) { assumed = old; old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); + __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } -DEVICE inline void atomic_rvecAdd( rvec ret, rvec v ) + +static inline DEVICE void atomic_rvecAdd( rvec ret, rvec v ) { - atomicAdd ( &ret[0], v[0] ); - atomicAdd ( &ret[1], v[1] ); - atomicAdd ( &ret[2], v[2] ); + MYATOMICADD( (double*)&ret[0], (double)v[0] ); + MYATOMICADD( (double*)&ret[1], (double)v[1] ); + MYATOMICADD( (double*)&ret[2], (double)v[2] ); } -DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v ) + +static inline DEVICE void atomic_rvecScaledAdd( rvec ret, real c, rvec v ) { - atomicAdd ( &ret[0], c * v[0] ); - atomicAdd ( &ret[1], c * v[1] ); - atomicAdd ( &ret[2], c * v[2] ); + MYATOMICADD( (double*)&ret[0], (double)(c * v[0]) ); + MYATOMICADD( (double*)&ret[1], (double)(c * v[1]) ); + MYATOMICADD( (double*)&ret[2], (double)(c * v[2]) ); +} + +#ifdef __cplusplus } +#endif + #endif diff --git a/PuReMD-GPU/src/cuda_init.cu b/PuReMD-GPU/src/cuda_init.cu index 09515038daa71b15ef2c5796f54f3c79c29063df..4ca4bac18d08052311d0bdcd83290b5617b66b55 100644 --- a/PuReMD-GPU/src/cuda_init.cu +++ b/PuReMD-GPU/src/cuda_init.cu @@ -18,59 +18,65 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ - - - #include "cuda_init.h" + #include "cuda_utils.h" #include "cuda_copy.h" +#include "cuda_reset_utils.h" + #include "vector.h" -#include "reset_utils.h" -void Cuda_Init_System ( reax_system *system) + +void Cuda_Init_System( reax_system *system) { - cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS ); + cuda_malloc( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS ); - cuda_malloc ( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX ); + cuda_malloc( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX ); //interaction parameters - cuda_malloc ((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE, + cuda_malloc((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE, 1, RES_REAX_INT_SBP ); - cuda_malloc ((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, + cuda_malloc((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, 1, RES_REAX_INT_TBP ); - cuda_malloc ((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE, + cuda_malloc((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE, 1, RES_REAX_INT_THBP ); - cuda_malloc ((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE, + cuda_malloc((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE, 1, RES_REAX_INT_HBP ); - cuda_malloc ((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE, + cuda_malloc((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE, 1, RES_REAX_INT_FBP ); - cuda_malloc ((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS ); + cuda_malloc((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS ); system->reaxprm.d_gp.n_global = 0; system->reaxprm.d_gp.vdw_type = 0; } -void Cuda_Init_Control (control_params *control) + +void Cuda_Init_Control(control_params *control) { - cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS ); - copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); + cuda_malloc((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS ); + copy_host_device(control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS ); } + void Cuda_Init_Simulation_Data (simulation_data *data) { - cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA ); + cuda_malloc((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA ); } -GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N) + +GLOBAL void Initialize_Grid(ivec *nbrs, rvec *nbrs_cp, int N) { int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index >= N) return; + if (index >= N) + { + return; + } nbrs[index][0] = -1; nbrs[index][1] = -1; @@ -80,6 +86,7 @@ GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N) nbrs_cp[index][2] = -1; } + void Cuda_Init_Grid (grid *host, grid *dev) { int total = host->ncell[0] * host->ncell[1] * host->ncell[2]; @@ -89,30 +96,31 @@ void Cuda_Init_Grid (grid *host, grid *dev) dev->max_cuda_nbrs = host->max_cuda_nbrs; dev->cell_size = 
host->cell_size; - ivec_Copy (dev->spread, host->spread); - ivec_Copy (dev->ncell, host->ncell); - rvec_Copy (dev->len, host->len); - rvec_Copy (dev->inv_len, host->inv_len); + ivec_Copy( dev->spread, host->spread ); + ivec_Copy( dev->ncell, host->ncell ); + rvec_Copy( dev->len, host->len ); + rvec_Copy( dev->inv_len, host->inv_len ); - cuda_malloc ((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP ); - cuda_malloc ((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK ); - cuda_malloc ((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START ); - cuda_malloc ((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END ); + cuda_malloc((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP ); + cuda_malloc((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK ); + cuda_malloc((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START ); + cuda_malloc((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END ); - cuda_malloc ((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS ); - cuda_malloc ((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS ); - cuda_malloc ((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP ); + cuda_malloc((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS ); + cuda_malloc((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS ); + cuda_malloc((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP ); int block_size = 512; int blocks = (total*dev->max_nbrs) / block_size + ((total*dev->max_nbrs) % block_size == 0 ? 0 : 1); - Initialize_Grid <<<blocks, block_size>>> - (dev->nbrs, dev->nbrs_cp, total * host->max_nbrs ); - cudaThreadSynchronize (); - cudaCheckError (); + Initialize_Grid<<<blocks, block_size>>> + ( dev->nbrs, dev->nbrs_cp, total * host->max_nbrs ); + cudaThreadSynchronize( ); + cudaCheckError( ); } -GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms, + +GLOBAL void Init_Workspace_Arrays(single_body_parameters *sbp, reax_atom *atoms, static_storage workspace, int N) { @@ -127,6 +135,7 @@ GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms workspace.b[i+N] = -1.0; } + GLOBAL void Init_Map_Serials (int *input, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -135,6 +144,7 @@ GLOBAL void Init_Map_Serials (int *input, int N) input[i] = -1; } + void Cuda_Init_Workspace_System (reax_system *system, static_storage *workspace ) { int blocks, block_size = BLOCK_SIZE; @@ -262,6 +272,7 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control, Cuda_Reset_Workspace( system, workspace ); } + void Cuda_Init_Workspace_Device ( static_storage *workspace ) { workspace->realloc.estimate_nbrs = -1; @@ -273,6 +284,7 @@ void Cuda_Init_Workspace_Device ( static_storage *workspace ) workspace->realloc.gcell_atoms = -1; } + void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N) { cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX ); @@ -284,7 +296,8 @@ void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N) } -void Cuda_Init_Scratch () + +void Cuda_Init_Scratch() { cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH ); diff --git a/PuReMD-GPU/src/cuda_init.h b/PuReMD-GPU/src/cuda_init.h index 233761691fe56bc8b1290c8972c4b413cc876f54..cd9c568130730d298fbaf781f4f6b18b0f02f65a 100644 --- a/PuReMD-GPU/src/cuda_init.h +++ b/PuReMD-GPU/src/cuda_init.h @@ -18,22 
+18,31 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ - #ifndef __CUDA_INIT_H__ #define __CUDA_INIT_H__ #include "mytypes.h" -void Cuda_Init_System ( reax_system* ); -void Cuda_Init_Simulation_Data (simulation_data *); -void Cuda_Init_Workspace_System ( reax_system *, static_storage *); -void Cuda_Init_Workspace ( reax_system *, control_params *, static_storage *); -void Cuda_Init_Workspace_Device ( static_storage *); -void Cuda_Init_Control (control_params *); -void Cuda_Init_Grid (grid *, grid *); -void Cuda_Init_Sparse_Matrix (sparse_matrix *, int, int); +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Init_System( reax_system* ); +void Cuda_Init_Simulation_Data( simulation_data * ); +void Cuda_Init_Workspace_System( reax_system *, static_storage * ); +void Cuda_Init_Workspace( reax_system *, control_params *, static_storage * ); +void Cuda_Init_Workspace_Device( static_storage * ); +void Cuda_Init_Control( control_params * ); +void Cuda_Init_Grid( grid *, grid * ); + +void Cuda_Init_Sparse_Matrix( sparse_matrix *, int, int ); + +void Cuda_Init_Scratch( ); + +#ifdef __cplusplus +} +#endif -void Cuda_Init_Scratch (); #endif diff --git a/PuReMD-GPU/src/cuda_init_md.cu b/PuReMD-GPU/src/cuda_init_md.cu new file mode 100644 index 0000000000000000000000000000000000000000..1a205506e4c5ff767e02398a3859f838818c1e1a --- /dev/null +++ b/PuReMD-GPU/src/cuda_init_md.cu @@ -0,0 +1,586 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "cuda_init_md.h" + +#include "allocate.h" +#include "box.h" +#include "forces.h" +#include "grid.h" +#include "index_utils.h" +#include "init_md.h" +#include "integrate.h" +#include "lookup.h" +#include "print_utils.h" +#include "reset_utils.h" +#include "system_props.h" +#include "traj.h" +#include "vector.h" + +#include "cuda_allocate.h" +#include "cuda_utils.h" +#include "cuda_init.h" +#include "cuda_copy.h" +#include "cuda_box.h" +#include "cuda_forces.h" +#include "cuda_grid.h" +#include "cuda_integrate.h" +#include "cuda_lin_alg.h" +#include "cuda_list.h" +#include "cuda_lookup.h" +#include "cuda_neighbors.h" +#include "cuda_reduction.h" +#include "cuda_reset_utils.h" +#include "cuda_system_props.h" +#include "validation.h" + + +void Cuda_Init_System( reax_system *system, control_params *control, + simulation_data *data ) +{ + int i; + rvec dx; + + if( !control->restart ) + { + Cuda_Reset_Atoms( system ); + } + + Cuda_Compute_Total_Mass( system, data ); + + Cuda_Compute_Center_of_Mass( system, data, stderr ); + + /* reposition atoms */ + // just fit the atoms to the periodic box + if( control->reposition_atoms == 0 ) + { + rvec_MakeZero( dx ); + } + // put the center of mass to the center of the box + else if( control->reposition_atoms == 1 ) + { + rvec_Scale( dx, 0.5, system->box.box_norms ); + rvec_ScaledAdd( dx, -1., data->xcm ); + } + // put the center of mass to the origin + else if( control->reposition_atoms == 2 ) + { + rvec_Scale( dx, -1., data->xcm ); + } + else + { + fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" ); + exit( UNKNOWN_OPTION ); + } + + k_compute_Inc_on_T3<<<BLOCKS_POW_2, BLOCK_SIZE>>> + (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]); + cudaThreadSynchronize( ); + cudaCheckError( ); + + //copy back the atoms from device to the host + copy_host_device( system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , + cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); + + /* Initialize velocities so that desired init T can be attained */ + if( !control->restart || (control->restart && control->random_vel) ) { + Generate_Initial_Velocities( system, control->T_init ); + } + + Setup_Grid( system ); +} + + +void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, + simulation_data *data, output_controls *out_control, + evolve_function *Evolve ) +{ + + Reset_Simulation_Data( data ); + + if( !control->restart ) + data->step = data->prev_steps = 0; + + switch( control->ensemble ) { + case NVE: + data->N_f = 3 * system->N; + *Evolve = Cuda_Velocity_Verlet_NVE; + break; + + + case NVT: + data->N_f = 3 * system->N + 1; + //control->Tau_T = 100 * data->N_f * K_B * control->T_final; + if( !control->restart || (control->restart && control->random_vel) ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->therm.v_xi_old = 0; + data->therm.xi = 0; +#if defined(DEBUG_FOCUS) + fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", + data->therm.G_xi, control->Tau_T, data->E_Kin, + data->N_f, data->therm.v_xi ); +#endif + } + + *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein; + break; + + + case NPT: // Anisotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! 
TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 9; + if( !control->restart ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->iso_bar.eps = 0.33333 * log(system->box.volume); + //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P)); + //Compute_Pressure( system, data, workspace ); + } + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + + case sNPT: // Semi-Isotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 4; + *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; + break; + + + case iNPT: // Isotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 2; + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + case bNVT: //berendensen NVT + data->N_f = 3 * system->N + 1; + *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT; + break; + + default: + break; + } + + Cuda_Compute_Kinetic_Energy( system, data ); + +#ifdef __BUILD_DEBUG__ + real t_E_Kin = 0; + t_E_Kin = data->E_Kin; +#endif + + copy_host_device( &data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + if( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! + data->therm.T = ALMOST_ZERO; + +#ifdef __BUILD_DEBUG__ + if (check_zero( t_E_Kin, data->E_Kin)){ + fprintf( stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin ); + exit( 1 ); + } + //validate_data ( system, data ); +#endif + + /* init timing info for the host*/ + data->timing.start = Get_Time( ); + data->timing.total = data->timing.start; + data->timing.nbrs = 0; + data->timing.init_forces = 0; + data->timing.bonded = 0; + data->timing.nonb = 0; + data->timing.QEq = 0; + data->timing.matvecs = 0; + + /* init timing info for the device */ + d_timing.start = Get_Time( ); + d_timing.total = data->timing.start; + d_timing.nbrs = 0; + d_timing.init_forces = 0; + d_timing.bonded = 0; + d_timing.nonb = 0; + d_timing.QEq = 0; + d_timing.matvecs = 0; +} + + +int Estimate_Device_Matrix( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int *indices, *Htop; + list *far_nbrs = dev_lists + FAR_NBRS; + int max_sparse_entries = 0; + real t1, t2; + + indices = (int *) scratch; + cuda_memset( indices, 0, INT_SIZE * system->N, RES_SCRATCH ); + + t1 = Get_Time( ); + + k_Estimate_Sparse_Matrix_Entries<<<BLOCKS, BLOCK_SIZE>>> + ( system->d_atoms, (control_params *)control->d_control, + (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, + *far_nbrs, system->N, indices ); + cudaThreadSynchronize( ); + cudaCheckError( ); + + t2 = Get_Timing_Info( t1 ); + + //fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 ); + + Htop = (int *) malloc( INT_SIZE * (system->N + 1) ); + memset( Htop, 0, INT_SIZE * (system->N + 1) ); + copy_host_device( Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); + + for (int i = 0; i < system->N; i++) + { + if (max_sparse_entries < Htop[i]) { + max_sparse_entries = Htop[i]; + } + } + +#ifdef __DEBUG_CUDA__ + fprintf( stderr, + " Max sparse entries for this run are 
---> %d \n", max_sparse_entries ); +#endif + + return max_sparse_entries * SAFE_ZONE; + //return max_sparse_entries; +} + + +void Allocate_Device_Matrix (reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + + //Allocate space for the sparse Matrix entries here. + system->max_sparse_matrix_entries = + Estimate_Device_Matrix( system, control, data, workspace, lists, out_control ); + dev_workspace->H.n = system->N ; + dev_workspace->H.m = system->N * system->max_sparse_matrix_entries; + Cuda_Init_Sparse_Matrix( &dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N ); + +#ifdef __CUDA_MEM__ + fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", + system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) ); +#endif +} + + +void Cuda_Init_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; + int *hb_top, *bond_top; + + real t_start, t_elapsed; + + grid *g = &( system->g ); + int *d_indices = (int *) scratch; + int total = g->ncell[0] * g->ncell[1] * g->ncell[2]; + + cuda_memset( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); + +#ifdef __BUILD_DEBUG__ + for (int i = 0; i < g->max_nbrs; i ++) + { + if ((g->nbrs[i][0] >= g->ncell[0]) || + (g->nbrs[i][1] >= g->ncell[1]) || + (g->nbrs[i][2] >= g->ncell[2]) ) + { + fprintf( stderr, " Grid Incorrectly built.... \n" ); + exit( 1 ); + } + + } +#endif + + dim3 blockspergrid( system->g.ncell[0], system->g.ncell[1], system->g.ncell[2] ); + dim3 threadsperblock( system->g.max_atoms ); + +#ifdef __BUILD_DEBUG__ + fprintf( stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2] ); + fprintf( stderr, "Estimate Num Neighbors with threads per block as %d \n", system->d_g.max_atoms ); + fprintf( stderr, "Max nbrs %d \n", system->d_g.max_nbrs ); +#endif + + //First Bin atoms and they sync the host and the device for the grid. + //This will copy the atoms from host to device. + Cuda_Bin_Atoms( system, workspace ); + Sync_Host_Device_Grid( &system->g, &system->d_g, cudaMemcpyHostToDevice ); + + k_Estimate_NumNeighbors<<<blockspergrid, threadsperblock >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, d_indices); + cudaThreadSynchronize( ); + cudaCheckError( ); + + int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); + memset( nbrs_indices , 0, INT_SIZE * (system->N + 1) ); + + nbrs_indices [0] = 0; + copy_host_device( &nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); + + for (int i = 1; i <= system->N; i++) + { + nbrs_indices [i] += nbrs_indices [i-1]; + } + + num_nbrs = nbrs_indices [system->N] ; + system->num_nbrs = num_nbrs; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]); + fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs); +#endif + + list *far_nbrs = (dev_lists + FAR_NBRS); + if( !Cuda_Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs) ) { + fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n");
+        exit( INIT_ERR );
+    }
+
+#ifdef __CUDA_MEM__
+    fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n",
+            num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
+#endif
+
+    copy_host_device( nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
+    copy_host_device( nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
+    Cuda_Generate_Neighbor_Lists( system, workspace, control, FALSE );
+
+#ifdef __BUILD_DEBUG__
+    int *end = (int *) malloc( sizeof(int) * system->N );
+    int *start = (int *) malloc( sizeof(int) * system->N );
+
+    copy_host_device( start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0 );
+    copy_host_device( end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0 );
+
+    far_neighbor_data *far_data = (far_neighbor_data *)
+        malloc( FAR_NEIGHBOR_SIZE * num_nbrs );
+    copy_host_device( far_data, far_nbrs->select.far_nbr_list,
+            FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0 );
+
+    compare_far_neighbors( nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N );
+
+    free( start );
+    free( end );
+    free( far_data );
+#endif
+
+    /* Htop, num_3body, then hb_top[N] and bond_top[N] */
+    int *output, size;
+    size = INT_SIZE * (2 * system->N + 2);
+    output = (int *) malloc( size );
+    Cuda_Estimate_Storage_Sizes( system, control, output );
+
+    Htop = output[0];
+    num_3body = output[1];
+    hb_top = &output[ 2 ];
+    bond_top = &output[ 2 + system->N ];
+
+#ifdef __DEBUG_CUDA__
+    int max_hbonds = 0;
+    int min_hbonds = 1000;
+    int max_bonds = 0;
+    int min_bonds = 1000;
+
+    for (int i = 0; i < system->N; i++)
+    {
+        if (max_hbonds < hb_top[i])
+        {
+            max_hbonds = hb_top[i];
+        }
+        if (min_hbonds > hb_top[i])
+        {
+            min_hbonds = hb_top[i];
+        }
+
+        if (max_bonds < bond_top[i])
+        {
+            max_bonds = bond_top[i];
+        }
+        if (min_bonds > bond_top[i])
+        {
+            min_bonds = bond_top[i];
+        }
+    }
+
+    fprintf( stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds );
+    fprintf( stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds );
+    fprintf( stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body );
+#endif
+
+    Allocate_Device_Matrix( system, control, data, workspace, lists, out_control );
+
+    dev_workspace->num_H = 0;
+
+    if( control->hb_cut > 0 )
+    {
+        int *hbond_index = (int *) malloc( INT_SIZE * system->N );
+
+        // init hydrogen atom indices
+        num_hbonds = 0;
+        for( i = 0; i < system->N; ++i )
+        {
+            if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ||
+                    system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2 ) // H atom
+            {
+                //hbond_index[i] = workspace->num_H++;
+                hbond_index[i] = num_hbonds++;
+            }
+            else
+            {
+                hbond_index[i] = -1;
+            }
+        }
+
+        copy_host_device( hbond_index, dev_workspace->hbond_index,
+                system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX );
+        dev_workspace->num_H = num_hbonds;
+
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Device num_H --> %d \n", dev_workspace->num_H );
+#endif
+
+        Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index,
+                hb_top, (dev_lists + HBONDS) );
+        num_hbonds = hb_top[system->N-1];
+        system->num_hbonds = num_hbonds;
+
+#ifdef __CUDA_MEM__
+        fprintf( stderr, "Device memory allocated: Hydrogen bonds list: %ld (MB) \n",
+                sizeof(hbond_data) * num_hbonds / (1024*1024) );
+#endif
+
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Device total number of HBonds --> %d \n", num_hbonds );
+#endif
+
+        free( hbond_index );
+    }
+
+    // bonds list
+    Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists + BONDS );
+    num_bonds = 
bond_top[system->N-1];
+    system->num_bonds = num_bonds;
+
+#ifdef __CUDA_MEM__
+    fprintf( stderr, "Device memory allocated: Bonds list: %ld (MB) \n",
+            sizeof(bond_data) * num_bonds / (1024*1024) );
+#endif
+
+#ifdef __DEBUG_CUDA__
+    fprintf( stderr, "Device total bonds --> %d \n", num_bonds );
+#endif
+
+    // system->max_thb_intrs = num_3body;
+    // 3bodies list
+    //if(!Cuda_Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES)) {
+    //    fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+    //    exit( INIT_ERR );
+    //}
+
+    //fprintf( stderr, "***memory allocated: three_body = %ldMB\n",
+    //        num_bonds * MAX_THREE_BODIES * sizeof(three_body_interaction_data) / (1024*1024) );
+    //fprintf( stderr, "size of (three_body_interaction_data): %d \n", sizeof(three_body_interaction_data) );
+
+    free( output );
+    free( nbrs_indices );
+}
+
+
+void Cuda_Initialize( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list **lists,
+        output_controls *out_control, evolve_function *Evolve )
+{
+    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->N );
+    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
+
+    //MATVEC_BLOCKS = system->N;
+    //MATVEC_BLOCK_SIZE = 32;
+
+    /* ceiling division: one extra block when the row threads do not
+     * evenly fill the last block */
+    MATVEC_BLOCKS = (system->N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) +
+        (((system->N * MATVEC_THREADS_PER_ROW) % MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
+
+#ifdef __DEBUG_CUDA__
+    fprintf( stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
+    fprintf( stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE );
+    fprintf( stderr, " Size of far neighbor data %d \n", (int) sizeof(far_neighbor_data) );
+    fprintf( stderr, " Size of reax_atom %d \n", (int) sizeof(reax_atom) );
+    fprintf( stderr, " Size of sparse matrix entry %d \n", (int) sizeof(sparse_matrix_entry) );
+    fprintf( stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system->N );
+#endif
+
+    Randomize( );
+
+    Cuda_Init_Scratch( );
+
+    //System
+    Cuda_Init_System( system );
+    Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice );
+    Cuda_Init_System( system, control, data );
+
+    //Simulation Data
+    copy_host_device( system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N,
+            cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
+    Cuda_Init_Simulation_Data( data );
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
+    Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
+    Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
+
+    //Static storage
+    Cuda_Init_Workspace_System( system, dev_workspace );
+    Cuda_Init_Workspace( system, control, dev_workspace );
+    Cuda_Init_Workspace_Device( workspace );
+
+    //Control
+    Cuda_Init_Control( control );
+
+    //Grid
+    Cuda_Init_Grid( &system->g, &system->d_g );
+
+    //Lists
+    Cuda_Init_Lists( system, control, data, workspace, lists, out_control );
+
+    Init_Out_Controls( system, control, workspace, out_control );
+
+    if( control->tabulate )
+    {
+        real start, end;
+        start = Get_Time( );
+        Make_LR_Lookup_Table( system, control );
+        copy_LR_table_to_device( system, control );
+        end = Get_Timing_Info( start );
+
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Done copying the LR table to the device ---> %f \n", end );
+#endif
+    }
+}
diff --git a/PuReMD-GPU/src/cuda_init_md.h b/PuReMD-GPU/src/cuda_init_md.h
new file mode 100644
index 
0000000000000000000000000000000000000000..0a6544b017d31ca0a82651840682c3aae49d1b11 --- /dev/null +++ b/PuReMD-GPU/src/cuda_init_md.h @@ -0,0 +1,40 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_INIT_MD_H_ +#define __CUDA_INIT_MD_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Initialize( reax_system*, control_params*, simulation_data*, + static_storage*, list**, output_controls*, evolve_function* ); + +#ifdef __cplusplus +} +#endif + + +#endif + diff --git a/PuReMD-GPU/src/cuda_integrate.cu b/PuReMD-GPU/src/cuda_integrate.cu new file mode 100644 index 0000000000000000000000000000000000000000..cba0b79c39b4f9b66e5b506d11dcffb81adc488d --- /dev/null +++ b/PuReMD-GPU/src/cuda_integrate.cu @@ -0,0 +1,517 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "cuda_integrate.h" + +#include "allocate.h" +#include "box.h" +#include "forces.h" +#include "grid.h" +#include "print_utils.h" +#include "reset_utils.h" +#include "system_props.h" +#include "vector.h" +#include "list.h" + +#include "cuda_utils.h" +#include "cuda_reduction.h" +#include "cuda_allocate.h" +#include "cuda_forces.h" +#include "cuda_grid.h" +#include "cuda_neighbors.h" +#include "cuda_QEq.h" +#include "cuda_reset_utils.h" +#include "cuda_system_props.h" +#include "validation.h" + + +GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, + single_body_parameters *sbp, + simulation_box *box, + int N, real dt) +{ + real inv_m, dt_sqr; + rvec dx; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + dt_sqr = SQR(dt); + //for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + + rvec_ScaledSum( dx, dt, atoms[i].v, + 0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f ); + Inc_on_T3( atoms[i].x, dx, box ); + + rvec_ScaledAdd( atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); + //} +} + + +GLOBAL void Cuda_Velocity_Verlet_NVE_atoms2 (reax_atom *atoms, single_body_parameters *sbp, int N, real dt) +{ + real inv_m; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + //for( i = 0; i < system->N; i++ ) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + rvec_ScaledAdd( atoms[i].v, + 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); + //} +} + + +void Cuda_Velocity_Verlet_NVE(reax_system* system, control_params* control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, steps, renbr; + real inv_m, dt, dt_sqr; + rvec dx; + int blocks, block_size; + + dt = control->dt; + dt_sqr = SQR(dt); + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "step%d: ", data->step ); +#endif + + compute_blocks (&blocks, &block_size, system->N); + Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>> + (system->d_atoms, system->reaxprm.d_sbp, + (simulation_box *)system->d_box, system->N, dt); + cudaThreadSynchronize (); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "verlet1 - "); +#endif + + Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); + Cuda_Reset( system, control, data, workspace, lists ); + + if( renbr ) { + Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, TRUE); + } + + Cuda_Compute_Forces( system, control, data, workspace, lists, out_control ); + + Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>> + (system->d_atoms, system->reaxprm.d_sbp, system->N, dt); + cudaThreadSynchronize (); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "verlet2\n"); +#endif +} + + +GLOBAL void Compute_X_t_dt (real dt, real dt_sqr, thermostat p_therm, + reax_atom *atoms, single_body_parameters *sbp, + simulation_box *box, + static_storage p_workspace, int N) +{ + + real inv_m; + rvec dx; + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= N) return; + + static_storage *workspace = &p_workspace; + thermostat *therm = &p_therm; + + /* Compute x(t + dt) and copy old forces */ + //for (i=0; i < system->N; i++) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + + rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v, + 0.5 * dt_sqr * inv_m * -F_CONV, atoms[i].f ); + + Inc_on_T3( atoms[i].x, dx, box ); + + rvec_Copy( workspace->f_old[i], atoms[i].f ); + //} + +} + + +GLOBAL void Update_Velocity 
(reax_atom *atoms, single_body_parameters *sbp, + static_storage p_workspace, real dt, thermostat p_therm, + int N) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + real inv_m; + static_storage *workspace = &p_workspace; + thermostat *therm = &p_therm; + + //for( i = 0; i < system->N; ++i ) { + inv_m = 1.0 / sbp[atoms[i].type].mass; + + rvec_Scale( workspace->v_const[i], + 1.0 - 0.5 * dt * therm->v_xi, atoms[i].v ); + rvec_ScaledAdd( workspace->v_const[i], + 0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] ); + rvec_ScaledAdd( workspace->v_const[i], + 0.5 * dt * inv_m * -F_CONV, atoms[i].f ); + //} +} + + +GLOBAL void E_Kin_Reduction (reax_atom *atoms, static_storage p_workspace, + single_body_parameters *sbp, + real *per_block_results, real coef_v, const size_t n) +{ + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; + static_storage *workspace = &p_workspace; + + if(i < n) + { + rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] ); + x = ( 0.5 * sbp[atoms[i].type].mass * + rvec_Dot( atoms[i].v, atoms[i].v ) ); + } + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + + __syncthreads(); + } + + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } +} + + +void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) +{ + int i, itr, steps, renbr; + real inv_m, coef_v, dt, dt_sqr; + real E_kin_new, G_xi_new, v_xi_new, v_xi_old; + rvec dx; + thermostat *therm; + + real *results = (real *)scratch; + + dt = control->dt; + dt_sqr = SQR( dt ); + therm = &( data->therm ); + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old); +#endif + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "step%d: ", data->step ); +#endif + + Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>> + (dt, dt_sqr, data->therm, system->d_atoms, + system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* Compute xi(t + dt) */ + therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); +#if defined(DEBUG_FOCUS) + fprintf( stderr, "verlet1 - " ); +#endif + + Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); + Cuda_Reset( system, control, data, workspace, lists ); + + if( renbr ) + { + //generate_neighbor_lists here + Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, TRUE); + } + + /* Calculate Forces at time (t + dt) */ + Cuda_Compute_Forces( system,control,data, workspace, lists, out_control ); + + /* Compute iteration constants for each atom's velocity */ + Update_Velocity <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace, + dt, *therm, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; + E_kin_new = G_xi_new = v_xi_old = 0; + itr = 0; + do { + itr++; + + /* new values become old in this iteration */ + v_xi_old = v_xi_new; + coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old); + E_kin_new = 0; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Device: coef to update velocity --> 
%6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old); +#endif + + /*reduction for the E_Kin_new here*/ + cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH ); + E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, + results, coef_v, system->N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + (results, results + BLOCKS_POW_2, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); + + G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - + data->N_f * K_B * control->T ); + v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); + +#if defined(DEBUG) + fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", + itr, G_xi_new, v_xi_new, v_xi_old ); +#endif + } + while( fabs(v_xi_new - v_xi_old ) > 1e-5 ); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Iteration Count in NVE --> %d \n", itr ); +#endif + + therm->v_xi_old = therm->v_xi; + therm->v_xi = v_xi_new; + therm->G_xi = G_xi_new; + +#if defined(DEBUG_FOCUS) + fprintf( stderr,"vel scale\n" ); +#endif +} + + +GLOBAL void ker_update_velocity_1 (reax_atom *atoms, + single_body_parameters *sbp, + real dt, + simulation_box *box, + int N) +{ + real inv_m; + rvec dx; + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + /* velocity verlet, 1st part */ + //for( i = 0; i < system->n; i++ ) { + atom = &(atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute x(t + dt) */ + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + + /* Metin's suggestion to rebox the atoms */ + /* bNVT fix */ + Inc_on_T3( atoms[i].x, dx, box ); + /* bNVT fix */ + + /* Compute v(t + dt/2) */ + rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); + //} +} + + +void bNVT_update_velocity_part1 (reax_system *system, simulation_box *box, real dt) +{ + ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>> + (system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N); + cudaThreadSynchronize (); + cudaCheckError (); +} + + +GLOBAL void ker_update_velocity_2 (reax_atom *atoms, + single_body_parameters *sbp, + real dt, + int N) +{ + reax_atom *atom; + real inv_m; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + /* velocity verlet, 2nd part */ + //for( i = 0; i < system->n; i++ ) { + atom = &(atoms[i]); + inv_m = 1.0 / sbp[atom->type].mass; + /* Compute v(t + dt) */ + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); + //} +} + + +void bNVT_update_velocity_part2 (reax_system *system, real dt) +{ + ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, system->reaxprm.d_sbp, dt, system->N); + cudaThreadSynchronize (); + cudaCheckError (); +} + + +GLOBAL void ker_scale_velocities (reax_atom *atoms, real lambda, int N) +{ + reax_atom *atom; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + /* Scale velocities and positions at t+dt */ + //for( i = 0; i < system->n; ++i ) { + atom = &(atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); + //} +} + + +void bNVT_scale_velocities (reax_system *system, real lambda) +{ + ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, lambda, system->N); + cudaThreadSynchronize (); + cudaCheckError (); +} + + +void Cuda_Velocity_Verlet_Berendsen_NVT( 
reax_system* system, + control_params* control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control + ) +{ + int i, steps, renbr; + real inv_m, dt, lambda; + rvec dx; + reax_atom *atom; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "step%d\n", data->step ); +#endif + dt = control->dt; + steps = data->step - data->prev_steps; + renbr = (steps % control->reneighbor == 0); + + /* velocity verlet, 1st part + for( i = 0; i < system->N; i++ ) { + atom = &(system->atoms[i]); + inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; + // Compute x(t + dt) + rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); + rvec_Add( atom->x, dx ); + // Compute v(t + dt/2) + rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); + } + */ + bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt); + +#if defined(DEBUG_FOCUS) + fprintf(stderr, "step%d: verlet1 done\n", data->step); +#endif + + Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); + Cuda_Reset( system, control, data, workspace, lists ); + + if( renbr ) { + Cuda_Generate_Neighbor_Lists( system, workspace, control, TRUE); + } + + Cuda_Compute_Forces( system, control, data, workspace, + lists, out_control ); + + /* velocity verlet, 2nd part + for( i = 0; i < system->N; i++ ) { + atom = &(system->atoms[i]); + inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; + // Compute v(t + dt) + rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); + } + */ + bNVT_update_velocity_part2 (system, dt); +#if defined(DEBUG_FOCUS) + fprintf(stderr, "step%d: verlet2 done\n", data->step); +#endif + + /* temperature scaler */ + Cuda_Compute_Kinetic_Energy( system, data ); + //get the latest temperature from the device to the host. + copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm, + sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); + + lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); + if( lambda < MIN_dT ) + lambda = MIN_dT; + else if (lambda > MAX_dT ) + lambda = MAX_dT; + lambda = SQRT( lambda ); + + //fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda); + + /* Scale velocities and positions at t+dt + for( i = 0; i < system->N; ++i ) { + atom = &(system->atoms[i]); + rvec_Scale( atom->v, lambda, atom->v ); + } + */ + bNVT_scale_velocities (system, lambda); + Cuda_Compute_Kinetic_Energy( system, data ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "step%d: scaled velocities\n", + data->step ); +#endif + +} + + diff --git a/PuReMD-GPU/src/cuda_integrate.h b/PuReMD-GPU/src/cuda_integrate.h new file mode 100644 index 0000000000000000000000000000000000000000..959b6684fef618b86252a5606b6feb4a6d093f15 --- /dev/null +++ b/PuReMD-GPU/src/cuda_integrate.h @@ -0,0 +1,46 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_INTEGRATE_H_ +#define __CUDA_INTEGRATE_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*, + static_storage*, list**, output_controls* ); +void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*, + simulation_data*, static_storage*, + list**, output_controls* ); +void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* , + simulation_data *, static_storage *, + list **, output_controls * ); + +#ifdef __cplusplus +} +#endif + + +#endif + diff --git a/PuReMD-GPU/src/cuda_lin_alg.cu b/PuReMD-GPU/src/cuda_lin_alg.cu new file mode 100644 index 0000000000000000000000000000000000000000..5dc9eb3518ed8b46174838a498cfdba62d34e989 --- /dev/null +++ b/PuReMD-GPU/src/cuda_lin_alg.cu @@ -0,0 +1,589 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_lin_alg.h" + +#include "list.h" +#include "vector.h" +#include "index_utils.h" + +#include "cuda_copy.h" +#include "cuda_utils.h" +#include "cuda_reduction.h" +#include "system_props.h" + +#include "cublas_v2.h" +#include "cusparse_v2.h" + + +//one thread per row +GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows) +{ + real results_row = 0; + int col; + real val; + + int i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= rows) return; + + for (int c = H.start[i]; c < H.end[i]; c++) + { + col = H.entries [c].j; + val = H.entries[c].val; + + results_row += val * vec [col]; + } + + results [i] = results_row; +} + + +//32 thread warp per matrix row. 
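+// (Annotation: the warp-per-row kernel below strides through each row's
+// entries in chunks of 32 and folds the partial products with a shared-memory
+// tree reduction. The scalar per-row computation it parallelizes is sketched
+// here for reference; this is an illustrative sketch only, kept out of the
+// build, using the same sparse_matrix fields as Cuda_Matvec above.)
+#if 0
+static void matvec_row_serial( const sparse_matrix *H, const real *vec,
+        real *results, int row )
+{
+    real sum = 0.0;
+
+    /* accumulate H(row, j) * vec[j] over the stored entries of this row */
+    for ( int jj = H->start[row]; jj < H->end[row]; ++jj )
+    {
+        sum += H->entries[jj].val * vec[ H->entries[jj].j ];
+    }
+
+    results[row] = sum;
+}
+#endif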
+//invoked as follows +// <<< system->N, 32 >>> +GLOBAL void Cuda_Matvec_csr (sparse_matrix H, real *vec, real *results, int num_rows) +{ + extern __shared__ real vals []; + int thread_id = blockDim.x * blockIdx.x + threadIdx.x; + int warp_id = thread_id / 32; + int lane = thread_id & (32 - 1); + + int row_start; + int row_end; + + // one warp per row + //int row = warp_id; + int row = warp_id; + //if (row < num_rows) + { + vals[threadIdx.x] = 0; + + if (row < num_rows) { + row_start = H.start[row]; + row_end = H.end[row]; + + // compute running sum per thread + for(int jj = row_start + lane; jj < row_end; jj += 32) + vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ]; + //vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ]; + } + + __syncthreads (); + + // parallel reduction in shared memory + //SIMD instructions with a WARP are synchronous -- so we do not need to synch here + if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads(); + if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads (); + if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads (); + if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads (); + if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads (); + + // first thread writes the result + if (lane == 0 && row < num_rows) + results[row] = vals[threadIdx.x]; + } +} + + +GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= entries) return; + + b_proc [i] = b[i] * Hdia_inv[i]; +} + + +GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output) +{ + real tmp1, tmp2, cc; + + for( int i = 0; i <= j; i++ ) { + if( i == j ) { + cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) ); + hc[j] = h[ index_wkspace_res (j,j) ] / cc; + hs[j] = h[ index_wkspace_res (j+1,j) ] / cc; + } + + tmp1 = hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ]; + tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ]; + + h[ index_wkspace_res (i,j) ] = tmp1; + h[ index_wkspace_res (i+1,j) ] = tmp2; + } + + /* apply Givens rotations to the rhs as well */ + tmp1 = hc[j] * g_j; + tmp2 = -hs[j] * g_j; + + output[0] = tmp1; + output[1] = tmp2; +} + + +GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y) +{ + real temp; + for( int i = j-1; i >= 0; i-- ) { + temp = g[i]; + for( int k = j-1; k > i; k-- ) + temp -= h[ index_wkspace_res (i,k) ] * y[k]; + + y[i] = temp / h[ index_wkspace_res (i,i) ]; + } +} + + +int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x ) +{ + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + real v_add_tmp; + sparse_matrix *H = &workspace->H; + real t_start, t_elapsed; + real *spad = (real *)scratch; + real *g = (real *) calloc ((RESTART+1), REAL_SIZE); + + N = H->n; + + cuda_memset(spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (b, spad, H->n, INITIAL); + cudaThreadSynchronize(); + cudaCheckError(); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, + cudaMemcpyDeviceToHost, __LINE__); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Norm of the array is %e \n", bnorm ); +#endif + + 
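+    /* (Annotation: the two Cuda_Norm launches above form a standard two-pass
+     * reduction: pass one writes one partial result per block into spad, and
+     * pass two, a single block of BLOCKS_POW_2 threads, combines the partials
+     * into the final value at spad + BLOCKS_POW_2. Assuming Cuda_Norm mirrors
+     * the host-side Norm (a Euclidean 2-norm), the serial equivalent is the
+     * sketch below -- illustrative only, and it would need b on the host.) */
+#if 0
+    bnorm = 0.0;
+    for ( i = 0; i < N; ++i )
+    {
+        bnorm += b[i] * b[i];
+    }
+    bnorm = SQRT( bnorm );
+#endif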
/* apply the diagonal pre-conditioner to rhs */ + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (workspace->b_prc, b, workspace->Hdia_inv, N); + cudaThreadSynchronize(); + cudaCheckError(); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + //Sparse_MatVec( H, x, workspace->b_prm ); + Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, x, workspace->b_prm, N ); + cudaThreadSynchronize(); + cudaCheckError(); + + GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> + (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); + cudaThreadSynchronize(); + cudaCheckError(); + + Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> + (&workspace->v[ index_wkspace_sys (0,0,N) ], 1., + workspace->b_prc, -1., workspace->b_prm, N); + cudaThreadSynchronize(); + cudaCheckError (); + + //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N ); + { + cuda_memset( spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); + cudaThreadSynchronize(); + cudaCheckError(); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device( g, workspace->g, REAL_SIZE, + cudaMemcpyDeviceToHost, RES_STORAGE_G); + } + + Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], + &workspace->v[index_wkspace_sys(0,0,N)], N ); + cudaThreadSynchronize(); + cudaCheckError(); + + /* GMRES inner-loop */ +#ifdef __DEBUG_CUDA__ + fprintf( stderr, + " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, + tol, g[0] ); +#endif + + for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { + /* matvec */ + //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] ); + Cuda_Matvec_csr<<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], + &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); + cudaThreadSynchronize(); + cudaCheckError(); + + GMRES_Diagonal_Preconditioner<<<BLOCKS, BLOCK_SIZE>>> + (&workspace->v[ index_wkspace_sys (j+1,0,N) ], + &workspace->v[ index_wkspace_sys( j+1,0,N) ], + workspace->Hdia_inv, N ); + cudaThreadSynchronize(); + cudaCheckError(); + + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i <= j; i++ ) + { + Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v[index_wkspace_sys(i,0,N)], + &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); + cudaThreadSynchronize(); + cudaCheckError(); + + Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + Cuda_Vector_Add<<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize(); + cudaCheckError(); + } + + //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + cuda_memset(spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); + + Cuda_Norm<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + 
(&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); + cudaThreadSynchronize(); + cudaCheckError(); + + Cuda_Norm<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> + (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize(); + cudaCheckError(); + + copy_host_device(&v_add_tmp, + &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); + cudaThreadSynchronize(); + cudaCheckError(); + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + GMRES_Givens_Rotation<<<1, 1>>> + (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); + cudaThreadSynchronize(); + cudaCheckError(); + copy_host_device(&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + } + + copy_host_device(g, workspace->g, (RESTART+1)*REAL_SIZE, + cudaMemcpyHostToDevice, __LINE__); + + /* solve Hy = g. + H is now upper-triangular, do back-substitution */ + copy_host_device(g, spad, (RESTART+1) * REAL_SIZE, + cudaMemcpyHostToDevice, RES_STORAGE_G); + GMRES_BackSubstitution<<<1, 1>>> + (j, spad, workspace->h, workspace->y); + cudaThreadSynchronize(); + cudaCheckError(); + + /* update x = x_0 + Vy */ + for( i = 0; i < j; i++ ) + { + copy_host_device(&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> + ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError(); + } + + /* stopping condition */ + if( fabs(g[j]) / bnorm <= tol ) + { + break; + } + } + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + return itr * (RESTART+1) + j + 1; + } + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); +#endif + + return itr * (RESTART+1) + j + 1; +} + + +int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real tol, real *x ) +{ + + real CSR_ALPHA = 1, CSR_BETA = 0; + + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + real v_add_tmp; + sparse_matrix *H = &workspace->H; + real t_start, t_elapsed; + real *spad = (real *)scratch; + real *g = (real *) calloc ((RESTART+1), REAL_SIZE); + cublasHandle_t cublasHandle; + + N = H->n; + + cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + + /* + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + */ + + cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm )); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Norm of the array is %e \n", bnorm ); +#endif + + /* apply the diagonal pre-conditioner to rhs */ + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (workspace->b_prc, b, workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + //Sparse_MatVec( H, x, workspace->b_prm ); + Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N ); + cudaThreadSynchronize (); + cudaCheckError (); + 
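+        /* (Annotation: cublasHandle is declared above but never passed to
+         * cublasCreate(), yet cublasDnrm2 has already been invoked with it;
+         * every cuBLAS v2 call requires an initialized handle. A minimal
+         * sketch of the missing setup/teardown follows.) */
+#if 0
+        cublasCheckError( cublasCreate( &cublasHandle ) );   /* before the first cuBLAS call */
+        /* ... cublasDnrm2 / cublasDaxpy / cublasDscal / cublasDdot ... */
+        cublasCheckError( cublasDestroy( cublasHandle ) );   /* after the last cuBLAS call */
+#endif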
+ GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>> + (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + /* + Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>> + (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); + cudaThreadSynchronize (); + cudaCheckError (); + */ + cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V); + + double D_ONE = 1.; + double D_MINUS_ONE = -1.; + cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); + cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); + + //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N ); + { + /* + cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G); + */ + + cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g )); + copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); + } + + /* + Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + double D_SCALE = 1.0 / g[0]; + cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1)); + + + /* GMRES inner-loop */ +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] ); +#endif + for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) { + /* matvec */ + Cuda_Matvec_csr + <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> + ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N ); + cudaThreadSynchronize (); + cudaCheckError (); + + GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>> + (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N); + cudaThreadSynchronize (); + cudaCheckError (); + + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i <= j; i++ ) { + + /* + Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + */ + + cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, + &workspace->v[index_wkspace_sys(j+1,0,N)], 1, + &v_add_tmp)); + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + /* + Cuda_Vector_Add <<< 
BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + double NEG_V_ADD_TMP = -v_add_tmp; + cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, + &workspace->v[index_wkspace_sys(j+1,0,N)], 1 )); + } + + + //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + /* + cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH ); + + Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL); + cudaThreadSynchronize (); + cudaCheckError (); + + Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL); + cudaThreadSynchronize (); + cudaCheckError (); + + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + */ + cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp )); + copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + + /* + Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>> + ( &workspace->v[index_wkspace_sys(j+1,0,N)], + 1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + double REC_V_ADD_TMP = 1. / v_add_tmp; + cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP, &workspace->v[index_wkspace_sys(j+1,0,N)], 1)); + + + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + GMRES_Givens_Rotation <<<1, 1>>> + (j, workspace->h, workspace->hc, workspace->hs, g[j], spad); + cudaThreadSynchronize (); + cudaCheckError (); + copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + } + + copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__); + + /* solve Hy = g. 
+ H is now upper-triangular, do back-substitution */ + copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G); + GMRES_BackSubstitution <<<1, 1>>> + (j, spad, workspace->h, workspace->y); + cudaThreadSynchronize (); + cudaCheckError (); + + /* update x = x_0 + Vy */ + for( i = 0; i < j; i++ ) + { + /* + copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>> + ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N ); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, + x, 1)); + } + + /* stopping condition */ + if( fabs(g[j]) / bnorm <= tol ) + break; + } + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + return itr * (RESTART+1) + j + 1; + } + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j); +#endif + + return itr * (RESTART+1) + j + 1; +} + + diff --git a/PuReMD-GPU/src/cuda_lin_alg.h b/PuReMD-GPU/src/cuda_lin_alg.h new file mode 100644 index 0000000000000000000000000000000000000000..6b464152280f3590c1c70761c1b3ef206779cf5b --- /dev/null +++ b/PuReMD-GPU/src/cuda_lin_alg.h @@ -0,0 +1,43 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_LIN_ALG_H_ +#define __CUDA_LIN_ALG_H_ + +#define SIGN(x) (x < 0.0 ? 
-1 : 1); + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_Matvec (sparse_matrix , real *, real *, int ); +GLOBAL void Cuda_Matvec_csr (sparse_matrix , real *, real *, int ); +int Cuda_GMRES( static_storage *, real *b, real tol, real *x ); +int Cublas_GMRES( reax_system *, static_storage *, real *b, real tol, real *x ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_list.cu b/PuReMD-GPU/src/cuda_list.cu new file mode 100644 index 0000000000000000000000000000000000000000..5375297c8da15000d673cc70ef5cad3a91dae541 --- /dev/null +++ b/PuReMD-GPU/src/cuda_list.cu @@ -0,0 +1,114 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_list.h" + +#include "cuda_utils.h" + + +char Cuda_Make_List(int n, int num_intrs, int type, list* l) +{ + char success=1; + + l->n = n; + l->num_intrs = num_intrs; + + cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX ); + cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX ); + + switch(type) + { + case TYP_FAR_NEIGHBOR: + cuda_malloc ((void **) &l->select.far_nbr_list, + l->num_intrs*sizeof(far_neighbor_data), + 1, LIST_FAR_NEIGHBOR_DATA); + /* + cudaHostAlloc ((void **) &l->select.far_nbr_list, + l->num_intrs*sizeof(far_neighbor_data), + cudaHostAllocMapped); + + cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, + (void *)l->select.far_nbr_list, 0); + */ + break; + + case TYP_HBOND: + cuda_malloc ((void **) &l->select.hbond_list, + l->num_intrs * sizeof(hbond_data), + 1, LIST_HBOND_DATA ); + break; + + case TYP_BOND: + cuda_malloc ((void **) &l->select.bond_list, + l->num_intrs * sizeof(bond_data), + 1, LIST_BOND_DATA ); + break; + + case TYP_THREE_BODY: + cuda_malloc ( (void **) &l->select.three_body_list, + l->num_intrs * sizeof(three_body_interaction_data), + 1, LIST_THREE_BODY_DATA ); + break; + + default: + fprintf (stderr, "Unknown list creation \n" ); + exit (1); + } + + return success; +} + + +void Cuda_Delete_List(list* l) +{ + if (l->index != NULL) + cuda_free (l->index, LIST_INDEX ); + if (l->end_index != NULL) + cuda_free (l->end_index, LIST_END_INDEX ); + + switch(l->type) + { + case TYP_FAR_NEIGHBOR: + if (l->select.far_nbr_list != NULL) + cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA); + break; + + case TYP_HBOND: + if (l->select.hbond_list != NULL) + cuda_free (l->select.hbond_list, LIST_HBOND_DATA ); + break; + + case TYP_BOND: + if (l->select.bond_list != NULL) + cuda_free (l->select.bond_list, LIST_BOND_DATA ); + break; + + case TYP_THREE_BODY: + if (l->select.three_body_list != NULL) + cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA ); + break; + + default: + fprintf (stderr, 
"Unknown list deletion \n" ); + exit (1); + } +} + + diff --git a/PuReMD-GPU/src/helpers.h b/PuReMD-GPU/src/cuda_list.h similarity index 82% rename from PuReMD-GPU/src/helpers.h rename to PuReMD-GPU/src/cuda_list.h index aa1bb62080ee5cdd9b92718c21124a387e248bad..cafc85743b9eb6b31da5b35b063361ef97fddf54 100644 --- a/PuReMD-GPU/src/helpers.h +++ b/PuReMD-GPU/src/cuda_list.h @@ -18,12 +18,22 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#ifndef __HELPERS_H__ -#define __HELPERS_H__ +#ifndef __CUDA_LIST_H_ +#define __CUDA_LIST_H_ #include "mytypes.h" -GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, - simulation_box *box, real d1, real d2, real d3); + +#ifdef __cplusplus +extern "C" { +#endif + +char Cuda_Make_List( int, int, int, list* ); +void Cuda_Delete_List( list* ); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/lookup.cu b/PuReMD-GPU/src/cuda_lookup.cu similarity index 60% rename from PuReMD-GPU/src/lookup.cu rename to PuReMD-GPU/src/cuda_lookup.cu index c6cc23cfcb431d3fbb2921068fa958816b5287fc..9804ad8b6afee5b0c7e04ea37f46d9c206c5ba68 100644 --- a/PuReMD-GPU/src/lookup.cu +++ b/PuReMD-GPU/src/cuda_lookup.cu @@ -18,37 +18,16 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "lookup.h" -#include "two_body_interactions.h" +#include "cuda_lookup.h" -#include "cuda_utils.h" #include "index_utils.h" -void Make_Lookup_Table(real xmin, real xmax, int n, - lookup_function f, lookup_table* t) -{ - int i; - - t->xmin = xmin; - t->xmax = xmax; - t->n = n; - t->dx = (xmax - xmin)/(n-1); - t->inv_dx = 1.0 / t->dx; - t->a = (n-1)/(xmax-xmin); - t->y = (real*) malloc(n*sizeof(real)); - - for(i=0; i < n; i++) - t->y[i] = f(i*t->dx + t->xmin); - - // //fprintf(stdout,"dx = %lf\n",t->dx); - // for(i=0; i < n; i++) - // //fprintf( stdout,"%d %lf %lf %lf\n", - // i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) ); -} +#include "cuda_utils.h" +#include "cuda_two_body_interactions.h" /* Fills solution into x. Warning: will modify c and d! 
*/ -HOST_DEVICE void Tridiagonal_Solve( const real *a, const real *b, +DEVICE void Tridiagonal_Solve( const real *a, const real *b, real *c, real *d, real *x, unsigned int n){ int i; real id; @@ -68,6 +47,7 @@ HOST_DEVICE void Tridiagonal_Solve( const real *a, const real *b, x[i] = d[i] - c[i] * x[i + 1]; } + GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b, real *c, real *d, real *x, unsigned int n) { @@ -75,65 +55,6 @@ GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b, } - - - - - - - - -void Natural_Cubic_Spline( const real *h, const real *f, - cubic_spline_coef *coef, unsigned int n ) -{ - int i; - real *a, *b, *c, *d, *v; - - /* allocate space for the linear system */ - a = (real*) malloc( n * sizeof(real) ); - b = (real*) malloc( n * sizeof(real) ); - c = (real*) malloc( n * sizeof(real) ); - d = (real*) malloc( n * sizeof(real) ); - v = (real*) malloc( n * sizeof(real) ); - - /* build the linear system */ - a[0] = a[1] = a[n-1] = 0; - for( i = 2; i < n-1; ++i ) - a[i] = h[i-1]; - - b[0] = b[n-1] = 0; - for( i = 1; i < n-1; ++i ) - b[i] = 2 * (h[i-1] + h[i]); - - c[0] = c[n-2] = c[n-1] = 0; - for( i = 1; i < n-2; ++i ) - c[i] = h[i]; - - d[0] = d[n-1] = 0; - for( i = 1; i < n-1; ++i ) - d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); - - /*//fprintf( stderr, "i a b c d\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ - v[0] = 0; - v[n-1] = 0; - Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); - - for( i = 1; i < n; ++i ){ - coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); - coef[i-1].c = v[i]/2; - coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; - coef[i-1].a = f[i]; - } - - /*//fprintf( stderr, "i v coef\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f %f\n", - i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ -} - - GLOBAL void cubic_spline_init_a ( real *a, const real *h, int n ) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -146,6 +67,7 @@ GLOBAL void cubic_spline_init_a ( real *a, const real *h, int n ) } } + GLOBAL void cubic_spline_init_b (real *b, const real *h, int n ) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -158,6 +80,7 @@ GLOBAL void cubic_spline_init_b (real *b, const real *h, int n ) } } + GLOBAL void cubic_spline_init_c (real *c, const real *h, int n ) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -170,6 +93,7 @@ GLOBAL void cubic_spline_init_c (real *c, const real *h, int n ) } } + GLOBAL void cubic_spline_init_d (real *d, const real *f, const real *h, int n ) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -182,6 +106,7 @@ GLOBAL void cubic_spline_init_d (real *d, const real *f, const real *h, int n ) } } + GLOBAL void calculate_cubic_spline_coef ( const real *f, real *v, const real *h, LR_lookup_table *data, int offset, int n ) { cubic_spline_coef *coef; @@ -270,66 +195,6 @@ void Cuda_Natural_Cubic_Spline( const real *h, const real *f, } - - - - - - - - - -void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast, - cubic_spline_coef *coef, unsigned int n ) -{ - int i; - real *a, *b, *c, *d, *v; - - /* allocate space for the linear system */ - a = (real*) malloc( n * sizeof(real) ); - b = (real*) malloc( n * sizeof(real) ); - c = (real*) malloc( n * sizeof(real) ); - d = (real*) malloc( n * sizeof(real) ); - v = (real*) malloc( n * sizeof(real) ); - - /* build the linear system */ - a[0] = 0; - for( i = 1; i < n; ++i ) - a[i] = h[i-1]; - - b[0] = 2*h[0]; - for( i = 
1; i < n; ++i ) - b[i] = 2 * (h[i-1] + h[i]); - - c[n-1] = 0; - for( i = 0; i < n-1; ++i ) - c[i] = h[i]; - - d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0; - d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]); - for( i = 1; i < n-1; ++i ) - d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); - - /*//fprintf( stderr, "i a b c d\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ - Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n ); - // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); - - for( i = 1; i < n; ++i ){ - coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); - coef[i-1].c = v[i]/2; - coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; - coef[i-1].a = f[i]; - } - - /*//fprintf( stderr, "i v coef\n" ); - for( i = 0; i < n; ++i ) - //fprintf( stderr, "%d %f %f %f %f %f\n", - i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ -} - - GLOBAL void complete_cubic_spline_init_a (real *a, const real *h, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -341,6 +206,7 @@ GLOBAL void complete_cubic_spline_init_a (real *a, const real *h, int n) } } + GLOBAL void complete_cubic_spline_init_b (real *b, const real *h, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -364,6 +230,7 @@ GLOBAL void complete_cubic_spline_init_c (real *c, const real *h, int n ) } } + GLOBAL void complete_cubic_spline_init_d (real *d, const real *f, const real *h, int v0_r, int vlast_r, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -383,6 +250,7 @@ GLOBAL void complete_cubic_spline_init_d (real *d, const real *f, const real *h, d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); } + GLOBAL void calculate_complete_cubic_spline_coef (LR_lookup_table *data, int offset, real *v, const real *h, const real *f, int n) { @@ -409,6 +277,7 @@ GLOBAL void calculate_complete_cubic_spline_coef (LR_lookup_table *data, int off coef[i-1].a = f[i]; } + void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vlast_r, LR_lookup_table *data, int offset, unsigned int n ) { @@ -471,206 +340,6 @@ void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vla } - - -void LR_Lookup( LR_lookup_table *t, real r, LR_data *y ) -{ - int i; - real base, dif; - - i = (int)(r * t->inv_dx); - if( i == 0 ) ++i; - base = (real)(i+1) * t->dx; - dif = r - base; - ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif ); - - y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + - t->vdW[i].a; - y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + - t->CEvd[i].b)*dif + t->CEvd[i].a; - //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b; - - y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + - t->ele[i].a; - y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif + - t->CEclmb[i].a; - - y->H = y->e_ele * EV_to_KCALpMOL / C_ele; - //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a; -} - - -void Make_LR_Lookup_Table( reax_system *system, control_params *control ) -{ - int i, j, r; - int num_atom_types; - int existing_types[MAX_ATOM_TYPES]; - real dr; - real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb; - real v0_vdw, v0_ele, vlast_vdw, vlast_ele; - /* real rand_dist; - real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr; - real eele_abserr, eele_relerr, fele_abserr, fele_relerr; - real evdw_maxerr, eele_maxerr; - LR_data y, y_spline; */ - - /* initializations */ - vlast_ele = 0; - vlast_vdw = 
0; - v0_ele = 0; - v0_vdw = 0; - - num_atom_types = system->reaxprm.num_atom_types; - dr = control->r_cut / control->tabulate; - h = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fh = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fele = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) ); - - /* allocate Long-Range LookUp Table space based on - number of atom types in the ffield file */ - //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) ); - //for( i = 0; i < num_atom_types; ++i ) - // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table)); - - LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table)); - - /* most atom types in ffield file will not exist in the current - simulation. to avoid unnecessary lookup table space, determine - the atom types that exist in the current simulation */ - for( i = 0; i < MAX_ATOM_TYPES; ++i ) - existing_types[i] = 0; - for( i = 0; i < system->N; ++i ) - existing_types[ system->atoms[i].type ] = 1; - - /* fill in the lookup table entries for existing atom types. - only lower half should be enough. */ - for( i = 0; i < num_atom_types; ++i ) - if( existing_types[i] ) - for( j = i; j < num_atom_types; ++j ) - if( existing_types[j] ) { - LR[ index_lr (i,j,num_atom_types) ].xmin = 0; - LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut; - LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1; - LR[ index_lr (i,j,num_atom_types) ].dx = dr; - LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut; - LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data)); - LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) - malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); - - for( r = 1; r <= control->tabulate; ++r ) { - LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) ); - h[r] = LR[ index_lr (i,j,num_atom_types) ].dx; - fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H; - fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW; - fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; - fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele; - fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; - - if( r == 1 ){ - v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; - v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; - } - else if( r == control->tabulate ){ - vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; - vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; - } - } - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fh" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( 
stderr, "%f %f %f\n", r * dr, h[r], fh[r] ); */ - Natural_Cubic_Spline( &h[1], &fh[1], - &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 ); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fvdw" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fvdw[r] ); - //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw ); - */ - Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, - &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 ); - Natural_Cubic_Spline( &h[1], &fCEvd[1], - &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 ); - - /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fele" ); - for( r = 1; r <= control->tabulate; ++r ) - //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fele[r] ); - //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele ); - */ - Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, - &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 ); - Natural_Cubic_Spline( &h[1], &fCEclmb[1], - &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 ); - } - - /***** //test LR-Lookup table - evdw_maxerr = 0; - eele_maxerr = 0; - for( i = 0; i < num_atom_types; ++i ) - if( existing_types[i] ) - for( j = i; j < num_atom_types; ++j ) - if( existing_types[j] ) { - for( r = 1; r <= 100; ++r ) { - rand_dist = (real)rand()/RAND_MAX * control->r_cut; - LR_vdW_Coulomb( system, control, i, j, rand_dist, &y ); - LR_Lookup( &(LR[i][j]), rand_dist, &y_spline ); - - evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW); - evdw_relerr = fabs(evdw_abserr / y.e_vdW); - fvdw_abserr = fabs(y.CEvd - y_spline.CEvd); - fvdw_relerr = fabs(fvdw_abserr / y.CEvd); - eele_abserr = fabs(y.e_ele - y_spline.e_ele); - eele_relerr = fabs(eele_abserr / y.e_ele); - fele_abserr = fabs(y.CEclmb - y_spline.CEclmb); - fele_relerr = fabs(fele_abserr / y.CEclmb); - - if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){ -//fprintf( stderr, "rand_dist = %24.15e\n", rand_dist ); -//fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", -y.H, y_spline.H, -fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) ); - -//fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", -y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr ); -//fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", -y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr ); - -//fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", -y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr ); -//fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", -y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr ); -} - -if( evdw_relerr > evdw_maxerr ) -evdw_maxerr = evdw_relerr; -if( eele_relerr > eele_maxerr ) -eele_maxerr = eele_relerr; -} -} -//fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr ); -//fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr ); - *******/ - -free(h); -free(fh); -free(fvdw); -free(fCEvd); -free(fele); -free(fCEclmb); -} - void copy_LR_table_to_device (reax_system *system, control_params *control) { int i, j, r; @@ -728,30 +397,6 @@ void copy_LR_table_to_device (reax_system *system, control_params *control) } - - - - - - - - - - - - - - - - - - - - -////////////////////////////////////////////////////////////////////////// -// CUDA Functions for Lookup Table -////////////////////////////////////////////////////////////////////////// - GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real *fvdw, real *fCEvd, real *fele, real *fCEclmb, global_parameters g_params, 
two_body_parameters *tbp, control_params *control, int i, @@ -760,17 +405,18 @@ GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real int r = blockIdx.x * blockDim.x + threadIdx.x; if ( r == 0 || r > count ) return; - LR_vdW_Coulomb ( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types ); + d_LR_vdW_Coulomb( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types ); - h[r] = d_LR[ index_lr (i, j, num_atom_types) ].dx; - fh[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].H; - fvdw[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_vdW; - fCEvd[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEvd; - fele[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_ele; - fCEclmb[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEclmb; + h[r] = d_LR[ index_lr(i, j, num_atom_types) ].dx; + fh[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].H; + fvdw[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].e_vdW; + fCEvd[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].CEvd; + fele[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].e_ele; + fCEclmb[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].CEclmb; } -GLOBAL void init_LR_values ( LR_lookup_table *d_LR, control_params *control, real dr, int i, int j, int num_atom_types ) + +GLOBAL void init_LR_values( LR_lookup_table *d_LR, control_params *control, real dr, int i, int j, int num_atom_types ) { d_LR[ index_lr (i, j, num_atom_types) ].xmin = 0; d_LR[ index_lr (i, j, num_atom_types) ].xmax = control->r_cut; @@ -779,6 +425,7 @@ GLOBAL void init_LR_values ( LR_lookup_table *d_LR, control_params *control, rea d_LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->r_cut; } + void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control ) { int i, j, r; @@ -907,54 +554,3 @@ void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control ) cuda_free(fele, RES_LR_LOOKUP_ELE); cuda_free(fCEclmb, RES_LR_LOOKUP_CECLMB); } - - - - - - - - -////////////////////////////////////////////////////////////////////////// -// CUDA Functions for Lookup Table -////////////////////////////////////////////////////////////////////////// - - - -int Lookup_Index_Of( real x, lookup_table* t ) -{ - return (int)( t->a * ( x - t->xmin ) ); -} - - -real Lookup( real x, lookup_table* t ) -{ - real x1, x2; - real b; - int i; - - /* if ( x < t->xmin) - { - //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x); - exit(0); - } - if ( x > t->xmax) - { - //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x); - exit(0); - } */ - - i = Lookup_Index_Of( x, t ); - x1 = i * t->dx + t->xmin; - x2 = (i+1) * t->dx + t->xmin; - - b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx; - // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n", - // i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x)); - - return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b; -} - - - - diff --git a/PuReMD-GPU/src/cuda_lookup.h b/PuReMD-GPU/src/cuda_lookup.h new file mode 100644 index 0000000000000000000000000000000000000000..a0e05e0c092f1634a1d21fd902f198f714ac2391 --- /dev/null +++ b/PuReMD-GPU/src/cuda_lookup.h @@ -0,0 +1,40 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU 
General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_LOOKUP_H_ +#define __CUDA_LOOKUP_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Make_LR_Lookup_Table( reax_system*, control_params* ); +void copy_LR_table_to_device ( reax_system*, control_params* ); + +#ifdef __cplusplus +} +#endif + + +#endif + diff --git a/PuReMD-GPU/src/cuda_neighbors.cu b/PuReMD-GPU/src/cuda_neighbors.cu new file mode 100644 index 0000000000000000000000000000000000000000..876b6b9913e4d825e0cc8be5a2fc1d092c56d9f8 --- /dev/null +++ b/PuReMD-GPU/src/cuda_neighbors.cu @@ -0,0 +1,764 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/
+
+#include "cuda_neighbors.h"
+
+#include "box.h"
+#include "grid.h"
+#include "list.h"
+#include "neighbors.h"
+#include "reset_utils.h"
+#include "system_props.h"
+#include "vector.h"
+#include "index_utils.h"
+
+#include "cuda_utils.h"
+#include "cuda_grid.h"
+
+
+extern inline DEVICE int index_grid (int blocksize)
+{
+    return blockIdx.x * gridDim.y * gridDim.z * blocksize +
+        blockIdx.y * gridDim.z * blocksize +
+        blockIdx.z * blocksize ;
+}
+
+
+/* minimum-image displacement between x1 and x2 under periodic boundaries;
+ * when the pair is within cutoff, fills data->d, data->dvec, and
+ * data->rel_box (the periodic image offset per axis) and returns 1,
+ * otherwise returns 0 */
+DEVICE int d_Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
+        real cutoff, far_neighbor_data *data )
+{
+    real norm_sqr, d, tmp;
+    int i;
+
+    norm_sqr = 0;
+
+    for( i = 0; i < 3; i++ ) {
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+
+        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {
+            if( x2[i] > x1[i] ) {
+                d -= box->box_norms[i];
+                data->rel_box[i] = -1;
+            }
+            else {
+                d += box->box_norms[i];
+                data->rel_box[i] = +1;
+            }
+
+            data->dvec[i] = d;
+            norm_sqr += SQR(d);
+        }
+        else {
+            data->dvec[i] = d;
+            norm_sqr += tmp;
+            data->rel_box[i] = 0;
+        }
+    }
+
+    if( norm_sqr <= SQR(cutoff) ){
+        data->d = sqrt(norm_sqr);
+        return 1;
+    }
+
+    return 0;
+}
+
+
+GLOBAL void k_Estimate_NumNeighbors( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params *control, int *indices )
+{
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs;
+    rvec *nbrs_cp = g.nbrs_cp;
+
+    int *nbr_atoms;
+    int atom1, atom2, l, iter, max, m, num_far;
+    far_neighbor_data nbr_data;
+    int x, y, z, i;
+
+    if (threadIdx.x >= *(top + index_grid(1))){
+        return;
+    }
+
+    nbrs = nbrs + index_grid (g.max_nbrs);
+    nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
+    atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
+
+    num_far = 0;
+    iter = 0;
+
+    while (nbrs[iter][0] >= 0) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+
+        //condition check for cutoff here
+        if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <=
+                SQR (control->vlist_cut))
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+            for (m = 0; m < max; m++) {
+                atom2 = nbr_atoms[m];
+
+                //CHANGE ORIGINAL
+                /*
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
+                            control->vlist_cut, &nbr_data)){
+                        ++num_far;
+                    }
+                }
+                */
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
+                            control->vlist_cut, &nbr_data)){
+                        ++num_far;
+                    }
+                }
+                else if (atom1 < atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box,
+                            control->vlist_cut, &nbr_data)){
+                        ++num_far;
+                    }
+                }
+                //CHANGE ORIGINAL
+            }
+        }
+        ++iter;
+    }
+
+    //indices[ atom1 ] = num_far;// * SAFE_ZONE;
+    indices[ atom1 ] = num_far * SAFE_ZONE;   // pad the estimate so small fluctuations do not overflow the list
+}
+
+
+/*One thread per atom Implementation */
+GLOBAL void k_New_Estimate_NumNeighbors( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control, int N, int *indices )
+{
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs;
+    rvec *nbrs_cp = g.nbrs_cp;
+
+    int *nbr_atoms;
+    int atom1, atom2, iter, max, m, num_far;
+    int x, y, z, i;
+    int atom_x, atom_y, atom_z;
+    far_neighbor_data temp;
+    rvec atom1_x;
+
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) return;   // index == N would read one past the end of sys_atoms
+
+    atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
+    atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
+    atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
+
+#ifdef __BNVT_FIX__
+    /* an atom sitting exactly on the upper box face would map to cell index
+       ncell, one past the last cell; clamp it back into the last cell */
+    if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+    if 
(atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; + if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; +#endif + + nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + atom1 = index; + + rvec_Copy (atom1_x, sys_atoms [atom1].x ); + + num_far = 0; + iter = 0; + + while (nbrs[iter][0] >= 0) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + for (m = 0; m < max; m++) + { + atom2 = nbr_atoms[m]; + if (atom1 > atom2) { + if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + num_far++; + } + } + else if (atom1 < atom2) { + if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, + control->vlist_cut, &temp)){ + num_far ++; + } + } + } + } + ++iter; + } + indices [atom1] = num_far * SAFE_ZONE; +} + + +/*One thread per entry in the gcell implementation */ +GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms, + grid g, simulation_box *box, control_params* control, + list far_nbrs ) +{ + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, l, iter, max, m, num_far; + int x, y, z, i; + far_neighbor_data *nbr_data; + far_neighbor_data temp; + + if (threadIdx.x >= *(top + index_grid(1))){ + return; + } + + nbrs = nbrs + index_grid (g.max_nbrs); + nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); + atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; + + num_far = Start_Index (atom1, &far_nbrs); + //Set_Start_Index (atom1, 0, &far_nbrs); + //num_far = 0; + iter = 0; + + while (nbrs[iter][0] >= 0) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + //condition check for cutoff here + if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + for (m = 0; m < max; m++) { + atom2 = nbr_atoms[m]; + + //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] ); + + //CHANGE ORIGINAL + /* + if (atom1 > atom2) { + if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + + nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + ++num_far; + } + } + */ + if (atom1 > atom2) { + if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)){ + nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + ++num_far; + } + } + else if (atom1 < atom2) { + if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, + control->vlist_cut, &temp)){ + nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = 
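/* rel_box is the periodic image offset (-1, 0, or +1 per axis)
                           chosen by d_Are_Far_Neighbors above for the
                           minimum-image copy of atom2 */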
temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        ++num_far;
+                    }
+                }
+                //CHANGE ORIGINAL
+            }
+        }
+        ++iter;
+    }
+
+    //end the far_neighbor list here
+    Set_End_Index (atom1, num_far, &far_nbrs);
+}
+
+
+/*One thread per atom Implementation */
+GLOBAL void k_New_Generate_Neighbor_Lists( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control,
+        list far_nbrs, int N )
+{
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs;
+    rvec *nbrs_cp = g.nbrs_cp;
+
+    int *nbr_atoms;
+    int atom1, atom2, l, iter, max, m, num_far;
+    int x, y, z, i;
+    far_neighbor_data *nbr_data, *my_start;
+    far_neighbor_data temp;
+    int atom_x, atom_y, atom_z;
+    rvec atom1_x;
+
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) return;   // index == N would read one past the end of sys_atoms
+
+    atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
+    atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
+    atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
+
+#ifdef __BNVT_FIX__
+    if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+    if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+    if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+#endif
+
+    nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    atom1 = index;
+
+    rvec_Copy (atom1_x, sys_atoms [atom1].x );
+
+    /* num_far indexes into the single shared far-neighbor list;
+       my_start points at this atom's preallocated slice of it */
+    num_far = Start_Index (atom1, &far_nbrs);
+    my_start = & (far_nbrs.select.far_nbr_list [num_far] );
+
+    //Set_Start_Index (atom1, 0, &far_nbrs);
+    //num_far = 0;
+    iter = 0;
+
+    while (nbrs[iter][0] >= 0) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+
+        //condition check for cutoff here
+        //if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <=
+        if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <=
+                SQR (control->vlist_cut))
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+
+            for (m = 0; m < max; m++)
+            {
+                atom2 = nbr_atoms[m];
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box,
+                            control->vlist_cut, &temp)){
+                        //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        nbr_data = my_start;
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        num_far++;
+                        my_start ++;
+                    }
+                }
+                else if (atom1 < atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box,
+                            control->vlist_cut, &temp)){
+                        //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        nbr_data = my_start;
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        num_far ++;
+                        my_start ++;
+                    }
+                }
+                //CHANGE ORIGINAL
+            }
+        }
+        ++iter;
+    }
+
+    //end the far_neighbor list here
+    Set_End_Index (atom1, num_far, &far_nbrs);
+}
+
+
+/*Multiple threads per atom Implementation */
+GLOBAL void Test_Generate_Neighbor_Lists( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control,
+        list far_nbrs, int N )
+{
+
+    extern __shared__ int __nbr[];
+    extern __shared__ int __sofar [];
+    int nbrgen;
+
+    int 
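+    /* shared memory layout, inferred from the kernel launch in
+       Cuda_Generate_Neighbor_Lists below: the first blockDim.x ints hold the
+       per-lane neighbor flags / prefix sums (tnbr); the next
+       blockDim.x / NBRS_THREADS_PER_ATOM ints, reached through
+       __nbr + blockDim.x, hold the per-atom running counts (nbrssofar) */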
__THREADS_PER_ATOM__ = NBRS_THREADS_PER_ATOM; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N ) return; + + int *tnbr = __nbr; + //int *nbrssofar = __nbr + __THREADS_PER_ATOM__; + int *nbrssofar = __nbr + blockDim.x; + + int *atoms = g.atoms; + int *top = g.top; + ivec *nbrs = g.nbrs; + rvec *nbrs_cp = g.nbrs_cp; + + int *nbr_atoms; + int atom1, atom2, l, iter, max, m, num_far; + int leader = -10; + int x, y, z, i; + far_neighbor_data *nbr_data, *my_start; + far_neighbor_data temp; + int atom_x, atom_y, atom_z; + + + atom1 = warp_id; + atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]); + atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]); + atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]); + +#ifdef __BNVT_FIX__ + if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; + if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; + if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; +#endif + + nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); + + num_far = Start_Index (atom1, &far_nbrs); + my_start = & (far_nbrs.select.far_nbr_list [num_far] ); + + iter = 0; + tnbr[threadIdx.x] = 0; + + if (lane_id == 0) { + //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0; + nbrssofar [my_bucket] = 0; + } + + __syncthreads (); + + while ((nbrs[iter][0] >= 0)) { + x = nbrs[iter][0]; + y = nbrs[iter][1]; + z = nbrs[iter][2]; + + tnbr[threadIdx.x] = 0; + nbrgen = FALSE; + + if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= + SQR (control->vlist_cut)) + { + nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); + max = top [index_grid_3d(x, y, z, &g)]; + + tnbr[threadIdx.x] = 0; + nbrgen = FALSE; + m = lane_id ; //0-31 + int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 
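/* i.e. loopcount = ceil(max / __THREADS_PER_ATOM__): the group's
                   lanes stride through the cell's atoms
                   __THREADS_PER_ATOM__ at a time, so this many passes
                   cover all of them */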
0 : 1); + int iterations = 0; + //while (m < max) + while (iterations < loopcount) + { + tnbr [threadIdx.x] = 0; + nbrgen = FALSE; + + if (m < max) { + atom2 = nbr_atoms[m]; + if (atom1 > atom2) { + if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + control->vlist_cut, &temp)) + { + tnbr [threadIdx.x] = 1; + nbrgen = TRUE; + } + } + else if (atom1 < atom2) { + if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, + control->vlist_cut, &temp)){ + tnbr [threadIdx.x] = 1; + nbrgen = TRUE; + } + } + } + + if (nbrgen) + { + //do leader selection here + leader = -1; + //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) + for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) + if (tnbr[l]){ + leader = l; + break; + } + + //do the reduction; + if (threadIdx.x == leader) + for (l = 1; l < __THREADS_PER_ATOM__; l++) + //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)]; + tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)]; + } + + //__syncthreads (); + //MYATOMICADD( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1); + //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ; + + if (nbrgen) + { + //got the indices + //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1; + nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; + nbr_data->nbr = atom2; + nbr_data->rel_box[0] = temp.rel_box[0]; + nbr_data->rel_box[1] = temp.rel_box[1]; + nbr_data->rel_box[2] = temp.rel_box[2]; + + nbr_data->d = temp.d; + nbr_data->dvec[0] = temp.dvec[0]; + nbr_data->dvec[1] = temp.dvec[1]; + nbr_data->dvec[2] = temp.dvec[2]; + + if (threadIdx.x == leader) + //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; + nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; + } + + m += __THREADS_PER_ATOM__; + iterations ++; + + //cleanup + nbrgen = FALSE; + tnbr [threadIdx.x] = 0; + } + } + ++iter; + } + + __syncthreads (); + + //end the far_neighbor list here + if (lane_id == 0) + Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs); + //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs); +} + + +void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, int estimate) +{ + real t_start, t_elapsed; + real t_1, t_2; + + list *far_nbrs = dev_lists + FAR_NBRS; + + int *d_indices = (int *) scratch; + int *nbrs_start, *nbrs_end; + int i, max_nbrs = 0; + int nbs; + + t_start = Get_Time (); + + Cuda_Bin_Atoms (system, workspace); + Cuda_Bin_Atoms_Sync ( system ); + + if (dev_workspace->realloc.estimate_nbrs > -1) { + + /*reset the re-neighbor condition */ + dev_workspace->realloc.estimate_nbrs = -1; + + //#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Recomputing the neighbors estimate.... 
\n"); + //#endif + cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); + /* + dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); + dim3 threadsperblock (system->g.max_atoms); + + k_Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, d_indices); + cudaThreadSynchronize (); + cudaCheckError (); + */ + nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); + k_New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> + ( system->d_atoms, system->d_g, + system->d_box, (control_params *)control->d_control, + system->N, d_indices); + cudaThreadSynchronize (); + cudaCheckError (); + + + int *nbrs_indices = NULL; + nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); + if (nbrs_indices == NULL) + { + fprintf (stderr, "Malloc failed for nbrs indices .... \n"); + exit (1); + } + memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); + + copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); + for (int i = 1; i <= system->N; i++) + nbrs_indices [i] += nbrs_indices [i-1]; + + copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); + copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); + + free (nbrs_indices); + } + + /* + One thread per atom Implementation + Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, *far_nbrs); + */ + nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + + (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); + + /* Multiple threads per atom Implementation */ + Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, + INT_SIZE * (NBRS_BLOCK_SIZE+ NBRS_BLOCK_SIZE/NBRS_THREADS_PER_ATOM) >>> + (system->d_atoms, system->d_g, system->d_box, + (control_params *)control->d_control, *far_nbrs, system->N ); + cudaThreadSynchronize (); + cudaCheckError (); + + t_elapsed = Get_Timing_Info (t_start); + d_timing.nbrs += t_elapsed; + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed); +#endif + + /*validate neighbors list*/ + nbrs_start = (int *) calloc (system->N, INT_SIZE); + nbrs_end = (int *) calloc (system->N, INT_SIZE); + + copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); + copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); + + int device_nbrs = 0; + for(i = 0; i < system->N; i++) + { + if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs) + max_nbrs = nbrs_end[i] - nbrs_start[i]; + + device_nbrs += nbrs_end[i] - nbrs_start[i]; + } + +#ifdef __CUDA_TEST__ + //fprintf (stderr, " New Device count is : %d \n", device_nbrs); + //dev_workspace->realloc.num_far = device_nbrs; +#endif + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs ); + fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs); +#endif + + //validate check here + //get the num_far from the list here + for (i = 0; i < system->N-1; i++) + { + if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE ) + { + dev_workspace->realloc.num_far = device_nbrs; + //#ifdef __CUDA_MEM__ + //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); + //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", + // i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]); + //#endif + } + + if (nbrs_end[i] > nbrs_start[i+1]) { + fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d", + nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]); + exit( INSUFFICIENT_SPACE ); + } + } + + if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) { + dev_workspace->realloc.num_far = device_nbrs; + //#ifdef __CUDA_MEM__ + //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); + //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n" + // , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs); + //#endif + } + if (nbrs_end[i] > far_nbrs->num_intrs) { + fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d", + nbrs_end[i], far_nbrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + + free (nbrs_start); + free (nbrs_end); +} diff --git a/PuReMD-GPU/src/cuda_neighbors.h b/PuReMD-GPU/src/cuda_neighbors.h new file mode 100644 index 0000000000000000000000000000000000000000..13656f62f12b53509fad82f535e974e45cc45805 --- /dev/null +++ b/PuReMD-GPU/src/cuda_neighbors.h @@ -0,0 +1,44 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software 
Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_NEIGHBORS_H_ +#define __CUDA_NEIGHBORS_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void k_Estimate_NumNeighbors( reax_atom *, grid, simulation_box *, + control_params *, int * ); + +void Cuda_Generate_Neighbor_Lists (reax_system *system, + static_storage *workspace, control_params *control, int); + +DEVICE int d_Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_post_evolve.cu b/PuReMD-GPU/src/cuda_post_evolve.cu new file mode 100644 index 0000000000000000000000000000000000000000..f5dbad825f2ac9f6e120e6cf7147258c89868c71 --- /dev/null +++ b/PuReMD-GPU/src/cuda_post_evolve.cu @@ -0,0 +1,148 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_post_evolve.h" + +#include "vector.h" + +#include "cuda_utils.h" +#include "cuda_copy.h" +#include "cuda_system_props.h" + + +void Cuda_Setup_Evolve( reax_system* system, control_params* control, + simulation_data* data, static_storage* workspace, + list** lists, output_controls *out_control ) +{ + //fprintf (stderr, "Begin ... \n"); + //to Sync step to the device. + //Sync_Host_Device_Data( &data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice ); + copy_host_device( &data->step, &((simulation_data *)data->d_simulation_data)->step, + INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + +} + + +void Cuda_Setup_Output( reax_system* system, simulation_data* data ) +{ + // Here sync the simulation data, because it has been changed. 
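+    // Prep_Device_For_Output (presumably from cuda_system_props.h, included
+    // above) appears to copy the device-side energy/pressure terms back to
+    // the host so the output routines see the values computed this step.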
+ Prep_Device_For_Output( system, data ); +} + + +void Cuda_Sync_Temp( control_params* control ) +{ + Sync_Host_Device_Params( control, (control_params*)control->d_control, cudaMemcpyHostToDevice ); +} + + +GLOBAL void Update_Atoms_Post_Evolve (reax_atom *atoms, simulation_data *data, int N) +{ + rvec diff, cross; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= N) return; + + //for( i = 0; i < system->N; i++ ) { + // remove translational + rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); + + // remove rotational + rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm ); + rvec_Cross( cross, data->avcm, diff ); + rvec_ScaledAdd( atoms[i].v, -1., cross ); + //} +} + + +void Cuda_Post_Evolve( reax_system* system, control_params* control, + simulation_data* data, static_storage* workspace, + list** lists, output_controls *out_control ) +{ + int i; + rvec diff, cross; + + /* compute kinetic energy of the system */ + /* + real *results = (real *) scratch; + cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); + Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (system->reaxprm.d_sbp, system->d_atoms, system->N, + (simulation_data *)data->d_simulation_data, (real *) results); + cudaThreadSynchronize (); + cudaCheckError (); + */ + + //fprintf (stderr, "Cuda_Post_Evolve: Begin\n"); + Cuda_Compute_Kinetic_Energy( system, data ); + //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n"); + + /* remove rotational and translational velocity of the center of mass */ + if( control->ensemble != NVE && + control->remove_CoM_vel && + data->step && data->step % control->remove_CoM_vel == 0 ) { + + /* + rvec t_xcm, t_vcm, t_avcm; + rvec_MakeZero (t_xcm); + rvec_MakeZero (t_vcm); + rvec_MakeZero (t_avcm); + + rvec_Copy (t_xcm, data->xcm); + rvec_Copy (t_vcm, data->vcm); + rvec_Copy (t_avcm, data->avcm); + */ + + /* compute velocity of the center of mass */ + Cuda_Compute_Center_of_Mass( system, data, out_control->prs ); + //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... \n"); + /* + fprintf (stderr, "center of mass done on the device \n"); + + fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm ); + fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm ); + fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm ); + + if (check_zero (t_xcm, data->xcm) || + check_zero (t_vcm, data->vcm) || + check_zero (t_avcm, data->avcm)){ + fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n"); + exit (0); + } + */ + + //xcm, avcm, + copy_host_device( data->vcm, + ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + copy_host_device( data->xcm, + ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + copy_host_device( data->avcm, + ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + + //fprintf (stderr, "data copied.... 
\n"); + + Update_Atoms_Post_Evolve<<< BLOCKS, BLOCK_SIZE >>> + (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N); + cudaThreadSynchronize( ); + cudaCheckError( ); + + //fprintf (stderr, " Cuda_Post_Evolve:End \n"); + + } +} diff --git a/PuReMD-GPU/src/cuda_post_evolve.h b/PuReMD-GPU/src/cuda_post_evolve.h new file mode 100644 index 0000000000000000000000000000000000000000..1d8fdc270edd28e0f387d53d9b9837c3a4879542 --- /dev/null +++ b/PuReMD-GPU/src/cuda_post_evolve.h @@ -0,0 +1,48 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_POST_EVOLVE_H__ +#define __CUDA_POST_EVOLVE_H__ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Setup_Evolve( reax_system *, control_params *, + simulation_data *, static_storage *, + list **, output_controls * ); + +void Cuda_Setup_Output( reax_system *, simulation_data * ); + +void Cuda_Sync_Temp( control_params * ); + +void Cuda_Post_Evolve( reax_system *, control_params *, + simulation_data *, static_storage *, + list **, output_controls * ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/reduction.cu b/PuReMD-GPU/src/cuda_reduction.cu similarity index 97% rename from PuReMD-GPU/src/reduction.cu rename to PuReMD-GPU/src/cuda_reduction.cu index 48fb5efc473adda2540a3dac4ffb2664ad9ca2a9..e22f9ad8474830ebdbe11089cdfcc986ff8dd8f3 100644 --- a/PuReMD-GPU/src/reduction.cu +++ b/PuReMD-GPU/src/cuda_reduction.cu @@ -18,9 +18,9 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ -#include "reduction.h" +#include "cuda_reduction.h" + #include "vector.h" -#include "mytypes.h" GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const size_t n) @@ -52,6 +52,7 @@ GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const siz } } + GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass) { extern __shared__ real sdata[]; @@ -87,6 +88,7 @@ GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t } } + GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n ) { extern __shared__ real sdata[]; @@ -116,6 +118,7 @@ GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, con } } + GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results, const size_t n) { extern __shared__ real sdata[]; @@ -146,11 +149,7 @@ GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results } - - - - -GLOBAL void Cuda_reduction(const int *input, int *per_block_results, const size_t n) +GLOBAL void Cuda_reduction_int(const int *input, int *per_block_results, const size_t n) { extern __shared__ int sh_input[]; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -213,10 +212,10 @@ GLOBAL void Cuda_reduction_rvec (rvec *input, rvec *results, size_t n) } } + ////////////////////////////////////////////////// //vector functions ////////////////////////////////////////////////// - GLOBAL void Cuda_Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -225,6 +224,7 @@ GLOBAL void Cuda_Vector_Sum( real* dest, real c, real* v, real d, real* y, int k dest[i] = c * v[i] + d * y[i]; } + GLOBAL void Cuda_Vector_Scale( real* dest, real c, real* v, int k ) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -233,6 +233,7 @@ GLOBAL void Cuda_Vector_Scale( real* dest, real c, real* v, int k ) dest[i] = c * v[i]; } + GLOBAL void Cuda_Vector_Add( real* dest, real c, real* v, int k ) { int i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/PuReMD-GPU/src/reduction.h b/PuReMD-GPU/src/cuda_reduction.h similarity index 66% rename from PuReMD-GPU/src/reduction.h rename to PuReMD-GPU/src/cuda_reduction.h index fefbe4c2de5e7674c2d53c73c2aba698b7c93f2a..5b9baf1df69f2a4ab2bdab0d3bff26e390634951 100644 --- a/PuReMD-GPU/src/reduction.h +++ b/PuReMD-GPU/src/cuda_reduction.h @@ -18,22 +18,32 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ -#ifndef __REDUCTION_H__ -#define __REDUCTION_H__ +#ifndef __CUDA_REDUCTION_H__ +#define __CUDA_REDUCTION_H__ #include "mytypes.h" #define INITIAL 0 #define FINAL 1 -GLOBAL void Cuda_reduction (const real *input, real *per_block_results, const size_t n); -GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass); -GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n); -GLOBAL void Cuda_reduction (const int *input, int *per_block_results, const size_t n); -GLOBAL void Cuda_reduction_rvec (rvec *, rvec *, size_t n); + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_reduction( const real *input, real *per_block_results, const size_t n ); +GLOBAL void Cuda_Norm( const real *input, real *per_block_results, const size_t n, int pass ); +GLOBAL void Cuda_Dot( const real *a, const real *b, real *per_block_results, const size_t n ); +GLOBAL void Cuda_reduction_int( const int *input, int *per_block_results, const size_t n ); +GLOBAL void Cuda_reduction_rvec( rvec *, rvec *, size_t n ); GLOBAL void Cuda_Vector_Sum( real* , real , real* , real , real* , int ) ; GLOBAL void Cuda_Vector_Scale( real* , real , real* , int ) ; GLOBAL void Cuda_Vector_Add( real* , real , real* , int ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PuReMD-GPU/src/reset_utils.cu b/PuReMD-GPU/src/cuda_reset_utils.cu similarity index 55% rename from PuReMD-GPU/src/reset_utils.cu rename to PuReMD-GPU/src/cuda_reset_utils.cu index 0c6f852bdf295ec9ad48af63f9195737adc387c0..d18d9f749a9a57a269f731aecae7754bdb8925a4 100644 --- a/PuReMD-GPU/src/reset_utils.cu +++ b/PuReMD-GPU/src/cuda_reset_utils.cu @@ -18,13 +18,16 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ -#include "reset_utils.h" +#include "cuda_reset_utils.h" + #include "list.h" +#include "reset_utils.h" #include "vector.h" #include "cuda_utils.h" #include "cuda_copy.h" + GLOBAL void Reset_Atoms (reax_atom *atoms, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -35,6 +38,7 @@ GLOBAL void Reset_Atoms (reax_atom *atoms, int N) atoms[i].f[2] = 0.0; } + void Cuda_Reset_Atoms (reax_system *system ) { Reset_Atoms <<<BLOCKS, BLOCK_SIZE>>> @@ -43,42 +47,6 @@ void Cuda_Reset_Atoms (reax_system *system ) cudaCheckError (); } -void Reset_Atoms( reax_system* system ) -{ - int i; - - for( i = 0; i < system->N; ++i ) - memset( system->atoms[i].f, 0.0, RVEC_SIZE ); -} - - -void Reset_Pressures( simulation_data *data ) -{ - rtensor_MakeZero( data->flex_bar.P ); - data->iso_bar.P = 0; - rvec_MakeZero( data->int_press ); - rvec_MakeZero( data->ext_press ); - /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ -} - - -void Reset_Simulation_Data( simulation_data* data ) -{ - data->E_BE = 0; - data->E_Ov = 0; - data->E_Un = 0; - data->E_Lp = 0; - data->E_Ang = 0; - data->E_Pen = 0; - data->E_Coa = 0; - data->E_HB = 0; - data->E_Tor = 0; - data->E_Con = 0; - data->E_vdW = 0; - data->E_Ele = 0; - data->E_Kin = 0; -} void Cuda_Sync_Simulation_Data (simulation_data *data) { @@ -93,40 +61,6 @@ void Cuda_Sync_Simulation_Data (simulation_data *data) } -#ifdef TEST_FORCES -void Reset_Test_Forces( reax_system *system, static_storage *workspace ) -{ - memset( workspace->f_ele, 0, system->N * sizeof(rvec) ); - memset( workspace->f_vdw, 0, system->N * sizeof(rvec) ); - memset( workspace->f_bo, 0, system->N * sizeof(rvec) ); - memset( workspace->f_be, 0, system->N * sizeof(rvec) ); - memset( workspace->f_lp, 0, system->N * sizeof(rvec) ); - memset( workspace->f_ov, 0, system->N * sizeof(rvec) ); - memset( workspace->f_un, 0, system->N * sizeof(rvec) ); - memset( workspace->f_ang, 0, system->N * sizeof(rvec) ); - memset( workspace->f_coa, 0, system->N * sizeof(rvec) ); - memset( workspace->f_pen, 0, system->N * sizeof(rvec) ); - memset( workspace->f_hb, 0, system->N * sizeof(rvec) ); - memset( workspace->f_tor, 0, system->N * sizeof(rvec) ); - memset( workspace->f_con, 0, system->N * sizeof(rvec) ); -} -#endif - - -void Reset_Workspace( reax_system *system, static_storage *workspace ) -{ - memset( workspace->total_bond_order, 0, system->N * sizeof( real ) ); - memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) ); - - memset( workspace->CdDelta, 0, system->N * sizeof( real ) ); - //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) ); - -#ifdef TEST_FORCES - memset( workspace->dDelta, 0, sizeof(rvec) * system->N ); - Reset_Test_Forces( system, workspace ); -#endif -} - void Cuda_Reset_Workspace( reax_system *system, static_storage *workspace ) { cuda_memset( workspace->total_bond_order, 0, system->N * REAL_SIZE, RES_STORAGE_TOTAL_BOND_ORDER ); @@ -157,6 +91,7 @@ GLOBAL void Reset_Neighbor_Lists (single_body_parameters *sbp, reax_atom *atoms, } } + void Cuda_Reset_Neighbor_Lists (reax_system *system, control_params *control, static_storage *workspace, list **lists ) { @@ -172,6 +107,7 @@ void Cuda_Reset_Neighbor_Lists (reax_system *system, control_params *control, cuda_memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs, LIST_BOND_DATA ); } + GLOBAL void Reset_Far_Neighbors_List (list far_nbrs, int N) { int tmp; @@ -183,6 
+119,7 @@ GLOBAL void Reset_Far_Neighbors_List (list far_nbrs, int N) Set_End_Index (index, tmp, &far_nbrs); } + void Cuda_Reset_Far_Neighbors_List ( reax_system *system ) { Reset_Far_Neighbors_List <<<BLOCKS, BLOCK_SIZE>>> @@ -191,52 +128,6 @@ void Cuda_Reset_Far_Neighbors_List ( reax_system *system ) cudaCheckError (); } -void Reset_Neighbor_Lists( reax_system *system, control_params *control, - static_storage *workspace, list **lists ) -{ - int i, tmp; - list *bonds = (*lists) + BONDS; - list *hbonds = (*lists) + HBONDS; - - for( i = 0; i < system->N; ++i ) { - tmp = Start_Index( i, bonds ); - Set_End_Index( i, tmp, bonds ); - } - - //TODO check if this is needed - memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs ); - - if( control->hb_cut > 0 ) - for( i = 0; i < system->N; ++i ) - if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) { - tmp = Start_Index( workspace->hbond_index[i], hbonds ); - Set_End_Index( workspace->hbond_index[i], tmp, hbonds ); - /* fprintf( stderr, "i:%d, hbond: %d-%d\n", - i, Start_Index( workspace->hbond_index[i], hbonds ), - End_Index( workspace->hbond_index[i], hbonds ) );*/ - } -} - - -void Reset( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists ) -{ - Reset_Atoms( system ); - - Reset_Simulation_Data( data ); - - if( control->ensemble == NPT || control->ensemble == sNPT || - control->ensemble == iNPT ) - Reset_Pressures( data ); - - Reset_Workspace( system, workspace ); - - Reset_Neighbor_Lists( system, control, workspace, lists ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "reset - "); -#endif -} void Cuda_Reset_Sparse_Matrix (reax_system *system, static_storage *workspace) { @@ -244,6 +135,7 @@ void Cuda_Reset_Sparse_Matrix (reax_system *system, static_storage *workspace) cuda_memset (workspace->H.val, 0, (system->N * system->max_sparse_matrix_entries) * INT_SIZE, RES_SPARSE_MATRIX_INDEX ); } + void Cuda_Reset( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, list **lists ) { @@ -251,7 +143,7 @@ void Cuda_Reset( reax_system *system, control_params *control, //Reset_Simulation_Data( data ); Cuda_Sync_Simulation_Data ( data ); - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); + //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice ); if( control->ensemble == NPT || control->ensemble == sNPT || control->ensemble == iNPT ) @@ -268,23 +160,7 @@ void Cuda_Reset( reax_system *system, control_params *control, } -void Reset_Grid( grid *g ) -{ - memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]); -} - void Cuda_Reset_Grid (grid *g) { cuda_memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2], RES_GRID_TOP); } - - -void Reset_Marks( grid *g, ivec *grid_stack, int grid_top ) -{ - int i; - - for( i = 0; i < grid_top; ++i ) - g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + - grid_stack[i][1] * g->ncell[2] + - grid_stack[i][2]] = 0; -} diff --git a/PuReMD-GPU/src/cuda_reset_utils.h b/PuReMD-GPU/src/cuda_reset_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..bf730b935cef4857eb8e0aa97d614ef122bfca0e --- /dev/null +++ b/PuReMD-GPU/src/cuda_reset_utils.h @@ -0,0 +1,45 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, 
haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_RESET_UTILS_H_ +#define __CUDA_RESET_UTILS_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void Cuda_Reset_Grid( grid* ); + +void Cuda_Reset_Workspace (reax_system *, static_storage *); + +void Cuda_Reset( reax_system*, control_params*, simulation_data*, + static_storage*, list** ); + +void Cuda_Reset_Atoms (reax_system *); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/single_body_interactions.cu b/PuReMD-GPU/src/cuda_single_body_interactions.cu similarity index 63% rename from PuReMD-GPU/src/single_body_interactions.cu rename to PuReMD-GPU/src/cuda_single_body_interactions.cu index 3c6c08822fb056c18c78bf4db5f7aa334496ea34..530e63d7071135b143d0b432bf1350070d46667c 100644 --- a/PuReMD-GPU/src/single_body_interactions.cu +++ b/PuReMD-GPU/src/cuda_single_body_interactions.cu @@ -18,7 +18,8 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "single_body_interactions.h" +#include "cuda_single_body_interactions.h" + #include "bond_orders.h" #include "list.h" #include "lookup.h" @@ -28,301 +29,6 @@ #include "cuda_helpers.h" -void LonePair_OverUnder_Coordination_Energy( reax_system *system, - control_params *control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) -{ - int i, j, pj, type_i, type_j; - real Delta_lpcorr, dfvl; - real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; - real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; - real e_ov, CEover1, CEover2, CEover3, CEover4; - real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; - real exp_ovun2n, exp_ovun6, exp_ovun8; - real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; - real e_un, CEunder1, CEunder2, CEunder3, CEunder4; - real p_lp1, p_lp2, p_lp3; - real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; - - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_data *pbond; - bond_order_data *bo_ij; - list *bonds = (*lists) + BONDS; - - /* Initialize parameters */ - p_lp1 = system->reaxprm.gp.l[15]; - p_lp3 = system->reaxprm.gp.l[5]; - p_ovun3 = system->reaxprm.gp.l[32]; - p_ovun4 = system->reaxprm.gp.l[31]; - p_ovun6 = system->reaxprm.gp.l[6]; - p_ovun7 = system->reaxprm.gp.l[8]; - p_ovun8 = system->reaxprm.gp.l[9]; - - for( i = 0; i < system->N; ++i ) { - /* set the parameter pointer */ - type_i = system->atoms[i].type; - sbp_i = &(system->reaxprm.sbp[ type_i ]); - - /* lone-pair Energy */ - p_lp2 = sbp_i->p_lp2; - expvd2 = EXP( -75 * workspace->Delta_lp[i] ); - inv_expvd2 = 1. / (1. 
+ expvd2 ); - - /* calculate the energy */ - data->E_Lp += e_lp = - p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - - dElp = p_lp2 * inv_expvd2 + - 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); - CElp = dElp * workspace->dDelta_lp[i]; - - workspace->CdDelta[i] += CElp; // lp - 1st term - -#ifdef TEST_ENERGY - fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", - p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); - fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n", - workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp ); -#endif -#ifdef TEST_FORCES - Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term -#endif - - /* correction for C2 */ - if( system->reaxprm.gp.l[5] > 0.001 && - !strcmp( system->reaxprm.sbp[type_i].name, "C" ) ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - j = bonds->select.bond_list[pj].nbr; - type_j = system->atoms[j].type; - - if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) { - twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - Di = workspace->Delta[i]; - vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); - - if( vov3 > 3. ) { - data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0); - //estrain(i) += e_lph; - - deahu2dbo = 2.*p_lp3*(vov3 - 3.); - deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); - - bo_ij->Cdbo += deahu2dbo; - workspace->CdDelta[i] += deahu2dsbo; -#ifdef TEST_ENERGY - fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", - // workspace->orig_id[i], workspace->orig_id[j], - i+1, j+1, e_lph, deahu2dbo, deahu2dsbo ); -#endif -#ifdef TEST_FORCES - Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); - Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); -#endif - } - } - - } - } - - - for( i = 0; i < system->N; ++i ) { - type_i = system->atoms[i].type; - sbp_i = &(system->reaxprm.sbp[ type_i ]); - - /* over-coordination energy */ - if( sbp_i->mass > 21.0 ) - dfvl = 0.0; - else dfvl = 1.0; // only for 1st-row elements - - p_ovun2 = sbp_i->p_ovun2; - sum_ovun1 = 0; - sum_ovun2 = 0; - - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { - j = bonds->select.bond_list[pj].nbr; - type_j = system->atoms[j].type; - bo_ij = &(bonds->select.bond_list[pj].bo_data); - sbp_j = &(system->reaxprm.sbp[ type_j ]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - - sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; - sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); - - /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", - i+1, j+1, - dfvl * workspace->Delta_lp_temp[j], - sbp_j->nlp_opt, - workspace->nlp_temp[j] );*/ - } - - exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); - inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); - Delta_lpcorr = workspace->Delta[i] - - (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; - - exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); - inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); - - DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); - CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; - - data->E_Ov += e_ov = sum_ovun1 * CEover1; - - CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * - ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); - - CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); - - CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * 
SQR(inv_exp_ovun1); - - - /* under-coordination potential */ - p_ovun2 = sbp_i->p_ovun2; - p_ovun5 = sbp_i->p_ovun5; - - exp_ovun2n = 1.0 / exp_ovun2; - exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); - exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); - inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); - inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); - - data->E_Un += e_un = - -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - - CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + - p_ovun2 * e_un * exp_ovun2n); - CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; - CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); - CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * - p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; - - //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n", - // i+1, sum_ovun2, e_ov, e_un ); - - /* forces */ - workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term - workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term - -#ifdef TEST_FORCES - Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor - 2nd - Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st -#endif - - - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - pbond = &(bonds->select.bond_list[pj]); - j = pbond->nbr; - type_j = system->atoms[j].type; - bo_ij = &(pbond->bo_data); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - - - bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st - workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* - (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a - bo_ij->Cdbopi += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - bo_ij->Cdbopi2 += CEover4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b - - - workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a - bo_ij->Cdbopi += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b - bo_ij->Cdbopi2 += CEunder4 * - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b - - -#ifdef TEST_ENERGY - /* fprintf( out_control->eov, "%6d%23.15e%23.15e" - workspace->orig_id[j]+1, - //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2, - CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */ - - /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[j]+1, - CEover4, - CEover4* - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), - (1.0 - dfvl*workspace->dDelta_lp[j]), - CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ - - /* fprintf( out_control->eun, "%6d%23.15e\n", - workspace->orig_id[j]+1, CEunder3 ); */ - - /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[j]+1, - CEunder4, - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - CEunder4* - (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), - CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])* - (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ -#endif - -#ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, - workspace->f_ov ); // OvCoor - 1st term - - Add_dDelta( system, lists, j, - CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a - - Add_dBOpinpi2( system, lists, i, pj, - CEover4 * (workspace->Delta[j] - - dfvl 
* workspace->Delta_lp_temp[j]), - CEover4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - workspace->f_ov, workspace->f_ov ); // OvCoor - 3b - - Add_dDelta( system, lists, j, - CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * - (bo_ij->BO_pi + bo_ij->BO_pi2), - workspace->f_un ); // UnCoor - 2a - - Add_dBOpinpi2( system, lists, i, pj, - CEunder4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - CEunder4 * (workspace->Delta[j] - - dfvl * workspace->Delta_lp_temp[j]), - workspace->f_un, workspace->f_un ); // UnCoor - 2b -#endif - } - -#ifdef TEST_ENERGY - - fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", - i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); - - fprintf( out_control->eov, "%6d%15.8f%15.8f\n", - i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un ); - - fprintf( out_control->eov, "%6d%15.8f%15.8f\n", - i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un ); -#endif - } -} - - - - - - - - - -//CUDA Functions GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, two_body_parameters *tbp, static_storage p_workspace, simulation_data *data, @@ -374,7 +80,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; //PERFORMANCE IMPACT - atomicAdd (&data->E_Lp, e_lp); + MYATOMICADD(&data->E_Lp, e_lp); dElp = p_lp2 * inv_expvd2 + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); @@ -382,7 +88,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT //workspace->CdDelta[i] += CElp; // lp - 1st term - atomicAdd (&workspace->CdDelta[i], CElp); + MYATOMICADD(&workspace->CdDelta[i], CElp); #ifdef TEST_ENERGY @@ -407,7 +113,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob type_j = atoms[j].type; if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ]); bo_ij = &( bonds->select.bond_list[pj].bo_data ); Di = workspace->Delta[i]; vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); @@ -416,7 +122,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT e_lph = p_lp3 * SQR(vov3-3.0); - atomicAdd (&data->E_Lp, e_lph ); + MYATOMICADD(&data->E_Lp, e_lph ); //estrain(i) += e_lph; deahu2dbo = 2.*p_lp3*(vov3 - 3.); @@ -426,7 +132,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[i], deahu2dsbo); + MYATOMICADD(&workspace->CdDelta[i], deahu2dsbo); #ifdef TEST_ENERGY //TODO //fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", @@ -469,7 +175,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob type_j = atoms[j].type; bo_ij = &(bonds->select.bond_list[pj].bo_data); sbp_j = &(sbp[ type_j ]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]); sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* @@ -500,7 +206,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT //data->E_Ov += e_ov = sum_ovun1 * CEover1; e_ov = sum_ovun1 * CEover1; - atomicAdd (&data->E_Ov, e_ov ); + MYATOMICADD(&data->E_Ov, e_ov ); CEover2 = sum_ovun1 * DlpVi 
* inv_exp_ovun2 * ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); @@ -523,7 +229,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; - atomicAdd (&data->E_Un, e_un ); + MYATOMICADD(&data->E_Un, e_un ); CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + p_ovun2 * e_un * exp_ovun2n); @@ -537,8 +243,8 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob // forces //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[i] , CEover3); // OvCoor - 2nd term - atomicAdd (&workspace->CdDelta[i], CEunder3); // UnCoor - 1st term + MYATOMICADD(&workspace->CdDelta[i] , CEover3); // OvCoor - 2nd term + MYATOMICADD(&workspace->CdDelta[i], CEunder3); // UnCoor - 1st term #ifdef TEST_FORCES //TODO @@ -553,13 +259,13 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob j = pbond->nbr; type_j = atoms[j].type; bo_ij = &(pbond->bo_data); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]); bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a + MYATOMICADD(&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a bo_ij->Cdbopi += CEover4 * (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b @@ -568,7 +274,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob //PERFORMANCE IMPACT - atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ); // UnCoor - 2a + MYATOMICADD(&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ); // UnCoor - 2a bo_ij->Cdbopi += CEunder4 * (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b @@ -647,14 +353,11 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob } - //////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////// //TEST ONLY CODE -- See if this is working. 
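 /* Note on the kernel above: MYATOMICADD replaces the raw atomicAdd() calls so
    that the accumulations into data->E_Lp, data->E_Ov, data->E_Un and
    workspace->CdDelta still build when "real" is double. Its actual definition
    lives in cuda_helpers.h and is not part of this diff; a minimal sketch,
    assuming it wraps the standard atomicCAS-based emulation for devices
    without a native double-precision atomicAdd (compute capability < 6.0).
    The function name below is illustrative only. */
 static inline __device__ double sketch_atomic_add( double *address, double val )
 {
     unsigned long long int *address_as_ull = (unsigned long long int *) address;
     unsigned long long int old = *address_as_ull, assumed;

     do
     {
         assumed = old;
         /* swap in (assumed + val) only if *address was not updated meanwhile */
         old = atomicCAS( address_as_ull, assumed,
                 __double_as_longlong( val + __longlong_as_double(assumed) ) );
     } while ( assumed != old );

     return __longlong_as_double( old );
 }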
//////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////// - -//CUDA Functions GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, two_body_parameters *tbp, static_storage p_workspace, simulation_data *data, @@ -705,7 +408,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob // calculate the energy e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - //atomicAdd (&data->E_Lp, e_lp ); + //MYATOMICADD(&data->E_Lp, e_lp ); E_Lp [ i ] = e_lp; dElp = p_lp2 * inv_expvd2 + @@ -723,7 +426,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob type_j = atoms[j].type; if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ]); bo_ij = &( bonds->select.bond_list[pj].bo_data ); Di = workspace->Delta[i]; vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); @@ -732,7 +435,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_lph = p_lp3 * SQR(vov3-3.0); E_Lp [i] += e_lph; - //atomicAdd (&data->E_Lp, e_lph ); + //MYATOMICADD(&data->E_Lp, e_lph ); //estrain(i) += e_lph; deahu2dbo = 2.*p_lp3*(vov3 - 3.); @@ -769,7 +472,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob type_j = atoms[j].type; bo_ij = &(bonds->select.bond_list[pj].bo_data); sbp_j = &(sbp[ type_j ]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]); sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* @@ -790,7 +493,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_ov = sum_ovun1 * CEover1; E_Ov [ i ] = e_ov; - //atomicAdd ( &data->E_Ov, e_ov ); + //MYATOMICADD( &data->E_Ov, e_ov ); CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); @@ -813,7 +516,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; E_Un [i] = e_un; - //atomicAdd ( &data->E_Un, e_un ); + //MYATOMICADD( &data->E_Un, e_un ); CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + p_ovun2 * e_un * exp_ovun2n); @@ -831,7 +534,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob j = pbond->nbr; type_j = atoms[j].type; bo_ij = &(pbond->bo_data); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]); bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st @@ -854,7 +557,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob } } -/////////////////////////////////////////////////////////// + GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, two_body_parameters *tbp, static_storage p_workspace, simulation_data *data, @@ -903,7 +606,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g // calculate the energy e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2; - //atomicAdd (&data->E_Lp, e_lp ); + //MYATOMICADD(&data->E_Lp, e_lp ); E_Lp [ i ] = e_lp; 
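        /* dElp below is d(e_lp)/d(Delta_lp): differentiating
           e_lp = p_lp2 * Delta_lp / (1 + EXP(-75 * Delta_lp))
           with respect to Delta_lp gives
           p_lp2 * inv_expvd2 + 75 * p_lp2 * Delta_lp * expvd2 * SQR(inv_expvd2). */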
dElp = p_lp2 * inv_expvd2 + @@ -921,7 +624,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g type_j = atoms[j].type; if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) { - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]); + twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ]); bo_ij = &( bonds->select.bond_list[pj].bo_data ); Di = workspace->Delta[i]; vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); @@ -930,7 +633,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g e_lph = p_lp3 * SQR(vov3-3.0); E_Lp [i] += e_lph; - //atomicAdd (&data->E_Lp, e_lph ); + //MYATOMICADD(&data->E_Lp, e_lph ); //estrain(i) += e_lph; deahu2dbo = 2.*p_lp3*(vov3 - 3.); @@ -943,7 +646,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g } } } -/////////////////////////////////////////////////////////// + GLOBAL void test_LonePair_Postprocess ( reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp, two_body_parameters *tbp, diff --git a/PuReMD-GPU/src/cuda_single_body_interactions.h b/PuReMD-GPU/src/cuda_single_body_interactions.h new file mode 100644 index 0000000000000000000000000000000000000000..3ecd4b9a68b174624ac1dcf56aae4cec360750be --- /dev/null +++ b/PuReMD-GPU/src/cuda_single_body_interactions.h @@ -0,0 +1,59 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_SINGLE_BODY_INTERACTIONS_H_ +#define __CUDA_SINGLE_BODY_INTERACTIONS_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters , + single_body_parameters *, two_body_parameters *, + static_storage , simulation_data *, + list , int , int ); + +GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *, global_parameters , + single_body_parameters *, two_body_parameters *, + static_storage , simulation_data *, + list , int , int, + real *, real *, real *); + +GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters , + single_body_parameters *, two_body_parameters *, + static_storage , simulation_data *, + list , int , int, + real *, real *, real *); + +GLOBAL void test_LonePair_Postprocess ( reax_atom *, global_parameters , + single_body_parameters *, two_body_parameters *, + static_storage , simulation_data *, + list , int , int ); + +#ifdef __cplusplus +} +#endif + + +#endif + diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/cuda_system_props.cu similarity index 52% rename from PuReMD-GPU/src/system_props.cu rename to PuReMD-GPU/src/cuda_system_props.cu index 3ec39134a3fdd647f8bb81b292d3184e51a6a2b0..7ff5c11fe6d644e6878cd5e25e67b82de39ad0f5 100644 --- a/PuReMD-GPU/src/system_props.cu +++ b/PuReMD-GPU/src/cuda_system_props.cu @@ -18,80 +18,29 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#include "system_props.h" +#include "cuda_system_props.h" + #include "box.h" #include "vector.h" -#include "cuda_utils.h" +#include "cuda_center_mass.h" #include "cuda_copy.h" -#include "reduction.h" -#include "center_mass.h" -#include "validation.h" - - -real Get_Time( ) -{ - struct timeval tim; - - gettimeofday(&tim, NULL ); - return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); -} - - -real Get_Timing_Info( real t_start ) -{ - struct timeval tim; - real t_end; +#include "cuda_utils.h" +#include "cuda_reduction.h" - gettimeofday(&tim, NULL ); - t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); - return (t_end - t_start); -} +GLOBAL void k_Compute_Total_Mass(single_body_parameters *, reax_atom *, real *, size_t ); +GLOBAL void k_Compute_Kinetic_Energy(single_body_parameters *, reax_atom *, unsigned int , real *); +GLOBAL void k_Kinetic_Energy_Reduction(simulation_data *, real *, int); -void Temperature_Control( control_params *control, simulation_data *data, - output_controls *out_control ) -{ - real tmp; - - if( control->T_mode == 1 ) { // step-wise temperature control - if( (data->step - data->prev_steps) % - ((int)(control->T_freq / control->dt)) == 0 ) { - if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) ) - control->T += control->T_rate; - else control->T = control->T_final; - } - } - else if( control->T_mode == 2 ) { // constant slope control - tmp = control->T_rate * control->dt / control->T_freq; - - if( fabs( control->T - control->T_final ) >= fabs( tmp ) ) - control->T += tmp; - } -} void prep_dev_system (reax_system *system) { //copy the system atoms to the device - Sync_Host_Device ( system, cudaMemcpyHostToDevice ); + Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice ); } -void Compute_Total_Mass( reax_system *system, simulation_data *data ) -{ - int i; - int blocks; - int block_size; - real *partial_sums = 0; - - data->M = 0; - - for( i = 0; i < system->N; 
i++ ) - data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass; - - data->inv_M = 1. / data->M; -} - void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data ) { real *partial_sums = (real *) scratch; @@ -100,7 +49,7 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data ) //cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0); cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH ); - Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> + k_Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> (system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N); cudaThreadSynchronize (); cudaCheckError (); @@ -133,158 +82,6 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data ) } -GLOBAL void Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) -{ - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - - if(i < n) - x = sbp [ atoms[ i ].type ].mass; - - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } -} - - -void Compute_Center_of_Mass( reax_system *system, simulation_data *data, - FILE *fout ) -{ - int i; - real m, xx, xy, xz, yy, yz, zz, det; - rvec tvec, diff; - rtensor mat, inv; - - int blocks; - int block_size; - rvec *l_xcm, *l_vcm, *l_amcm; - real t_start, t_end; - - rvec_MakeZero( data->xcm ); // position of CoM - rvec_MakeZero( data->vcm ); // velocity of CoM - rvec_MakeZero( data->amcm ); // angular momentum of CoM - rvec_MakeZero( data->avcm ); // angular velocity of CoM - - /* Compute the position, velocity and angular momentum about the CoM */ - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - - rvec_ScaledAdd( data->xcm, m, system->atoms[i].x ); - rvec_ScaledAdd( data->vcm, m, system->atoms[i].v ); - - rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v ); - rvec_ScaledAdd( data->amcm, m, tvec ); - - /*fprintf( fout,"%3d %g %g %g\n", - i+1, - system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); - fprintf( fout, "vcm: %g %g %g\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - */ - } - - rvec_Scale( data->xcm, data->inv_M, data->xcm ); - rvec_Scale( data->vcm, data->inv_M, data->vcm ); - - rvec_Cross( tvec, data->xcm, data->vcm ); - rvec_ScaledAdd( data->amcm, -data->M, tvec ); - - data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); - - /* Calculate and then invert the inertial tensor */ - xx = xy = xz = yy = yz = zz = 0; - - for( i = 0; i < system->N; ++i ) { - m = system->reaxprm.sbp[ system->atoms[i].type ].mass; - - rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); - xx += diff[0] * diff[0] * m; - xy += diff[0] * diff[1] * m; - xz += diff[0] * diff[2] * m; - yy += diff[1] * diff[1] * m; - yz += diff[1] * diff[2] * m; - zz += diff[2] * diff[2] * m; - } - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " xx: %f \n", xx); - fprintf (stderr, " xy: %f \n", xy); - fprintf (stderr, " xz: %f \n", xz); - fprintf (stderr, " yy: %f \n", yy); - fprintf (stderr, " yz: %f \n", yz); - fprintf (stderr, " zz: %f \n", zz); -#endif - - mat[0][0] = yy + zz; - mat[0][1] = mat[1][0] = -xy; - mat[0][2] = mat[2][0] = -xz; - 
mat[1][1] = xx + zz; - mat[2][1] = mat[1][2] = -yz; - mat[2][2] = xx + yy; - - /* invert the inertial tensor */ - det = ( mat[0][0] * mat[1][1] * mat[2][2] + - mat[0][1] * mat[1][2] * mat[2][0] + - mat[0][2] * mat[1][0] * mat[2][1] ) - - ( mat[0][0] * mat[1][2] * mat[2][1] + - mat[0][1] * mat[1][0] * mat[2][2] + - mat[0][2] * mat[1][1] * mat[2][0] ); - - inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1]; - inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2]; - inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1]; - inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2]; - inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0]; - inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2]; - inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1]; - inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1]; - inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; - - if( fabs(det) > ALMOST_ZERO ) - rtensor_Scale( inv, 1./det, inv ); - else - rtensor_MakeZero( inv ); - - /* Compute the angular velocity about the centre of mass */ - rtensor_MatVec( data->avcm, inv, data->amcm ); - data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); - -#if defined(DEBUG) - fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", - data->xcm[0], data->xcm[1], data->xcm[2] ); - fprintf( stderr, "vcm: %24.15e %24.15e %24.15e\n", - data->vcm[0], data->vcm[1], data->vcm[2] ); - fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", - data->amcm[0], data->amcm[1], data->amcm[2] ); - /* fprintf( fout, "mat: %f %f %f\n %f %f %f\n %f %f %f\n", - mat[0][0], mat[0][1], mat[0][2], - mat[1][0], mat[1][1], mat[1][2], - mat[2][0], mat[2][1], mat[2][2] ); - fprintf( fout, "inv: %g %g %g\n %g %g %g\n %g %g %g\n", - inv[0][0], inv[0][1], inv[0][2], - inv[1][0], inv[1][1], inv[1][2], - inv[2][0], inv[2][1], inv[2][2] ); - fflush( fout ); */ - fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n", - data->avcm[0], data->avcm[1], data->avcm[2] ); -#endif -} - - void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, FILE *fout ) { @@ -316,12 +113,12 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, l_vcm = r_scratch + (BLOCKS_POW_2 + 1); l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); - center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> + k_center_of_mass_blocks<<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> (system->reaxprm.d_sbp, system->d_atoms, l_xcm, l_vcm, l_amcm, system->N); cudaThreadSynchronize (); cudaCheckError (); - center_of_mass <<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> + k_center_of_mass<<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> (l_xcm, l_vcm, l_amcm, l_xcm + BLOCKS_POW_2, l_vcm + BLOCKS_POW_2, @@ -391,18 +188,18 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH ); local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1)); - compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> + k_compute_center_mass_sbp<<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> (system->reaxprm.d_sbp, system->d_atoms, partial_results, data->xcm[0], data->xcm[1], data->xcm[2], system->N); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize( ); + cudaCheckError( ); - compute_center_mass <<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> + k_compute_center_mass<<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) 
>>> (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize( ); + cudaCheckError( ); - copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__ ); #ifdef __BUILD_DEBUG__ if (check_zero (local_results[0],xx) || @@ -456,16 +253,19 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; if( fabs(det) > ALMOST_ZERO ) + { rtensor_Scale( inv, 1./det, inv ); + } else + { rtensor_MakeZero( inv ); + } /* Compute the angular velocity about the centre of mass */ rtensor_MatVec( data->avcm, inv, data->amcm ); data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); - //free the resources - free (local_results); + free( local_results ); #if defined(DEBUG) fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", @@ -489,34 +289,51 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, } - -void Compute_Kinetic_Energy( reax_system* system, simulation_data* data ) +void Cuda_Compute_Kinetic_Energy( reax_system *system, simulation_data *data ) { - int i; - rvec p; - real m; + real *results = (real *) scratch; + cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); + k_Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> + (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results); + cudaThreadSynchronize (); + cudaCheckError (); - data->E_Kin = 0.0; + k_Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> + ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2); + cudaThreadSynchronize (); + cudaCheckError (); +} - for (i=0; i < system->N; i++) { - m = system->reaxprm.sbp[system->atoms[i].type].mass; - rvec_Scale( p, m, system->atoms[i].v ); - data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v ); +GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) +{ + extern __shared__ real sdata[]; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + real x = 0; - /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n", - i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], - system->reaxprm.sbp[system->atoms[i].type].mass); */ - } + if(i < n) + x = sbp [ atoms[ i ].type ].mass; - data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); + sdata[threadIdx.x] = x; + __syncthreads(); + + for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) + { + if(threadIdx.x < offset) + { + sdata[threadIdx.x] += sdata[threadIdx.x + offset]; + } + __syncthreads(); + } - if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! 
*/ - data->therm.T = ALMOST_ZERO; + if(threadIdx.x == 0) + { + per_block_results[blockIdx.x] = sdata[0]; + } } -GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, +GLOBAL void k_Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, unsigned int N, real *output) { extern __shared__ real sh_ekin[]; @@ -547,8 +364,8 @@ GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atom } } -GLOBAL void Kinetic_Energy_Reduction (simulation_data *data, - real *input, int n) + +GLOBAL void k_Kinetic_Energy_Reduction (simulation_data *data, real *input, int n) { extern __shared__ real sdata[]; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -582,20 +399,6 @@ GLOBAL void Kinetic_Energy_Reduction (simulation_data *data, } } -void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data) -{ - real *results = (real *) scratch; - cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); - Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results); - cudaThreadSynchronize (); - cudaCheckError (); - - Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); -} /* GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, @@ -658,119 +461,3 @@ data->therm.T = ALMOST_ZERO; } } */ - - -/* IMPORTANT: This function assumes that current kinetic energy and - * the center of mass of the system is already computed before. - * - * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs - * to be added when there are long-range interactions or long-range - * corrections to short-range interactions present. - * We may want to add that for more accuracy. - */ -void Compute_Pressure_Isotropic( reax_system* system, control_params *control, - simulation_data* data, - output_controls *out_control ) -{ - int i; - reax_atom *p_atom; - rvec tx; - rvec tmp; - simulation_box *box = &(system->box); - - /* Calculate internal pressure */ - rvec_MakeZero( data->int_press ); - - // 0: both int and ext, 1: ext only, 2: int only - if( control->press_mode == 0 || control->press_mode == 2 ) { - for( i = 0; i < system->N; ++i ) { - p_atom = &( system->atoms[i] ); - - /* transform x into unitbox coordinates */ - Transform_to_UnitBox( p_atom->x, box, 1, tx ); - - /* this atom's contribution to internal pressure */ - rvec_Multiply( tmp, p_atom->f, tx ); - rvec_Add( data->int_press, tmp ); - - if( out_control->debug_level > 0 ) { - fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", - i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] ); - fprintf( out_control->prs, "%8.2f%8.2f%8.2f", - p_atom->f[0], p_atom->f[1], p_atom->f[2] ); - fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", - data->int_press[0],data->int_press[1],data->int_press[2]); - } - } - } - - /* kinetic contribution */ - data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. 
* box->volume * P_CONV ); - - /* Calculate total pressure in each direction */ - data->tot_press[0] = data->kin_press - - ((data->int_press[0] + data->ext_press[0]) / - (box->box_norms[1] * box->box_norms[2] * P_CONV)); - - data->tot_press[1] = data->kin_press - - ((data->int_press[1] + data->ext_press[1])/ - (box->box_norms[0] * box->box_norms[2] * P_CONV)); - - data->tot_press[2] = data->kin_press - - ((data->int_press[2] + data->ext_press[2])/ - (box->box_norms[0] * box->box_norms[1] * P_CONV)); - - /* Average pressure for the whole box */ - data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3; -} - - -void Compute_Pressure_Isotropic_Klein( reax_system* system, - simulation_data* data ) -{ - int i; - reax_atom *p_atom; - rvec dx; - - // IMPORTANT: This function assumes that current kinetic energy and - // the center of mass of the system is already computed before. - data->iso_bar.P = 2.0 * data->E_Kin; - - for( i = 0; i < system->N; ++i ) - { - p_atom = &( system->atoms[i] ); - rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm); - data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) ); - } - - data->iso_bar.P /= (3.0 * system->box.volume); - - // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs - // to be added when there are long-range interactions or long-range - // corrections to short-range interactions present. - // We may want to add that for more accuracy. -} - - -void Compute_Pressure( reax_system* system, simulation_data* data, - static_storage *workspace ) -{ - int i; - reax_atom *p_atom; - rtensor temp; - - rtensor_MakeZero( data->flex_bar.P ); - - for( i = 0; i < system->N; ++i ) { - p_atom = &( system->atoms[i] ); - // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx ); - rvec_OuterProduct( temp, p_atom->v, p_atom->v ); - rtensor_ScaledAdd( data->flex_bar.P, - system->reaxprm.sbp[ p_atom->type ].mass, temp ); - // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); - rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); - } - - rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P ); - data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0; -} diff --git a/PuReMD-GPU/src/cuda_system_props.h b/PuReMD-GPU/src/cuda_system_props.h new file mode 100644 index 0000000000000000000000000000000000000000..026999e9e7bbc6ecc70b0945e7a30bd853a6cbe0 --- /dev/null +++ b/PuReMD-GPU/src/cuda_system_props.h @@ -0,0 +1,42 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#ifndef __CUDA_SYSTEM_PROP_H_ +#define __CUDA_SYSTEM_PROP_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void prep_dev_system (reax_system *system); + +void Cuda_Compute_Total_Mass( reax_system*, simulation_data* ); +void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); +void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.cu b/PuReMD-GPU/src/cuda_three_body_interactions.cu new file mode 100644 index 0000000000000000000000000000000000000000..038b88402b49e516ad4594de520a09464b4c7c8e --- /dev/null +++ b/PuReMD-GPU/src/cuda_three_body_interactions.cu @@ -0,0 +1,1636 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_three_body_interactions.h" + +#include "bond_orders.h" +#include "list.h" +#include "lookup.h" +#include "vector.h" +#include "index_utils.h" + +#include "cuda_helpers.h" + + +/* calculates the theta angle between i-j-k */ +DEVICE void d_Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, + real *theta, real *cos_theta ) +{ + (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk ); + if( *cos_theta > 1. ) *cos_theta = 1.0; + if( *cos_theta < -1. ) *cos_theta = -1.0; + + (*theta) = ACOS( *cos_theta ); +} + + +/* calculates the derivative of the cosine of the angle between i-j-k */ +DEVICE void d_Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, + rvec* dcos_theta_di, rvec* dcos_theta_dj, + rvec* dcos_theta_dk ) +{ + int t; + real sqr_d_ji = SQR(d_ji); + real sqr_d_jk = SQR(d_jk); + real inv_dists = 1.0 / (d_ji * d_jk); + real inv_dists3 = POW( inv_dists, 3 ); + real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 ); + real Cdot_inv3 = dot_dvecs * inv_dists3; + + for( t = 0; t < 3; ++t ) { + (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - + Cdot_inv3 * sqr_d_jk * dvec_ji[t]; + + (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists + + Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] ); + + (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - + Cdot_inv3 * sqr_d_ji * dvec_jk[t]; + } + + /*fprintf( stderr, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + dvec_jk[t] * inv_dists*/ +} + + +/* this is a 3-body interaction in which the main role is + played by j which sits in the middle of the other two. 
*/ +GLOBAL void k_Three_Body_Interactions( reax_atom *atoms, + single_body_parameters *sbp, + three_body_header *d_thbp, + global_parameters g_params, + control_params *control, + simulation_data *data, + static_storage p_workspace, + list p_bonds, list p_thb_intrs, + int N, int num_atom_types, + real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press ) +{ + int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j, start_pk, end_pk; + int flag, cnt, num_thb_intrs; + + real temp, temp_bo_jt, pBOjt7; + real p_val1, p_val2, p_val3, p_val4, p_val5; + real p_val6, p_val7, p_val8, p_val9, p_val10; + real p_pen1, p_pen2, p_pen3, p_pen4; + real p_coa1, p_coa2, p_coa3, p_coa4; + real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; + real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; + real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; + real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; + real CEpen1, CEpen2, CEpen3; + real e_ang, e_coa, e_pen; + real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; + real Cf7ij, Cf7jk, Cf8j, Cf9j; + real f7_ij, f7_jk, f8_Dj, f9_Dj; + real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; + real r_ij, r_jk; + real BOA_ij, BOA_jk; + real vlpadj; + rvec force, ext_press; + // rtensor temp_rtensor, total_rtensor; + real *total_bo; + three_body_header *thbh; + three_body_parameters *thbp; + three_body_interaction_data *p_ijk, *p_kji; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; + list *bonds, *thb_intrs; + bond_data *bond_list; + three_body_interaction_data *thb_list; + static_storage *workspace = &p_workspace; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + + total_bo = workspace->total_bond_order; + bonds = &p_bonds; + bond_list = bonds->select.bond_list; + thb_intrs = &p_thb_intrs; + thb_list = thb_intrs->select.three_body_list; + + /* global parameters used in these calculations */ + p_val6 = g_params.l[14]; + p_val8 = g_params.l[33]; + p_val9 = g_params.l[16]; + p_val10 = g_params.l[17]; + + //TODO check this, initially this was zero, + // I am changing it to the starting index for this atom. 
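+    // (In the serial code num_thb_intrs was a single running counter shared by
+    // all atoms; here each bond pi owns a pre-counted slice of thb_intrs, so a
+    // thread can start writing at Start_Index(pi, thb_intrs) below without
+    // synchronizing with the threads handling other bonds.)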
+ //num_thb_intrs = j * MAX_TH_BODY; + + //for( j = 0; j < system->N; ++j ) { + // fprintf( out_control->eval, "j: %d\n", j ); + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + p_val3 = sbp[ type_j ].p_val3; + p_val5 = sbp[ type_j ].p_val5; + + SBOp = 0, prod_SBO = 1; + for( t = start_j; t < end_j; ++t ) { + bo_jt = &(bond_list[t].bo_data); + SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); + temp = SQR( bo_jt->BO ); + temp *= temp; + temp *= temp; + prod_SBO *= EXP( -temp ); + } + + /* modifications to match Adri's code - 09/01/09 */ + if( workspace->vlpex[j] >= 0 ){ + vlpadj = 0; + dSBO2 = prod_SBO - 1; + } + else{ + vlpadj = workspace->nlp[j]; + dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); + } + + SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); + dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); + + if( SBO <= 0 ) + SBO2 = 0, CSBO2 = 0; + else if( SBO > 0 && SBO <= 1 ) { + SBO2 = POW( SBO, p_val9 ); + CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); + } + else if( SBO > 1 && SBO < 2 ) { + SBO2 = 2 - POW( 2-SBO, p_val9 ); + CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); + } + else + SBO2 = 2, CSBO2 = 0; + + expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); + + /* unlike 2-body intrs where we enforce i<j, we cannot put any such + restrictions here. such a restriction would prevent us from producing + all 4-body intrs correctly */ + for( pi = start_j; pi < end_j; ++pi ) { + + //TODO + //num_thb_intrs = pi * MAX_THREE_BODIES; + //TODO + + //Set_Start_Index( pi, num_thb_intrs, thb_intrs ); + num_thb_intrs = Start_Index (pi, thb_intrs); + + pbond_ij = &(bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; + + + if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = atoms[i].type; + // fprintf( out_control->eval, "i: %d\n", i ); + + + /* first copy 3-body intrs from previously computed ones where i>k. +IMPORTANT: if it is less costly to compute theta and its +derivative, we should definitely re-compute them, +instead of copying! 
+in the second for-loop below, we compute only new 3-body intrs +where i < k */ + for( pk = start_j; pk < pi; ++pk ) { + // fprintf( out_control->eval, "pk: %d\n", pk ); + start_pk = Start_Index( pk, thb_intrs ); + end_pk = End_Index( pk, thb_intrs ); + + for( t = start_pk; t < end_pk; ++t ) + if( thb_list[t].thb == i ) { + p_ijk = &(thb_list[num_thb_intrs]); + p_kji = &(thb_list[t]); + + p_ijk->thb = bond_list[pk].nbr; + p_ijk->pthb = pk; + p_ijk->theta = p_kji->theta; + rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); + rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); + rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); + + ++num_thb_intrs; + break; + } + } + + + /* and this is the second for loop mentioned above */ + for( pk = pi+1; pk < end_j; ++pk ) { + pbond_jk = &(bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; + k = pbond_jk->nbr; + type_k = atoms[k].type; + p_ijk = &( thb_list[num_thb_intrs] ); + + //CHANGE ORIGINAL + if (BOA_jk <= 0) continue; + //CHANGE ORIGINAL + + d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &theta, &cos_theta ); + + d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &(p_ijk->dcos_di), &(p_ijk->dcos_dj), + &(p_ijk->dcos_dk) ); + + p_ijk->thb = k; + p_ijk->pthb = pk; + p_ijk->theta = theta; + + sin_theta = SIN( theta ); + if( sin_theta < 1.0e-5 ) + sin_theta = 1.0e-5; + + ++num_thb_intrs; + + + if( BOA_jk > 0.0 && + (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { + r_jk = pbond_jk->d; + thbh = &( d_thbp[ index_thbp(type_i,type_j,type_k,num_atom_types) ] ); + flag = 0; + + /* if( workspace->orig_id[i] < workspace->orig_id[k] ) + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); + else + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[k], workspace->orig_id[j], + workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ + + //TODO: + //pbond_jk->scratch = thbh->cnt; + + for( cnt = 0; cnt < thbh->cnt; ++cnt ) { + // fprintf( out_control->eval, + // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); + + if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { + thbp = &( thbh->prm[cnt] ); + + /* ANGLE ENERGY */ + p_val1 = thbp->p_val1; + p_val2 = thbp->p_val2; + p_val4 = thbp->p_val4; + p_val7 = thbp->p_val7; + theta_00 = thbp->theta_00; + + exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); + f7_ij = 1.0 - exp3ij; + Cf7ij = p_val3 * p_val4 * + POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; + + exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); + f7_jk = 1.0 - exp3jk; + Cf7jk = p_val3 * p_val4 * + POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; + + expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); + trm8 = 1.0 + expval6 + expval7; + f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); + Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * + (p_val6 * expval6 * trm8 - + (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); + + theta_0 = 180.0 - + theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); + theta_0 = DEG2RAD( theta_0 ); + + expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); + if( p_val1 >= 0 ) + expval12theta = p_val1 * (1.0 - expval2theta); + else // To avoid linear Me-H-Me angles (6/6/06) + expval12theta = p_val1 * -expval2theta; + + CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; + CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; + CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; + CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * + 
expval2theta * (theta_0 - theta); + + Ctheta_0 = p_val10 * DEG2RAD(theta_00) * + exp( -p_val10 * (2.0 - SBO2) ); + + CEval5 = -CEval4 * Ctheta_0 * CSBO2; + CEval6 = CEval5 * dSBO1; + CEval7 = CEval5 * dSBO2; + CEval8 = -CEval4 / sin_theta; + + e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; + //PERFORMANCE IMPACT + //MYATOMICADD(&data->E_Ang, e_ang); + E_Ang [j] += e_ang; + /* END ANGLE ENERGY*/ + + + /* PENALTY ENERGY */ + p_pen1 = thbp->p_pen1; + p_pen2 = g_params.l[19]; + p_pen3 = g_params.l[20]; + p_pen4 = g_params.l[21]; + + exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); + exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); + exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); + exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); + trm_pen34 = 1.0 + exp_pen3 + exp_pen4; + f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; + Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - + (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + + p_pen4 * exp_pen4 )) / + SQR( trm_pen34 ); + + e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; + //PERFORMANCE IMPACT + //MYATOMICADD(&data->E_Pen, e_pen); + E_Pen [j] += e_pen; + + + CEpen1 = e_pen * Cf9j / f9_Dj; + temp = -2.0 * p_pen2 * e_pen; + CEpen2 = temp * (BOA_ij - 2.0); + CEpen3 = temp * (BOA_jk - 2.0); + /* END PENALTY ENERGY */ + + + /* COALITION ENERGY */ + p_coa1 = thbp->p_coa1; + p_coa2 = g_params.l[2]; + p_coa3 = g_params.l[38]; + p_coa4 = g_params.l[30]; + + exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); + e_coa = + p_coa1 / (1. + exp_coa2) * + EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * + EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * + EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * + EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); + + //PERFORMANCE IMPACT + //MYATOMICADD(&data->E_Coa, e_coa); + E_Coa [j] += e_coa; + + CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; + CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; + CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); + CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; + CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; + /* END COALITION ENERGY */ + + /* FORCES */ + /* + MYATOMICADD(&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ); + MYATOMICADD(&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ); + MYATOMICADD(&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) ); + MYATOMICADD(&workspace->CdDelta[i], CEcoa4 ); + MYATOMICADD(&workspace->CdDelta[k], CEcoa5 ); + */ + + bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ; + bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ; + workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ; + //MYATOMICADD(&workspace->CdDelta[i], CEcoa4 ); + pbond_ij->CdDelta_ij += CEcoa4 ; + //MYATOMICADD(&workspace->CdDelta[k], CEcoa5 ); + pbond_jk->CdDelta_ij += CEcoa5; + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + // fprintf( out_control->eval, "%6d%12.8f\n", + // workspace->orig_id[ bond_list[t].nbr ], + // (CEval6 * pBOjt7) ); + + /* + MYATOMICADD(&bo_jt->Cdbo, (CEval6 * pBOjt7) ); + MYATOMICADD(&bo_jt->Cdbopi, CEval5 ); + MYATOMICADD(&bo_jt->Cdbopi2, CEval5 ); + */ + bo_jt->Cdbo += (CEval6 * pBOjt7) ; + bo_jt->Cdbopi += CEval5 ; + bo_jt->Cdbopi2 += CEval5 ; + } + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + /* + atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di ); + atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk 
); + */ + rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk ); + + + } + else { + /* terms not related to bond order derivatives + are added directly into + forces and pressure vector/tensor */ + rvec_Scale( force, CEval8, p_ijk->dcos_di ); + //atomic_rvecAdd( atoms[i].f, force ); + rvec_Add( pbond_ij->f, force ); + + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //atomic_rvecAdd( data->ext_press, ext_press ); + rvec_Add( aux_ext_press [j], ext_press ); + + //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); + + rvec_Scale( force, CEval8, p_ijk->dcos_dk ); + //atomic_rvecAdd( atoms[k].f, force ); + rvec_Add( pbond_jk->f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + //atomic_rvecAdd( data->ext_press, ext_press ); + rvec_Add( aux_ext_press [j], ext_press ); + + + /* This part is for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_di, system->atoms[i].x ); + rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dj, system->atoms[j].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dk, system->atoms[k].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + if( pbond_ij->imaginary || pbond_jk->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, + -1.0, total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } + +#ifdef TEST_ENERGY + //TODO -- check this + // fprintf( out_control->eval, + //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", + // "%6d%6d%6d%23.15e%23.15e%23.15e\n", + // i+1, j+1, k+1, + //workspace->orig_id[i]+1, + //workspace->orig_id[j]+1, + //workspace->orig_id[k]+1, + //workspace->Delta_boc[j], + // RAD2DEG(theta), /*BOA_ij, BOA_jk, */ + // e_ang, data->E_Ang ); + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + p_val3, p_val4, BOA_ij, BOA_jk ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + f7_ij, f7_jk, f8_Dj, expval12theta ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e\n", + CEval1, CEval2, CEval3, CEval4, CEval5 + //CEval6, CEval7, CEval8 );*/ + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + -p_ijk->dcos_di[0]/sin_theta, + -p_ijk->dcos_di[1]/sin_theta, + -p_ijk->dcos_di[2]/sin_theta, + -p_ijk->dcos_dj[0]/sin_theta, + -p_ijk->dcos_dj[1]/sin_theta, + -p_ijk->dcos_dj[2]/sin_theta, + -p_ijk->dcos_dk[0]/sin_theta, + -p_ijk->dcos_dk[1]/sin_theta, + -p_ijk->dcos_dk[2]/sin_theta );*/ + + /* fprintf( out_control->epen, + "%23.15e%23.15e%23.15e\n", + CEpen1, CEpen2, CEpen3 ); + fprintf( out_control->epen, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], RAD2DEG(theta), + BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ + + // fprintf( out_control->ecoa, + // "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + // workspace->orig_id[i], + // workspace->orig_id[j], + // workspace->orig_id[k], + // RAD2DEG(theta), BOA_ij, BOA_jk, + // e_coa, data->E_Coa ); +#endif + +#ifdef TEST_FORCES /* angle forces */ + //TODO -- check this + /* + Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); + Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); + Add_dDelta( system, lists, + j, CEval3 + CEval7, 
workspace->f_ang ); + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + Add_dBO( system, lists, j, t, pBOjt7 * CEval6, + workspace->f_ang ); + Add_dBOpinpi2( system, lists, j, t, + CEval5, CEval5, + workspace->f_ang, workspace->f_ang ); + } + + rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); + // end angle forces + + // penalty forces + Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); + Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); + Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); + // end penalty forces + + // coalition forces + Add_dBO( system, lists, + j, pi, CEcoa1-CEcoa4, workspace->f_coa ); + Add_dBO( system, lists, + j, pk, CEcoa2-CEcoa5, workspace->f_coa ); + Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); + Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); + Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); + // end coalition forces + + */ +#endif + } + } + } + } + } + + Set_End_Index(pi, num_thb_intrs, thb_intrs ); + } + // } // end of the main for loop here + + + //TODO - to be done on the CPU + /* + + if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { + workspace->realloc.num_3body = num_thb_intrs; + if( num_thb_intrs > thb_intrs->num_intrs ) { + fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", + data->step, num_thb_intrs, thb_intrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + } + */ + + //fprintf( stderr,"%d: Number of angle interactions: %d\n", + // data->step, num_thb_intrs ); + +#ifdef TEST_ENERGY + /* + fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); + + fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", + data->E_Ang, data->E_Pen, data->E_Coa ); + + fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); + */ +#endif +} + + +GLOBAL void k_Three_Body_Interactions_results ( reax_atom *atoms, control_params *control, + static_storage p_workspace, + list p_bonds, int N ) +{ + int i, pj; + + bond_data *pbond; + bond_data *sym_index_bond; + list *bonds = &p_bonds; + static_storage *workspace = &p_workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= N) return; + + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + + pbond = &(bonds->select.bond_list[pj]); + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + + workspace->CdDelta [i] += sym_index_bond->CdDelta_ij; + + rvec_Add (atoms[i].f, sym_index_bond->f ); + } +} + + +/* this is a 3-body interaction in which the main role is + played by j which sits in the middle of the other two. 
*/ +GLOBAL void k_Three_Body_Estimate ( reax_atom *atoms, + control_params *control, + list p_bonds, int N, + int *count) +{ + int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j ; + int flag, cnt, num_thb_intrs; + + real r_ij, r_jk; + real BOA_ij, BOA_jk; + list *bonds; + + bond_order_data *bo_ij, *bo_jk, *bo_jt; + bond_data *bond_list; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + bonds = &p_bonds; + bond_list = bonds->select.bond_list; + + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + + for( pi = start_j; pi < end_j; ++pi ) { + + num_thb_intrs = 0; + count [pi] = 0; + + pbond_ij = &(bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; + + if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = atoms[i].type; + + /* + for( pk = start_j; pk < pi; ++pk ) { + start_pk = Start_Index( pk, thb_intrs ); + end_pk = End_Index( pk, thb_intrs ); + + for( t = start_pk; t < end_pk; ++t ) + if( thb_list[t].thb == i ) { + + ++num_thb_intrs; + break; + } + } + */ + + /* and this is the second for loop mentioned above */ + for( pk = start_j; pk < end_j; ++pk ) { + if (pk == pi) continue; + + pbond_jk = &(bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; + + if (BOA_jk <= 0) continue; + + ++num_thb_intrs; + } + } + + count [pi] = num_thb_intrs; + } +} + + +GLOBAL void k_Hydrogen_Bonds(reax_atom *atoms, + single_body_parameters *sbp, + hbond_parameters *d_hbp, + control_params *control, + simulation_data *data, + static_storage p_workspace, + list p_bonds, list p_hbonds, + int N, int num_atom_types, + real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) +{ + extern __shared__ real t_hb[]; + extern __shared__ real t_f[]; + //extern __shared__ rvec t_cdbo[]; + //extern __shared__ rvec t_hf []; + + real *sh_hb = t_hb; + rvec *sh_atomf = (rvec *)(t_hb + blockDim.x); + //real *sh_cdbo = t_hb + blockDim.x; + //rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); + + int i, j, k, pi, pk, itr, top; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int num_hb_intrs = 0; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + ivec rel_jk; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list, *hbond_jk; + static_storage *workspace = &p_workspace; + + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + + //j = blockIdx.x; + + bonds = &p_bonds; + bond_list = bonds->select.bond_list; + + hbonds = &p_hbonds; + hbond_list = hbonds->select.hbond_list; + + // loops below discover the Hydrogen bonds between i-j-k triplets. + // here j is H atom and there has to be some bond between i and j. + // Hydrogen bond is between j and k. + // so in this function i->X, j->H, k->Z when we map + // variables onto the ones in the handout. 
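+    // sh_hb and sh_atomf hold per-thread partial sums in shared memory: each
+    // thread accumulates its own hydrogen bond energy (and force) terms there
+    // rather than issuing a global MYATOMICADD per interaction; the block-wide
+    // reduction of these buffers presumably happens after the pair loops
+    // (not shown here).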
+ + //for( j = 0; j < system->N; ++j ) + sh_hb [threadIdx.x] = 0; + rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + + if( sbp[atoms[j].type].p_hbond==1) {// j must be H + //set j's variables + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + + if( sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) + hblist[top++] = pi; + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( pk = hb_start_j; pk < hb_end_j; ++pk ) + //pk = hb_start_j + threadIdx.x; + //while (pk < hb_end_j) + { + // set k's varibles + //TODO + hbond_jk = &( hbond_list[pk] ); + //TODO + k = hbond_list[pk].nbr; + type_k = atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + + //TODO Double check this Hydrogen Bonds fix + //rvec_MakeZero ( nbr_jk->h_f ); + rvec_MakeZero ( hbond_jk->h_f ); + //TODO Double check this Hydrogen Bonds fix + + //sh_hb [threadIdx.x] = 0; + + + //itr = threadIdx.x; + for( itr=0; itr < top; ++itr ) { + //while (itr < top) { + pi = hblist[itr]; + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + + //TODO + //rvec_MakeZero (sh_hf [threadIdx.x]); + //sh_cdbo [threadIdx.x] = 0; + + //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + + + if( i != k ) { + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); + ++num_hb_intrs; + + d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + // the derivative of cos(theta) + d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + // hydrogen bond energy + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + //PERFORMANCE IMPACT + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + //MYATOMICADD( &data->E_HB, e_hb ); + //E_HB [j] += e_hb; + sh_hb [threadIdx.x] += e_hb; + + CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + + 1.0 / hbp->r0_hb); + + //this is the problem here + //TODO + // hydrogen bond forces + bo_ij->Cdbo += CEhb1; // dbo term + //sh_cdbo[threadIdx.x] += CEhb1; + //TODO + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + + //PERFORMANCE IMPACT + /* + atomic_rvecScaledAdd( atoms[i].f, + +CEhb2, dcos_theta_di ); //dcos terms + atomic_rvecScaledAdd( atoms[j].f, + +CEhb2, dcos_theta_dj ); + atomic_rvecScaledAdd( atoms[k].f, + +CEhb2, dcos_theta_dk ); + //dr terms + atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); + */ + + //PERFORMANCE IMPACT + rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms + //rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms + + //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); 
+ rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); + + //TODO you forgot here + //TODO Hydrogen bonds fix. -- BE VERY CAREFUL ***** + rvec_ScaledAdd( hbond_jk->h_f, + +CEhb2, dcos_theta_dk ); + + //rvec_ScaledAdd( nbr_jk->h_f, + // +CEhb2, dcos_theta_dk ); + + //dr terms + //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); + + //atoms_f [j] ++; + + //TODO you forgot + rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); + //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); + } + else + { + // for pressure coupling, terms that are not related + // to bond order derivatives are added directly into + // pressure vector/tensor + rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + rvec_Add( pbond_ij->h_f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); + + rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + + ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + rvec_Scale( force, +CEhb2, dcos_theta_dk ); + + //rvec_Add( nbr_jk->h_f, force ); + rvec_Add( hbond_jk->h_f, force ); + + rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + //dr terms + rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + + rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + rvec_Add( hbond_jk->h_f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + } + + //do the reduction for the bond_ij here + /* + if (threadIdx.x < 16){ + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (threadIdx.x < 8){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (threadIdx.x < 4){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (threadIdx.x < 2){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (threadIdx.x < 1){ + //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); + + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + } + if (threadIdx.x == 0){ + //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); + + E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } + */ + + + } // i != k if statement + + + //itr += blockDim.x; + + } //itr for statement + + /* + __syncthreads (); + + for (int x = 1; x < blockDim.x; x++) + sh_hb [0] += sh_hb [x]; + + E_HB [j] += sh_hb[0]; + if (threadIdx.x < 16) sh_hb 
[threadIdx.x] += sh_hb [threadIdx.x + 16]; + if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x]; + */ + + + //pk += blockDim.x; + + } // pk for statement + } // main if statment + + //do the reduction for the bond_ij here + /* + if (threadIdx.x < 16){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (threadIdx.x < 8){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (threadIdx.x < 4){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (threadIdx.x < 2){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (threadIdx.x < 1){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + } + if (threadIdx.x == 0){ + E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } + */ + + E_HB [j] += sh_hb [threadIdx.x]; + rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + + //rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); +} + + +DEVICE void warpReduce(volatile real* sdata, int tid) +{ + if (tid < 16) sdata[tid] += sdata[tid + 16]; + if (tid < 8) sdata[tid] += sdata[tid + 8]; + if (tid < 4) sdata[tid] += sdata[tid + 4]; + if (tid < 2) sdata[tid] += sdata[tid + 2]; + if (tid < 1) sdata[tid] += sdata[tid + 1]; +} + + + + +GLOBAL void k_Hydrogen_Bonds_HB(reax_atom *atoms, + single_body_parameters *sbp, + hbond_parameters *d_hbp, + control_params *control, + simulation_data *data, + static_storage p_workspace, + list p_bonds, list p_hbonds, + int N, int num_atom_types, + real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) +{ + extern __shared__ real t_hb[]; + extern __shared__ rvec t__f[]; + extern __shared__ rvec t_cdbo[]; + extern __shared__ rvec t_hf []; + + real *sh_hb = t_hb; + real *sh_cdbo = t_hb + blockDim.x; + rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x); + rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); + + int __THREADS_PER_ATOM__ = HBONDS_THREADS_PER_ATOM; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = thread_id / __THREADS_PER_ATOM__; + int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); + int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; + + if (warp_id >= N ) return; + + + int i, j, k, pi, pk, itr, top; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int num_hb_intrs = 0; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + ivec rel_jk; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list, *hbond_jk; + static_storage *workspace = &p_workspace; + + /* + j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= N) return; + */ + + // j = blockIdx.x; + + j = warp_id; + + bonds = &p_bonds; + bond_list = 
bonds->select.bond_list; + + hbonds = &p_hbonds; + hbond_list = hbonds->select.hbond_list; + + // loops below discover the Hydrogen bonds between i-j-k triplets. + // here j is H atom and there has to be some bond between i and j. + // Hydrogen bond is between j and k. + // so in this function i->X, j->H, k->Z when we map + // variables onto the ones in the handout. + + //for( j = 0; j < system->N; ++j ) + sh_hb [threadIdx.x] = 0; + rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + + if( sbp[atoms[j].type].p_hbond==1) {// j must be H + //set j's variables + type_j = atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + + if( sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) { + hblist[top++] = pi; + } + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( itr=0; itr < top; ++itr ) { + pi = hblist[itr]; + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + + //TODO + rvec_MakeZero (sh_hf [threadIdx.x]); + sh_cdbo [threadIdx.x] = 0; + + + //for( pk = hb_start_j; pk < hb_end_j; ++pk ) + int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 0 : 1); + int count = 0; + //jpk = hb_start_j + threadIdx.x; + pk = hb_start_j + lane_id; + //while (pk < hb_end_j) + while (count < loopcount) + { + + if (pk < hb_end_j) + { + // set k's varibles + //TODO + hbond_jk = &( hbond_list[pk] ); + //TODO + k = hbond_list[pk].nbr; + type_k = atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + } + else k = -1; + + //TODO Double check this Hydrogen Bonds fix + //rvec_MakeZero ( nbr_jk->h_f ); + //rvec_MakeZero ( hbond_jk->h_f ); + //TODO Double check this Hydrogen Bonds fix + + //sh_hb [threadIdx.x] = 0; + //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); + //__syncthreads (); + + + if(( i != k ) && (k != -1)) { + bo_ij = &(pbond_ij->bo_data); + type_i = atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); + ++num_hb_intrs; + + d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + // the derivative of cos(theta) + d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + // hydrogen bond energy + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + //PERFORMANCE IMPACT + e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + //MYATOMICADD( &data->E_HB, e_hb ); + //E_HB [j] += e_hb; + sh_hb [threadIdx.x] += e_hb; + + CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + + 1.0 / hbp->r0_hb); + + //this is the problem here + //TODO + // hydrogen bond forces + //bo_ij->Cdbo += CEhb1; // dbo term + sh_cdbo[threadIdx.x] += CEhb1; + //TODO + //warpReduce (sh_cdbo, threadIdx.x); + //if (threadIdx.x == 0) + // 
bo_ij->Cdbo += sh_cdbo [0]; + + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { + + //PERFORMANCE IMPACT + /* + atomic_rvecScaledAdd( atoms[i].f, + +CEhb2, dcos_theta_di ); //dcos terms + atomic_rvecScaledAdd( atoms[j].f, + +CEhb2, dcos_theta_dj ); + atomic_rvecScaledAdd( atoms[k].f, + +CEhb2, dcos_theta_dk ); + //dr terms + atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); + */ + + //PERFORMANCE IMPACT + //rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms + rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms + + //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); + + + //TODO you forgot here + //TODO Hydrogen bonds fix. -- BE VERY CAREFUL ***** + rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk ); + + //rvec_ScaledAdd( nbr_jk->h_f, + // +CEhb2, dcos_theta_dk ); + + //dr terms + //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); + + //TODO you forgot + rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); + //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); + } + else + { + // for pressure coupling, terms that are not related + // to bond order derivatives are added directly into + // pressure vector/tensor + //rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + //rvec_Add( pbond_ij->h_f, force ); + //rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); + + //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); + + //ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + //rvec_Scale( force, +CEhb2, dcos_theta_dk ); + + //rvec_Add( nbr_jk->h_f, force ); + //rvec_Add( hbond_jk->h_f, force ); + + //rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + //dr terms + //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); + + //rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + //rvec_Add( hbond_jk->h_f, force ); + //rvec_iMultiply( ext_press, rel_jk, force ); + //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); + //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); + + } + + } // i != k if statement + + pk += __THREADS_PER_ATOM__; + count ++; + + } // pk for statement + + //__syncthreads (); + + //at this point done with one bond.... 
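+
+            //the reduction that follows is warp-synchronous: the
+            //HBONDS_THREADS_PER_ATOM lanes that cooperated on this bond
+            //fold their partial Cdbo and h_f sums together in halving
+            //steps with no __syncthreads, which is only safe while all
+            //lanes of one atom live in the same warp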
+ //do the reduction now + //if (threadIdx.x == 0){ + if (lane_id < 16) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); + } + if (lane_id < 8) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); + } + if (lane_id < 4) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); + } + if (lane_id < 2) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); + } + if (lane_id < 1) { + sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; + rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); + + bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); + } + /* + if (lane_id == 0){ + for (i = 1; i < 32; i++) + { + //sh_cdbo [threadIdx.x] += sh_cdbo [i]; + //rvec_Add (sh_hf [threadIdx.x], sh_hf [i]); + + sh_cdbo [lane_id] += sh_cdbo [lane_id + i]; + rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]); + } + + //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; + //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); + + bo_ij->Cdbo += sh_cdbo [lane_id]; + rvec_Add (pbond_ij->h_f, sh_hf [lane_id]); + } + */ + + } //itr for statement + + //__syncthreads (); + } // main if statment + + //__syncthreads (); + + //do the reduction for the bond_ij here + if (lane_id < 16){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); + } + if (lane_id < 8){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); + } + if (lane_id < 4){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); + } + if (lane_id < 2){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); + } + if (lane_id < 1){ + sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; + rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); + + E_HB [j] += sh_hb [threadIdx.x]; + rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + } + /* + if (lane == 0){ + //E_HB [j] += sh_hb [threadIdx.x]; + rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); + } + */ + //if (threadIdx.x == 0){ + /* + if (lane_id == 0){ + for (i = 1; i < 32; i++) + { + //sh_hb [threadIdx.x] += sh_hb [i]; + //rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]); + sh_hb [lane_id] += sh_hb [lane_id + i]; + rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]); + } + + //E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); + + E_HB [j] += sh_hb [lane_id]; + rvec_Add (atoms[j].f, sh_atomf [lane_id]); + //rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]); + } + */ + + //E_HB [j] += sh_hb [threadIdx.x]; + //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); +} + + +GLOBAL void k_Hydrogen_Bonds_Postprocess(reax_atom *atoms, + single_body_parameters *sbp, + static_storage p_workspace, + list p_bonds, list p_hbonds, list p_far_nbrs, int N, + real *e_hb) +{ + + int i, pj, hj, nbr, k, j; + int start, end; + + bond_data *pbond; + bond_data *sym_index_bond; + far_neighbor_data *nbr_pj, *sym_index_nbr; + + list *bonds = &p_bonds; + list *far_nbrs = &p_far_nbrs; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if ( i >= N) return; + + // For processing ij information + start = Start_Index(i, bonds); + end = End_Index(i, bonds); + + 
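// fold back the partial forces that the hydrogen-bond kernels stored on
+    // each bond's symmetric copy: pbond->sym_index points at the j-i image
+    // of bond i-j, so per-atom totals can be built here without atomic
+    // updates on atoms[i].f
+    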
//rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f); + + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + + pbond = &(bonds->select.bond_list[pj]); + sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); + + rvec_Add (atoms[i].f, sym_index_bond->h_f ); + } + + /* + for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++) + { + // check if the neighbor is of h_type + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + + sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); + rvec_Add (atoms[i].f, sym_index_nbr->h_f ); + } + */ + + // if (workspace->hbond_index [j] != -1) + // { + // hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + // hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + // for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) + // { + // h_bond_data = &( hbonds->select.hbond_list [hj] ); + // nbr = h_bond_data->nbr; + + // if (nbr == i) { + // rvec_Add (atoms[i].f, h_bond_data->h_f ); + // } + // } + // } +} + + +GLOBAL void k_Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, + single_body_parameters *sbp, + static_storage p_workspace, + list p_bonds, list p_hbonds, list p_far_nbrs, int N ) +{ + + extern __shared__ rvec __f[]; + int i, pj,j; + int start, end; + + far_neighbor_data *nbr_pj, *sym_index_nbr; + list *far_nbrs = &p_far_nbrs; + + i = blockIdx.x; + + start = Start_Index (i, far_nbrs); + end = End_Index (i, far_nbrs); + pj = start + threadIdx.x; + + rvec_MakeZero (__f[threadIdx.x]); + + while (pj < end) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + + //sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); + // + //rvec_Add (atoms[i].f, sym_index_nbr->h_f ); + // + //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); + + pj += blockDim.x; + } + + if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); + if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); + if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); + if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); + if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); + + if (threadIdx.x == 0) + rvec_Add (atoms[i].f, __f[0]); +} + + +GLOBAL void k_Hydrogen_Bonds_HNbrs(reax_atom *atoms, + single_body_parameters *sbp, + static_storage p_workspace, + list p_bonds, list p_hbonds, list p_far_nbrs, int N ) +{ + + extern __shared__ rvec __f[]; + int i, pj,j; + int start, end; + + hbond_data *nbr_pj, *sym_index_nbr; + list *hbonds = &p_hbonds; + + i = blockIdx.x; + + start = Start_Index (i, hbonds); + end = End_Index (i, hbonds); + pj = start + threadIdx.x; + + rvec_MakeZero (__f[threadIdx.x]); + + while (pj < end) + { + nbr_pj = &( hbonds->select.hbond_list[pj] ); + j = nbr_pj->nbr; + + sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); + rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); + + pj += blockDim.x; + } + + if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); + if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); + if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); + if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); + if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); + + if (threadIdx.x == 0) + rvec_Add (atoms[i].f, __f[0]); +} diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.h b/PuReMD-GPU/src/cuda_three_body_interactions.h new file mode 100644 index 
0000000000000000000000000000000000000000..4a87fcfe42852c50ac58a1ef52e8353cf24971a2 --- /dev/null +++ b/PuReMD-GPU/src/cuda_three_body_interactions.h @@ -0,0 +1,71 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_THREE_BODY_INTERACTIONS_H_ +#define __CUDA_THREE_BODY_INTERACTIONS_H_ + +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +DEVICE void d_Calculate_Theta( rvec, real, rvec, real, real*, real* ); + +DEVICE void d_Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* ); + +GLOBAL void k_Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *, + global_parameters , control_params *, simulation_data *, static_storage , + list , list , int , int , real *, real *, real *, rvec *); + +GLOBAL void k_Three_Body_Interactions_results( reax_atom *, + control_params *, static_storage , list , int ); + +GLOBAL void k_Three_Body_Estimate( reax_atom *atoms, + control_params *control, list p_bonds, int N, int *count); + +GLOBAL void k_Hydrogen_Bonds( reax_atom *, + single_body_parameters *, hbond_parameters *, + control_params *, simulation_data *, static_storage , + list , list , int , int, real *, rvec *, rvec *); + +GLOBAL void k_Hydrogen_Bonds_HB( reax_atom *, + single_body_parameters *, hbond_parameters *, + control_params *, simulation_data *, static_storage , + list , list , int , int, real *, rvec *, rvec *); + +GLOBAL void k_Hydrogen_Bonds_Postprocess( reax_atom *, + single_body_parameters *, + static_storage , list, + list , list , int, real * ); + +GLOBAL void k_Hydrogen_Bonds_Far_Nbrs( reax_atom *, + single_body_parameters *, static_storage , list, list , list , int ); + +GLOBAL void k_Hydrogen_Bonds_HNbrs( reax_atom *, single_body_parameters *, + static_storage , list, list , list , int ); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.cu b/PuReMD-GPU/src/cuda_two_body_interactions.cu new file mode 100644 index 0000000000000000000000000000000000000000..d5f6abd64b16cc4582a404566560768eca49b866 --- /dev/null +++ b/PuReMD-GPU/src/cuda_two_body_interactions.cu @@ -0,0 +1,1047 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "cuda_two_body_interactions.h" + +#include "bond_orders.h" +#include "index_utils.h" +#include "list.h" +#include "lookup.h" +#include "vector.h" +#include "index_utils.h" + +#include "cuda_helpers.h" + + +GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, + single_body_parameters *sbp, two_body_parameters *tbp, + simulation_data *data, + static_storage p_workspace, list p_bonds, + int N, int num_atom_types, real *E_BE) +{ + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + real ebond, pow_BOs_be2, exp_be12, CEbo; + real gp3, gp4, gp7, gp10, gp37; + real exphu, exphua1, exphub1, exphuov, hulpov, estriph; + real decobdbo, decobdboua, decobdboub; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_order_data *bo_ij; + list *bonds; + static_storage *workspace; + + i = blockIdx.x * blockDim.x + threadIdx.x; + if ( i >= N ) return; + + bonds = &p_bonds; + workspace = &p_workspace; + + gp3 = g_params.l[3]; + gp4 = g_params.l[4]; + gp7 = g_params.l[7]; + gp10 = g_params.l[10]; + gp37 = (int) g_params.l[37]; + + //for( i=0; i < system->N; ++i ) + start_i = Start_Index(i, bonds); + end_i = End_Index(i, bonds); + //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); + for( pj = start_i; pj < end_i; ++pj ) + { + //TODO + //if( i < bonds->select.bond_list[pj].nbr ) + if( i < bonds->select.bond_list[pj].nbr ) + { + //TODO + /* set the pointers */ + j = bonds->select.bond_list[pj].nbr; + type_i = atoms[i].type; + type_j = atoms[j].type; + sbp_i = &( sbp[type_i] ); + sbp_j = &( sbp[type_j] ); + twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ] ); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + + /* calculate the constants */ + pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); + exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); + CEbo = -twbp->De_s * exp_be12 * + ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); + + /* calculate the Bond Energy */ + ebond = + -twbp->De_s * bo_ij->BO_s * exp_be12 + -twbp->De_p * bo_ij->BO_pi + -twbp->De_pp * bo_ij->BO_pi2; + + //PERFORMANCE IMAPCT + //MYATOMICADD(&data->E_BE, ebond); + //TODO + //E_BE [ i ] += ebond/2.0; + E_BE [ i ] += ebond; + //data->E_BE += ebond; + + /* calculate derivatives of Bond Orders */ + bo_ij->Cdbo += CEbo; + bo_ij->Cdbopi -= (CEbo + twbp->De_p); + bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); + +#ifdef TEST_ENERGY + //TODO + //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", + // workspace->orig_id[i], workspace->orig_id[j], + // i+1, j+1, + // bo_ij->BO, ebond/*, data->E_BE*/ ); + /* + fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", + workspace->orig_id[i], workspace->orig_id[j], + CEbo, -twbp->De_p, -twbp->De_pp );*/ +#endif +#ifdef TEST_FORCES + //TODO + /* + Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); + Add_dBOpinpi2( system, lists, i, pj, + -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), + workspace->f_be, workspace->f_be ); + */ + //TODO +#endif + + /* Stabilisation terminal triple bond */ + if( bo_ij->BO >= 1.00 ) { + if( gp37 == 2 || + (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || + (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { + // ba = SQR(bo_ij->BO - 
2.50);
+                    exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                    //oboa=abo(j1)-boa;
+                    //obob=abo(j2)-boa;
+                    exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
+                    exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                    //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
+                    exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                    hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+
+                    estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                    //estrain(j1) = estrain(j1) + 0.50*estriph;
+                    //estrain(j2) = estrain(j2) + 0.50*estriph;
+
+                    //PERFORMANCE IMPACT
+                    //MYATOMICADD(&data->E_BE, estriph);
+                    E_BE [ i] += estriph;
+                    //data->E_BE += estriph;
+
+                    decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
+                        ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+                    decobdboua = -gp10 * exphu * hulpov *
+                        (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                    decobdboub = -gp10 * exphu * hulpov *
+                        (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+
+                    bo_ij->Cdbo += decobdbo;
+
+                    //PERFORMANCE IMPACT
+                    workspace->CdDelta[i] += decobdboua;
+                    //MYATOMICADD(&workspace->CdDelta[j], decobdboub);
+                    //CdDelta [ i * N + i ] += decobdboua;
+                    //CdDelta [ i * N + j ] += decobdboua;
+                    //workspace->CdDelta [i] += decobdboua;
+                    //workspace->CdDelta [j] += decobdboub;
+
+#ifdef TEST_ENERGY
+                    /*
+                       fprintf( out_control->ebond,
+                       "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                       workspace->orig_id[i], workspace->orig_id[j],
+                       //i+1, j+1,
+                       estriph, decobdbo, decobdboua, decobdboub );
+                     */
+#endif
+#ifdef TEST_FORCES
+                    /*
+                       Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                       Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                       Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                     */
+#endif
+                }
+            }
+        }
+    } //TODO commented out the if statement for processing i < j.
+    // we process all the bonds and add only half the energy
+}
+
+
+/*
+
+   GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,
+   two_body_parameters *tbp,
+   global_parameters g_p,
+   control_params *control,
+   simulation_data *data,
+   list p_far_nbrs,
+   real *E_vdW, real *E_Ele, rvec *aux_ext_press,
+   int num_atom_types, int N )
+   {
+   int i, j, pj;
+   int start_i, end_i;
+   real self_coef;
+   real p_vdW1, p_vdW1i;
+   real powr_vdW1, powgi_vdW1;
+   real tmp, r_ij, fn13, exp1, exp2;
+   real Tap, dTap, dfn13, CEvd, CEclmb;
+   real dr3gamij_1, dr3gamij_3;
+   real e_ele, e_vdW, e_core, de_core;
+   rvec temp, ext_press;
+// rtensor temp_rtensor, total_rtensor;
+two_body_parameters *twbp;
+far_neighbor_data *nbr_pj;
+list *far_nbrs = &p_far_nbrs;
+
+i = blockIdx.x * blockDim.x + threadIdx.x;
+if ( i >= N ) return;
+
+p_vdW1 = g_p.l[28];
+p_vdW1i = 1.0 / p_vdW1;
+e_ele = 0;
+e_vdW = 0;
+e_core = 0;
+de_core = 0;
+
+//for( i = 0; i < system->N; ++i ) {
+start_i = Start_Index(i, far_nbrs);
+end_i = End_Index(i, far_nbrs);
+// fprintf( stderr, "i: %d, start: %d, end: %d\n",
+// i, start_i, end_i );
+
+for( pj = start_i; pj < end_i; ++pj )
+if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+j = nbr_pj->nbr;
+r_ij = nbr_pj->d;
+twbp = &(tbp[ index_tbp(atoms[i].type, atoms[j].type, num_atom_types) ]);
+self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+ +//CHANGE ORIGINAL +//if (i <= j) continue; +//CHANGE ORIGINAL + +// Calculate Taper and its derivative +// Tap = nbr_pj->Tap; -- precomputed during compte_H +Tap = control->Tap7 * r_ij + control->Tap6; +Tap = Tap * r_ij + control->Tap5; +Tap = Tap * r_ij + control->Tap4; +Tap = Tap * r_ij + control->Tap3; +Tap = Tap * r_ij + control->Tap2; +Tap = Tap * r_ij + control->Tap1; +Tap = Tap * r_ij + control->Tap0; + +dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; +dTap = dTap * r_ij + 5*control->Tap5; +dTap = dTap * r_ij + 4*control->Tap4; +dTap = dTap * r_ij + 3*control->Tap3; +dTap = dTap * r_ij + 2*control->Tap2; +dTap += control->Tap1/r_ij; + +//vdWaals Calculations +if(g_p.vdw_type==1 || g_p.vdw_type==3) { + // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + E_vdW [i] += e_vdW / 2.0; + + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 - 2.0); + + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) * dfn13 ); +} +else{ // no shielding + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + + e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); + E_vdW [i] += e_vdW / 2.0; + + CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * + (exp1 - exp2) ); +} + +if(g_p.vdw_type==2 || g_p.vdw_type==3) { + // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); + e_vdW = self_coef * Tap * e_core; + + //TODO check this + E_vdW [i] += e_vdW / 2.0; + //TODO check this + + de_core = -(twbp->acore/twbp->rcore) * e_core; + CEvd += self_coef * ( dTap * e_core + Tap * de_core ); +} + +//Coulomb Calculations +dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); +dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + +tmp = Tap / dr3gamij_3; +//tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H +e_ele = +self_coef * C_ele * atoms[i].q * atoms[j].q * tmp; +E_Ele [i] += e_ele / 2.0; + +CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q * +( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; +//CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* +// ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3; + +if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + if (i >= j) + rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); + else + rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); +} +else { // NPT, iNPT or sNPT + // for pressure coupling, terms not related to bond order + // derivatives are added directly into pressure vector/tensor + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + + if ( i >= j) + rvec_ScaledAdd( atoms[i].f, -1., temp ); + else + rvec_Add( atoms[i].f, temp ); + + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + + //rvec_Add( data->ext_press, ext_press ); + rvec_Copy (aux_ext_press[i], ext_press); + + //TODO CHECK THIS calculation here, it should be divided by two somehow. 
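+
+This one-thread-per-atom version is retained only for reference; the active
+kernel below assigns VDW_THREADS_PER_ATOM lanes to each atom and reduces
+their partial vdW/Coulomb/force sums in shared memory.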
+} +} +//} +} + +*/ + + +GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, + two_body_parameters *tbp, + global_parameters g_p, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + int num_atom_types, int N ) +{ + extern __shared__ real _vdw[]; + extern __shared__ real _ele[]; + extern __shared__ rvec _force []; + + real *sh_vdw; + real *sh_ele; + rvec *sh_force; + + int i, j, pj; + int start_i, end_i; + real self_coef; + real p_vdW1, p_vdW1i; + real powr_vdW1, powgi_vdW1; + real tmp, r_ij, fn13, exp1, exp2; + real Tap, dTap, dfn13, CEvd, CEclmb; + real dr3gamij_1, dr3gamij_3; + real e_ele, e_vdW, e_core, de_core; + rvec temp, ext_press; + // rtensor temp_rtensor, total_rtensor; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + list *far_nbrs = &p_far_nbrs; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warpid = thread_id / VDW_THREADS_PER_ATOM; + int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); + + i = warpid; + + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + sh_force = (rvec *)( _vdw + 2*blockDim.x); + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); + + if (i < N) + { + + p_vdW1 = g_p.l[28]; + p_vdW1i = 1.0 / p_vdW1; + e_ele = 0; + e_vdW = 0; + e_core = 0; + de_core = 0; + + //for( i = 0; i < system->N; ++i ) { + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + // fprintf( stderr, "i: %d, start: %d, end: %d\n", + // i, start_i, end_i ); + + pj = start_i + laneid; + //for( pj = start_i; pj < end_i; ++pj ) + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + r_ij = nbr_pj->d; + twbp = &(tbp[ index_tbp(atoms[i].type, atoms[j].type, num_atom_types) ]); + self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! 
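+
+                // Tap(r) is the 7th-degree taper polynomial evaluated by
+                // Horner's rule below.  Note that dTap accumulates
+                // Tap'(r)/r_ij rather than Tap'(r): the extra 1/r_ij is
+                // folded in so that scaling the unnormalized dvec by
+                // -(CEvd+CEclmb) later yields the Cartesian force directly.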
+
+                //CHANGE ORIGINAL
+                //if (i <= j) continue;
+                //CHANGE ORIGINAL
+
+                // Calculate Taper and its derivative
+                // Tap = nbr_pj->Tap; -- precomputed during compute_H
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;
+
+                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+                dTap = dTap * r_ij + 5*control->Tap5;
+                dTap = dTap * r_ij + 4*control->Tap4;
+                dTap = dTap * r_ij + 3*control->Tap3;
+                dTap = dTap * r_ij + 2*control->Tap2;
+                dTap += control->Tap1/r_ij;
+
+                //vdWaals Calculations
+                if(g_p.vdw_type==1 || g_p.vdw_type==3) {
+                    // shielding
+                    powr_vdW1 = POW(r_ij, p_vdW1);
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+
+                    //E_vdW [i] += e_vdW / 2.0;
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                        POW(r_ij, p_vdW1 - 2.0);
+
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) -
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                            (exp1 - exp2) * dfn13 );
+                }
+                else{ // no shielding
+                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+
+                    //E_vdW [i] += e_vdW / 2.0;
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                            (exp1 - exp2) );
+                }
+
+                if(g_p.vdw_type==2 || g_p.vdw_type==3) {
+                    // inner wall
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_vdW = self_coef * Tap * e_core;
+
+                    //TODO check this
+                    //E_vdW [i] += e_vdW / 2.0;
+                    sh_vdw [threadIdx.x] += e_vdW / 2.0;
+                    //TODO check this
+
+                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+                }
+
+                //Coulomb Calculations
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+                tmp = Tap / dr3gamij_3;
+                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compute_H
+                e_ele =
+                    self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
+
+                //E_Ele [i] += e_ele / 2.0;
+                sh_ele [threadIdx.x] += e_ele / 2.0;
+
+                CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
+                    ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                //CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q*
+                //   ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
+
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    if (i >= j){
+                        //rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec );
+                    }
+                    else
+                    {
+                        //rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec );
+                    }
+                }
+                else { // NPT, iNPT or sNPT
+                    // for pressure coupling, terms not related to bond order
+                    // derivatives are added directly into pressure vector/tensor
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                    if ( i >= j)
+                    {
+                        //rvec_ScaledAdd( atoms[i].f, -1., temp );
+                        rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp );
+                    }
+                    else
+                    {
+                        //rvec_Add(
atoms[i].f, temp ); + rvec_Add( sh_force[threadIdx.x], temp ); + } + + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + + //rvec_Add( data->ext_press, ext_press ); + rvec_Copy (aux_ext_press[i], ext_press); + + //TODO CHECK THIS calculation here, it should be divided by two somehow. + } + } // if condition for far neighbors + + + pj += VDW_THREADS_PER_ATOM; + + } // end of while loop for pj < end_i condition + } // if (i < N ) condition + //} + + __syncthreads (); + + if (laneid < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + __syncthreads (); + if (laneid < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + __syncthreads (); + if (laneid < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + __syncthreads (); + if (laneid < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + __syncthreads (); + if (laneid < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + __syncthreads (); + if (laneid == 0) { + E_vdW [i] += sh_vdw[threadIdx.x]; + E_Ele [i] += sh_ele[threadIdx.x]; + rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); + } +} + + +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy(reax_atom *atoms, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + LR_lookup_table *d_LR, + int num_atom_types, + int energy_update_freq, + int N ) +{ + + extern __shared__ real _vdw[]; + extern __shared__ real _ele[]; + extern __shared__ rvec _force []; + + real *sh_vdw; + real *sh_ele; + rvec *sh_force; + + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + list *far_nbrs = &p_far_nbrs; + + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int warpid = thread_id / VDW_THREADS_PER_ATOM; + int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); + + i = warpid; + + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + sh_force = (rvec *)( _vdw + 2*blockDim.x); + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); + + if ( i < N ) + { + + reax_atom local_atom ; + local_atom.q = atoms[i].q; + //local_atom.q = d_far_data.q[i]; + local_atom.type = atoms[i].type; + //local_atom.type = d_far_data.type[i]; + + /* + sh_vdw = _vdw; + sh_ele = _vdw + warpid; + sh_force = (rvec *)( _vdw + 2*warpid); + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + rvec_MakeZero ( sh_force [threadIdx.x] ); + */ + + + steps = data->step - data->prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + + //for( i = 0; i < system->N; ++i ) { + type_i = local_atom.type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); + + pj = start_i + laneid; + + //for( pj = start_i; pj < end_i; ++pj ) + while (pj < end_i) + 
{ + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) + //if( d_far_data.d[pj] <= control->r_cut ) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + //j = d_far_data.nbrs[pj]; + type_j = atoms[j].type; + //type_j = d_far_data.type[j]; + r_ij = nbr_pj->d; + //r_ij = d_far_data.d[pj]; + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); + + //TODO + //CHANGE ORIGINAL + //if (i <= j) { pj += blockDim.x; continue; } + //CHANGE ORIGINAL + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + + if(( update_energies )) + { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + e_vdW *= self_coef; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a; + e_ele *= self_coef * local_atom.q * atoms[j].q; + + + //data->E_vdW += e_vdW; + //TODO + //E_vdW [i] += e_vdW / 2.0; + //E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0); + sh_vdw [threadIdx.x] += e_vdW/2.0; + //E_vdW [i] += e_vdW; + + //TODO + //data->E_Ele += e_ele; + //E_Ele [i] += e_ele / 2.0; + //E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0); + sh_ele [threadIdx.x] += e_ele/2.0; + //E_Ele [i] += e_ele; + } + + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + CEvd *= self_coef; + + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= self_coef * local_atom.q * atoms[j].q; + //CEclmb *= self_coef * local_atom.q * d_far_data.q[j]; + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + if ( i >= j) + //rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); + //rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] ); + else + //rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); + //rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] ); + } + else { // NPT, iNPT or sNPT + // for pressure coupling, terms not related to bond order + // derivatives are added directly into pressure vector/tensor / + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + if (i >= j) + rvec_ScaledAdd( atoms[i].f, -1., temp ); + else + rvec_Add( atoms[i].f, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + + //rvec_Add( data->ext_press, ext_press ); + rvec_Copy (aux_ext_press [i], ext_press ); + + //TODO CHECK THIS + } + + + + } + + pj += VDW_THREADS_PER_ATOM; + } + + }// if i < n condition + + __syncthreads (); + + if (laneid < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + __syncthreads (); + if (laneid < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + __syncthreads (); + if (laneid < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + __syncthreads (); + if (laneid < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 
2]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + __syncthreads (); + if (laneid < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + __syncthreads (); + if (laneid == 0) { + E_vdW [i] += sh_vdw[threadIdx.x]; + E_Ele [i] += sh_ele[threadIdx.x]; + rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); + } +} + + +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1(reax_atom *atoms, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + LR_lookup_table *d_LR, + int num_atom_types, + int energy_update_freq, + int N ) +{ + + extern __shared__ real _vdw[]; + extern __shared__ real _ele[]; + + real *sh_vdw; + real *sh_ele; + + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + list *far_nbrs = &p_far_nbrs; + + i = blockIdx.x; + + reax_atom local_atom; + local_atom.q = atoms[i].q; + local_atom.type = atoms[i].type; + + sh_vdw = _vdw; + sh_ele = _vdw + blockDim.x; + + sh_vdw[threadIdx.x] = 0.0; + sh_ele[threadIdx.x] = 0.0; + + + steps = data->step - data->prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + + type_i = local_atom.type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); + + pj = start_i + threadIdx.x; + + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + type_j = atoms[j].type; + r_ij = nbr_pj->d; + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + + if(( update_energies )) + { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + e_vdW *= self_coef; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + e_ele *= self_coef * local_atom.q * atoms[j].q; + + sh_vdw [threadIdx.x] += e_vdW/2.0; + sh_ele [threadIdx.x] += e_ele/2.0; + } + } + + pj += blockDim.x; + } + + // now do a reduce inside the warp for E_vdW, E_Ele and force. 
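+    // (only the two energy sums are reduced in this kernel; the companion
+    // _2 kernel below reduces the forces)  the unsynchronized tree that
+    // follows is warp-synchronous, so it presumes the kernel is launched
+    // with a single-warp (32-thread) block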
+ if (threadIdx.x < 16) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; + } + if (threadIdx.x < 8) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; + } + if (threadIdx.x < 4) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; + } + if (threadIdx.x < 2) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; + } + if (threadIdx.x < 1) { + sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; + sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; + } + if (threadIdx.x == 0) { + E_vdW [i] += sh_vdw[0]; + E_Ele [i] += sh_ele[0]; + } +} + + +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2(reax_atom *atoms, + control_params *control, + simulation_data *data, + list p_far_nbrs, + real *E_vdW, real *E_Ele, rvec *aux_ext_press, + LR_lookup_table *d_LR, + int num_atom_types, + int energy_update_freq, + int N ) +{ + + extern __shared__ rvec _force []; + + rvec *sh_force; + + int i, j, pj, r, steps, update_freq, update_energies; + int type_i, type_j, tmin, tmax; + int start_i, end_i; + real r_ij, self_coef, base, dif; + real e_vdW, e_ele; + real CEvd, CEclmb; + rvec temp, ext_press; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + list *far_nbrs = &p_far_nbrs; + + i = blockIdx.x; + + reax_atom local_atom; + local_atom.q = atoms[i].q; + local_atom.type = atoms[i].type; + + sh_force = _force; + rvec_MakeZero ( sh_force [threadIdx.x] ); + + + steps = data->step - data->prev_steps; + update_freq = energy_update_freq; + update_energies = update_freq > 0 && steps % update_freq == 0; + + //for( i = 0; i < system->N; ++i ) { + type_i = local_atom.type; + start_i = Start_Index(i,far_nbrs); + end_i = End_Index(i,far_nbrs); + + pj = start_i + threadIdx.x; + + while (pj < end_i) + { + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) + { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + type_j = atoms[j].type; + r_ij = nbr_pj->d; + self_coef = (i == j) ? 0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + CEvd *= self_coef; + + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= self_coef * local_atom.q * atoms[j].q; + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { + if ( i >= j) + rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); + else + rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); + } + else { // NPT, iNPT or sNPT + // for pressure coupling, terms not related to bond order + // derivatives are added directly into pressure vector/tensor / + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + if (i >= j) + rvec_ScaledAdd( atoms[i].f, -1., temp ); + else + rvec_Add( atoms[i].f, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + + rvec_Copy (aux_ext_press [i], ext_press ); + } + } + + pj += blockDim.x; + } + + // now do a reduce inside the warp for E_vdW, E_Ele and force. 
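+    // (only the force vector is reduced in this kernel; the energies were
+    // handled by _1 above)  a warp-shuffle variant (CUDA 9+) could replace
+    // the shared-memory tree, e.g. per force component v:
+    //     for ( int off = 16; off > 0; off >>= 1 )
+    //         v += __shfl_down_sync( 0xffffffff, v, off );
+    // the shared-memory form is kept to match the other kernels here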
+ if (threadIdx.x < 16) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); + } + if (threadIdx.x < 8) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); + } + if (threadIdx.x < 4) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); + } + if (threadIdx.x < 2) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); + } + if (threadIdx.x < 1) { + rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); + } + if (threadIdx.x == 0) { + rvec_Add (atoms[i].f, sh_force [ 0 ]); + } +} diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.h b/PuReMD-GPU/src/cuda_two_body_interactions.h new file mode 100644 index 0000000000000000000000000000000000000000..fe3e273775f67e17f88d742a52707f72bfbac56c --- /dev/null +++ b/PuReMD-GPU/src/cuda_two_body_interactions.h @@ -0,0 +1,172 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#ifndef __CUDA_TWO_BODY_INTERACTIONS_H_ +#define __CUDA_TWO_BODY_INTERACTIONS_H_ + +#include "mytypes.h" + +#include "index_utils.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_Bond_Energy( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *, + simulation_data *, static_storage , list , int , int, real * ); + +GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *, two_body_parameters *, + global_parameters , control_params *, simulation_data *, list , real *, real *, rvec *, + int , int ); + +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *, control_params *, simulation_data *, + list , real *, real *, rvec *, + LR_lookup_table *, int , int , int ) ; + +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *, control_params *, simulation_data *, + list , real *, real *, rvec *, + LR_lookup_table *, int , int , int ) ; + +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *, control_params *, simulation_data *, + list , real *, real *, rvec *, + LR_lookup_table *, int , int , int ) ; + +static DEVICE void d_LR_vdW_Coulomb( global_parameters g_params, two_body_parameters *tbp, + control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types ) +{ + real p_vdW1 = g_params.l[28]; + real p_vdW1i = 1.0 / p_vdW1; + real powr_vdW1, powgi_vdW1; + real tmp, fn13, exp1, exp2; + real Tap, dTap, dfn13; + real dr3gamij_1, dr3gamij_3; + real e_core, de_core; + two_body_parameters *twbp; + + twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]); + e_core = 0; + de_core = 0; + + /* calculate taper and its derivative */ + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + 
diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.h b/PuReMD-GPU/src/cuda_two_body_interactions.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe3e273775f67e17f88d742a52707f72bfbac56c
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_two_body_interactions.h
@@ -0,0 +1,172 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_TWO_BODY_INTERACTIONS_H_
+#define __CUDA_TWO_BODY_INTERACTIONS_H_
+
+#include "mytypes.h"
+
+#include "index_utils.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GLOBAL void Cuda_Bond_Energy( reax_atom *, global_parameters, single_body_parameters *, two_body_parameters *,
+        simulation_data *, static_storage, list, int, int, real * );
+
+GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *, two_body_parameters *,
+        global_parameters, control_params *, simulation_data *, list, real *, real *, rvec *,
+        int, int );
+
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *, control_params *, simulation_data *,
+        list, real *, real *, rvec *,
+        LR_lookup_table *, int, int, int );
+
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *, control_params *, simulation_data *,
+        list, real *, real *, rvec *,
+        LR_lookup_table *, int, int, int );
+
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *, control_params *, simulation_data *,
+        list, real *, real *, rvec *,
+        LR_lookup_table *, int, int, int );
+
+static DEVICE void d_LR_vdW_Coulomb( global_parameters g_params, two_body_parameters *tbp,
+        control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types )
+{
+    real p_vdW1 = g_params.l[28];
+    real p_vdW1i = 1.0 / p_vdW1;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, fn13, exp1, exp2;
+    real Tap, dTap, dfn13;
+    real dr3gamij_1, dr3gamij_3;
+    real e_core, de_core;
+    two_body_parameters *twbp;
+
+    twbp = &(tbp[ index_tbp(i, j, num_atom_types) ]);
+    e_core = 0;
+    de_core = 0;
+
+    /* calculate taper and its derivative */
+    Tap = control->Tap7 * r_ij + control->Tap6;
+    Tap = Tap * r_ij + control->Tap5;
+    Tap = Tap * r_ij + control->Tap4;
+    Tap = Tap * r_ij + control->Tap3;
+    Tap = Tap * r_ij + control->Tap2;
+    Tap = Tap * r_ij + control->Tap1;
+    Tap = Tap * r_ij + control->Tap0;
+
+    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+    dTap = dTap * r_ij + 5 * control->Tap5;
+    dTap = dTap * r_ij + 4 * control->Tap4;
+    dTap = dTap * r_ij + 3 * control->Tap3;
+    dTap = dTap * r_ij + 2 * control->Tap2;
+    dTap += control->Tap1 / r_ij;
+
+
+    /* van der Waals calculations */
+    powr_vdW1 = POW(r_ij, p_vdW1);
+    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
+       Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
+       Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2),
+       powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
+
+    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
+
+    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
+        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+
+    /* van der Waals calculations, split by shielding type */
+    if (g_params.vdw_type == 1 || g_params.vdw_type == 3)
+    {
+        // shielding
+        powr_vdW1 = POW(r_ij, p_vdW1);
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+            POW(r_ij, p_vdW1 - 2.0);
+
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    }
+    else // no shielding
+    {
+        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+    }
+
+    if (g_params.vdw_type == 2 || g_params.vdw_type == 3)
+    {
+        // inner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
+        lr->e_vdW += Tap * e_core;
+
+        de_core = -(twbp->acore / twbp->rcore) * e_core;
+        lr->CEvd += dTap * e_core + Tap * de_core;
+    }
+
+    /* Coulomb calculations */
+    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+    dr3gamij_3 = POW( dr3gamij_1, 0.33333333333333 );
+
+    tmp = Tap / dr3gamij_3;
+    lr->H = EV_to_KCALpMOL * tmp;
+    lr->e_ele = C_ele * tmp;
+    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
+       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+       i, system->atoms[i].type, j, system->atoms[j].type,
+       twbp->gamma, Tap, dr3gamij_3,
+       system->atoms[i].q, system->atoms[j].q ); */
+
+    lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+    /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n",
+       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
+       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+
+    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
+       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
+
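In d_LR_vdW_Coulomb above, the taper Tap(r) = Tap7*r^7 + ... + Tap1*r + Tap0 is evaluated by Horner's rule (one multiply and one add per coefficient). A standalone sketch of the same scheme over a coefficient array; illustrative only, assuming real is the floating-point typedef from mytypes.h:

    /* evaluates c[7]*r^7 + c[6]*r^6 + ... + c[1]*r + c[0] by Horner's rule */
    static real horner7( const real c[8], real r )
    {
        real p = c[7];
        int k;

        for ( k = 6; k >= 0; --k )
        {
            p = p * r + c[k];   /* fold in the next-lower coefficient */
        }
        return p;
    }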
diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu
index 2c632c058e2419fbfc82310bb6f5354f863e2b39..c420db769053e46ed871973200d2de3b2cb7f871
--- a/PuReMD-GPU/src/cuda_utils.cu
+++ b/PuReMD-GPU/src/cuda_utils.cu
@@ -18,120 +18,136 @@ <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
-
-
-
 #include "cuda_utils.h"
-#include "mytypes.h"
 
 
-void cuda_malloc (void **ptr, int size, int memset, int err_code) {
+void cuda_malloc( void **ptr, int size, int memset, int err_code )
+{
     cudaError_t retVal = cudaSuccess;
 
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
 
-    retVal = cudaMalloc (ptr, size);
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "Failed to allocate memory on device for the res: %d... exiting with code: %d size: %d \n",
-                err_code, retVal, size);
-        exit (err_code);
+    retVal = cudaMalloc( ptr, size );
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr, "Failed to allocate device memory for resource %d... exiting with code %d, size %d \n",
+                err_code, retVal, size );
+        exit( err_code );
     }
 
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
 
-    if (memset) {
-        retVal = cudaMemset (*ptr, 0, size);
-        if (retVal != cudaSuccess) {
-            fprintf (stderr, "Failed to memset memory on device... exiting with code %d\n",
-                    err_code);
-            exit (err_code);
+    if ( memset ) {
+        retVal = cudaMemset( *ptr, 0, size );
+        if ( retVal != cudaSuccess )
+        {
+            fprintf( stderr, "Failed to memset device memory... exiting with code %d\n",
+                    err_code );
+            exit( err_code );
         }
     }
 }
 
 
-void cuda_free (void *ptr, int err_code) {
+void cuda_free( void *ptr, int err_code )
+{
     cudaError_t retVal = cudaSuccess;
 
     if (!ptr) return;
 
-    retVal = cudaFree (ptr);
+    retVal = cudaFree( ptr );
 
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "Failed to release memory on device for res %d... exiting with code %d -- Address %ld\n",
-                err_code, retVal, ptr);
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr, "Failed to release device memory for resource %d... error code %d -- address %p\n",
+                err_code, retVal, ptr );
         return;
     }
 }
 
 
-void cuda_memset (void *ptr, int data, size_t count, int err_code){
+
+void cuda_memset( void *ptr, int data, size_t count, int err_code )
+{
     cudaError_t retVal = cudaSuccess;
 
-    retVal = cudaMemset (ptr, data, count);
+    retVal = cudaMemset( ptr, data, count );
     if (retVal != cudaSuccess)
     {
-        fprintf (stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr);
-        fprintf (stderr, " size to memset: %d \n", count);
-        fprintf (stderr, " target data is : %d \n", data);
-        fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d\n",
-                err_code, retVal);
-        exit (err_code);
+        fprintf( stderr, "ptr passed is %p, address of ptr: %p \n", ptr, (void *) &ptr );
+        fprintf( stderr, " size to memset: %zu \n", count );
+        fprintf( stderr, " target data is: %d \n", data );
+        fprintf( stderr, "Failed to memset device memory... exiting with code %d, cuda code %d\n",
+                err_code, retVal );
+        exit( err_code );
     }
 }
 
 
-void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid)
+
+void copy_host_device( void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    cudaError_t retVal = cudaErrorNotReady;
 
-    if (dir == cudaMemcpyHostToDevice)
-        retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
+    if ( dir == cudaMemcpyHostToDevice )
+    {
+        retVal = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice );
+    }
     else
-        retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
+    {
+        retVal = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost );
+    }
 
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "could not copy resource %d from host to device: reason %d \n",
-                resid, retVal);
-        exit (resid);
+    if ( retVal != cudaSuccess ) {
+        fprintf( stderr, "could not copy resource %d between host and device: error code %d \n",
+                resid, retVal );
+        exit( resid );
     }
 }
 
 
-void copy_device (void *dest, void *src, int size, int resid)
+
+void copy_device( void *dest, void *src, int size, int resid )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    cudaError_t retVal = cudaErrorNotReady;
 
-    retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "could not copy resource %d from host to device: reason %d \n",
-                resid, retVal);
-        exit (resid);
+    retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr, "could not copy resource %d from device to device: error code %d \n",
+                resid, retVal );
+        exit( resid );
     }
 }
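+/* Editorial note: compute_blocks() below is a ceiling division,
+ * *blocks = (count + CUDA_BLOCK_SIZE - 1) / CUDA_BLOCK_SIZE, written with an
+ * explicit remainder test; compute_nearest_pow_2() rounds a block count up to
+ * the next power of two, apparently for the power-of-two reduction launches
+ * (e.g., BLOCKS_POW_2) used elsewhere in this codebase. */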
-void compute_blocks ( int *blocks, int *block_size, int count )
+
+void compute_blocks( int *blocks, int *block_size, int count )
 {
     *block_size = CUDA_BLOCK_SIZE;
     *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1);
 }
 
 
-void compute_nearest_pow_2 (int blocks, int *result)
+
+void compute_nearest_pow_2( int blocks, int *result )
 {
     int power = 1;
-    while (power < blocks) power *= 2;
+    while (power < blocks)
+    {
+        power *= 2;
+    }
     *result = power;
 }
 
 
-void print_device_mem_usage ()
+
+void print_device_mem_usage( )
 {
     size_t total, free;
 
-    cudaMemGetInfo (&free, &total);
-    if (cudaGetLastError () != cudaSuccess )
+    cudaMemGetInfo( &free, &total );
+    if ( cudaGetLastError() != cudaSuccess )
     {
-        fprintf (stderr, "Error on the memory call \n");
+        fprintf( stderr, "Error querying device memory usage \n" );
         return;
     }
 
-    fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n",
+    fprintf( stderr, "Total: %zu bytes (%zu MB, %zu GB); free: %zu bytes (%zu MB, %zu GB) \n",
         total, total/(1024*1024), total/ (1024*1024*1024),
         free, free/(1024*1024), free/ (1024*1024*1024) );
 }
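A minimal host-side round trip through the wrappers above (illustrative only, not part of the patch; assumes real is the floating-point typedef from mytypes.h and 42 is an arbitrary resource/error code):

    void roundtrip_example( real *host_buf, int n )
    {
        real *dev_buf;

        /* zero-initialized device allocation (memset flag = 1) */
        cuda_malloc( (void **) &dev_buf, n * sizeof(real), 1, 42 );

        /* upload, compute, download; the wrappers exit on failure */
        copy_host_device( host_buf, dev_buf, n * sizeof(real),
                cudaMemcpyHostToDevice, 42 );
        /* ... kernel launches ... */
        copy_host_device( host_buf, dev_buf, n * sizeof(real),
                cudaMemcpyDeviceToHost, 42 );

        cuda_free( dev_buf, 42 );
    }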
diff --git a/PuReMD-GPU/src/cuda_utils.h b/PuReMD-GPU/src/cuda_utils.h
index ba793e4048c7367438a88b20b758b3090200f4b4..c8976d081bbead0048faa7dd12238c40f2eda5d7
--- a/PuReMD-GPU/src/cuda_utils.h
+++ b/PuReMD-GPU/src/cuda_utils.h
@@ -21,35 +21,39 @@
 #ifndef __CUDA_UTILS_H_
 #define __CUDA_UTILS_H_
 
-#include "cuda.h"
-#include "cublas_v2.h"
-#include "cusparse_v2.h"
-#include "stdlib.h"
-#include "stdio.h"
+#include "mytypes.h"
+
+#include <stdlib.h>
+#include <stdio.h>
 
 #define IDX2C(i,j,ld) (((j)*(ld))+(i))
 
-static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __inline__ void modify( cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta )
 {
-    cublasSscal (handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
-    cublasSscal (handle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1);
+    cublasSscal( handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm );
+    cublasSscal( handle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1 );
 }
 
-void cuda_malloc (void **, int , int , int);
-void cuda_free (void *, int);
-void cuda_memset (void *, int , size_t , int );
-void copy_host_device (void *, void *, int , enum cudaMemcpyKind, int);
-void copy_device (void *, void *, int , int );
+void cuda_malloc( void **, int, int, int );
+void cuda_free( void *, int );
+void cuda_memset( void *, int, size_t, int );
+void copy_host_device( void *, void *, int, enum cudaMemcpyKind, int );
+void copy_device( void *, void *, int, int );
 
-void compute_blocks (int *, int *, int);
-void compute_nearest_pow_2 (int blocks, int *result);
+void compute_blocks( int *, int *, int );
+void compute_nearest_pow_2( int blocks, int *result );
 
-void print_device_mem_usage ();
+void print_device_mem_usage( );
 
 #define cusparseCheckError(cusparseStatus) __cusparseCheckError (cusparseStatus, __FILE__, __LINE__)
-inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line )
+static inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line )
 {
-    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS)
+    if ( cusparseStatus != CUSPARSE_STATUS_SUCCESS )
     {
         fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cusparseStatus);
         exit (-1);
@@ -59,35 +63,42 @@ inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *f
 
 #define cublasCheckError(cublasStatus) __cublasCheckError (cublasStatus, __FILE__, __LINE__)
-inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line )
+static inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line )
 {
-    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
+    if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
     {
-        fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cublasStatus);
-        exit (-1);
+        fprintf( stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cublasStatus );
+        exit( -1 );
     }
     return;
 }
 
-#define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
-inline void __cudaCheckError( const char *file, const int line )
+
+#define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
+static inline void __cudaCheckError( const char *file, const int line )
 {
-    cudaError err = cudaGetLastError();
+    cudaError err = cudaGetLastError( );
     if ( cudaSuccess != err )
     {
-        fprintf (stderr, "Failed .. %s:%d -- gpu erro code %d\n", file, line, err );
+        fprintf( stderr, "Failed .. %s:%d -- GPU error code %d\n", file, line, err );
         exit( -1 );
     }
 
     // More careful checking. However, this will affect performance.
-    // Comment away if needed.
     /*
-    err = cudaDeviceSynchronize();
+    err = cudaDeviceSynchronize( );
    if( cudaSuccess != err )
    {
        exit( -1 );
    }
    */
+
+    return;
 }
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/forces.c b/PuReMD-GPU/src/forces.c
new file mode 100644
index 0000000000000000000000000000000000000000..c95d4896e32f60e954d79b0b623520afb042e9ea
--- /dev/null
+++ b/PuReMD-GPU/src/forces.c
@@ -0,0 +1,910 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+ ----------------------------------------------------------------------*/ + +#include "forces.h" + +#include "box.h" +#include "bond_orders.h" +#include "single_body_interactions.h" +#include "two_body_interactions.h" +#include "three_body_interactions.h" +#include "four_body_interactions.h" +#include "list.h" +#include "print_utils.h" +#include "system_props.h" +#include "QEq.h" +#include "vector.h" +#include "index_utils.h" + + +void Dummy_Interaction( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ +} + + +void Init_Bonded_Force_Functions( control_params *control ) +{ + Interaction_Functions[0] = Calculate_Bond_Orders; + Interaction_Functions[1] = Bond_Energy; //*/Dummy_Interaction; + Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy; + //*/Dummy_Interaction; + Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction; + Interaction_Functions[4] = Four_Body_Interactions; //*/Dummy_Interaction; + if( control->hb_cut > 0 ) + Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction; + else Interaction_Functions[5] = Dummy_Interaction; + Interaction_Functions[6] = Dummy_Interaction; //empty + Interaction_Functions[7] = Dummy_Interaction; //empty + Interaction_Functions[8] = Dummy_Interaction; //empty + Interaction_Functions[9] = Dummy_Interaction; //empty +} + + +void Compute_Bonded_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + + int i; + real t_start, t_elapsed; + +#ifdef TEST_ENERGY + /* Mark beginning of a new timestep in each energy file */ + fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", + data->step, "atom1", "atom2", "bo", "ebond", "total" ); + fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", + data->step, "atom", "nlp", "elp", "total" ); + fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", + data->step, "atom", "eov", "total" ); + fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", + data->step, "atom", "eun", "total" ); + fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "angle", "bo(12)", "bo(23)", "eval", "epen", "total" ); + fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "angle", "bo(12)", "bo(23)", "epen", "total" ); + fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "angle", "bo(12)", "bo(23)", "ecoa", "total" ); + fprintf( out_control->ehb, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", + "r(23)", "angle", "bo(12)", "ehb", "total" ); + fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", "atom4", + "phi", "bo(23)", "etor", "total" ); + fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "atom3", "atom4", + "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" ); +#endif + + /* Implement all the function calls as function pointers */ + for( i = 0; i < NO_OF_INTERACTIONS; i++ ) { + //for( i = 0; i < 5; i++ ) { + t_start = Get_Time (); + (Interaction_Functions[i])(system, control, data, workspace, + lists, out_control); + t_elapsed = Get_Timing_Info ( t_start ); + +#ifdef __DEBUG_CUDA__ + fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed 
); +#endif + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "f%d-", i ); +#endif +#ifdef TEST_FORCES + (Print_Interactions[i])(system, control, data, workspace, + lists, out_control); +#endif + } +} + + +void Compute_NonBonded_Forces( reax_system *system, control_params *control, + simulation_data *data,static_storage *workspace, + list** lists, output_controls *out_control ) +{ + real t_start, t_elapsed; +#ifdef TEST_ENERGY + fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n", + data->step, "atom1", "atom2", "r12", "evdw", "total" ); + fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n", + data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" ); +#endif + + t_start = Get_Time( ); + QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.QEq += t_elapsed; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "qeq - " ); +#endif + + if ( control->tabulate == 0) + vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control ); + else + Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, + lists, out_control ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nonb forces - " ); +#endif + +#ifdef TEST_FORCES + Print_vdW_Coulomb_Forces( system, control, data, workspace, + lists, out_control ); +#endif +} + + +/* This version of Compute_Total_Force computes forces from coefficients + accumulated by all interaction functions. Saves enormous time & space! */ +void Compute_Total_Force( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists ) +{ + int i, pj; + list *bonds = (*lists) + BONDS; + + for( i = 0; i < system->N; ++i ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) + Add_dBond_to_Forces( i, pj, system, data, workspace, lists ); + else + Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists ); + } +} + + +void Validate_Lists( static_storage *workspace, list **lists, int step, int n, + int Hmax, int Htop, int num_bonds, int num_hbonds ) +{ + int i, flag; + list *bonds, *hbonds; + + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + /* far neighbors */ + if( Htop > Hmax * DANGER_ZONE ) { + workspace->realloc.Htop = Htop; + if( Htop > Hmax ) { + fprintf( stderr, + "step%d - ran out of space on H matrix: Htop=%d, max = %d", + step, Htop, Hmax ); + exit(INSUFFICIENT_SPACE); + } + } + + /* bond list */ + flag = -1; + workspace->realloc.num_bonds = num_bonds; + for( i = 0; i < n-1; ++i ) + if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) { + workspace->realloc.bonds = 1; + if( End_Index(i, bonds) > Start_Index(i+1, bonds) ) + flag = i; + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", + step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) ); + exit(INSUFFICIENT_SPACE); + } + + if( End_Index(i, bonds) >= bonds->num_intrs-2 ) { + workspace->realloc.bonds = 1; + + if( End_Index(i, bonds) > bonds->num_intrs ) { + fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", + step, flag, End_Index(i,bonds), bonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + + + /* hbonds list */ + if( workspace->num_H > 0 ) { + flag = -1; + workspace->realloc.num_hbonds = num_hbonds; + for( i = 0; i < workspace->num_H-1; ++i ) + if( Num_Entries(i, hbonds) >= + (Start_Index(i+1, hbonds) - 
Start_Index(i, hbonds)) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) ) + flag = i; + } + + if( flag > -1 ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n", + step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) ); + exit(INSUFFICIENT_SPACE); + } + + if( Num_Entries(i,hbonds) >= + (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) { + workspace->realloc.hbonds = 1; + + if( End_Index(i, hbonds) > hbonds->num_intrs ) { + fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", + step, flag, End_Index(i,hbonds), hbonds->num_intrs ); + exit(INSUFFICIENT_SPACE); + } + } + } +} + + +void Init_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real dr3gamij_1, dr3gamij_3, Tap; + //real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + list *far_nbrs, *bonds, *hbonds; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + //LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + far_nbrs = *lists + FAR_NBRS; + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + H = &workspace->H; + Htop = 0; + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + for( i = 0; i < system->N; ++i ) { + atom_i = &(system->atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + H->start[i] = Htop; + btop_i = End_Index( i, bonds ); + sbp_i = &(system->reaxprm.sbp[type_i]); + ihb = ihb_top = -1; + if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) + ihb_top = End_Index( workspace->hbond_index[i], hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(system->atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if( nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), + nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + + if( flag ){ + type_j = system->atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(system->reaxprm.sbp[type_j]); + twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); + self_coef = (i == j) ? 
0.5 : 1.0; + + /* H matrix entry */ + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; + ++Htop; + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) { + hbonds->select.hbond_list[ihb_top].nbr = j; + hbonds->select.hbond_list[ihb_top].scl = 1; + hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; + ++ihb_top; + ++num_hbonds; + } + else if( ihb == 2 && jhb == 1 ) { + jhb_top = End_Index( workspace->hbond_index[j], hbonds ); + hbonds->select.hbond_list[jhb_top].nbr = i; + hbonds->select.hbond_list[jhb_top].scl = -1; + hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; + Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + num_bonds += 2; + /****** bonds i-j and j-i ******/ + ibond = &( bonds->select.bond_list[btop_i] ); + btop_j = End_Index( j, bonds ); + jbond = &(bonds->select.bond_list[btop_j]); + + ibond->nbr = j; + jbond->nbr = i; + ibond->d = r_ij; + jbond->d = r_ij; + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + ibond->dbond_index = btop_i; + jbond->dbond_index = btop_i; + ibond->sym_index = btop_j; + jbond->sym_index = btop_i; + ++btop_i; + Set_End_Index( j, btop_j+1, bonds ); + + bo_ij = &( ibond->bo_data ); + bo_ji = &( jbond->bo_data ); + bo_ji->BO = bo_ij->BO = BO; + bo_ji->BO_s = bo_ij->BO_s = BO_s; + bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; + bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + /* Only dln_BOp_xx wrt. dr_i is stored here, note that + dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); + + /* Only dBOp wrt. 
dr_i is stored here, note that + dBOp/dr_i = -dBOp/dr_j and all others are 0 */ + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); + + rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); + rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp + workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + + /*fprintf( stderr, "%d %d %g %g %g\n", + i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/ + + /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", + Cln_BOp_s, twbp->p_bo2, C12 ); + fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", + Cln_BOp_pi, twbp->p_bo4, C34 ); + fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n", + Cln_BOp_pi2, twbp->p_bo6, C56 );*/ + /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2); + fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4); + fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6); + fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", + twbp->r_s, twbp->r_p, twbp->r_pp ); + fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/ + + /*fprintf( stderr, "\tfactors: %g %g %g\n", + -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pp), + -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/ + /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", + bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] ); + fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", + bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], + bo_ij->dln_BOp_pi[2] ); + fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n", + bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], + bo_ij->dln_BOp_pi2[2] );*/ + + Set_End_Index( j, btop_j+1, bonds ); + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; + ++Htop; + + Set_End_Index( i, btop_i, bonds ); + if( ihb == 1 ) + Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); + //fprintf( stderr, "%d bonds start: %d, end: %d\n", + // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); + } + + // mark the end of j list + H->start[i] = Htop; + /* validate lists - decide if reallocation is required! 
*/ + Validate_Lists( workspace, lists, + data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", + data->step, Htop, num_bonds, num_hbonds ); +#endif +} + + +void Init_Forces_Tab( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) { + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int Htop, btop_i, btop_j, num_bonds, num_hbonds; + int tmin, tmax, r; + int ihb, jhb, ihb_top, jhb_top; + int flag; + real r_ij, r2, self_coef; + real val, dif, base; + real C12, C34, C56; + real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + sparse_matrix *H; + list *far_nbrs, *bonds, *hbonds; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + LR_lookup_table *t; + reax_atom *atom_i, *atom_j; + bond_data *ibond, *jbond; + bond_order_data *bo_ij, *bo_ji; + + far_nbrs = *lists + FAR_NBRS; + bonds = *lists + BONDS; + hbonds = *lists + HBONDS; + + H = &workspace->H; + Htop = 0; + num_bonds = 0; + num_hbonds = 0; + btop_i = btop_j = 0; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + for( i = 0; i < system->N; ++i ) { + atom_i = &(system->atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + H->start[i] = Htop; + btop_i = End_Index( i, bonds ); + sbp_i = &(system->reaxprm.sbp[type_i]); + ihb = ihb_top = -1; + if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) + ihb_top = End_Index( workspace->hbond_index[i], hbonds ); + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(system->atoms[j]); + + flag = 0; + if((data->step-data->prev_steps) % control->reneighbor == 0) { + if(nbr_pj->d <= control->r_cut) + flag = 1; + else flag = 0; + } + else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), + nbr_pj->dvec))<=SQR(control->r_cut)){ + nbr_pj->d = sqrt(nbr_pj->d); + flag = 1; + } + + if( flag ){ + type_j = system->atoms[j].type; + r_ij = nbr_pj->d; + sbp_j = &(system->reaxprm.sbp[type_j]); + twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); + self_coef = (i == j) ? 
0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); + + /* cubic spline interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + val *= EV_to_KCALpMOL / C_ele; + + H->entries[Htop].j = j; + H->entries[Htop].val = self_coef * val; + ++Htop; + + /* hydrogen bond lists */ + if( control->hb_cut > 0 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) { + hbonds->select.hbond_list[ihb_top].nbr = j; + hbonds->select.hbond_list[ihb_top].scl = 1; + hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; + ++ihb_top; + ++num_hbonds; + } + else if( ihb == 2 && jhb == 1 ) { + jhb_top = End_Index( workspace->hbond_index[j], hbonds ); + hbonds->select.hbond_list[jhb_top].nbr = i; + hbonds->select.hbond_list[jhb_top].scl = -1; + hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; + Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); + ++num_hbonds; + } + } + + /* uncorrected bond orders */ + if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + num_bonds += 2; + /****** bonds i-j and j-i ******/ + ibond = &( bonds->select.bond_list[btop_i] ); + btop_j = End_Index( j, bonds ); + jbond = &(bonds->select.bond_list[btop_j]); + + ibond->nbr = j; + jbond->nbr = i; + ibond->d = r_ij; + jbond->d = r_ij; + rvec_Copy( ibond->dvec, nbr_pj->dvec ); + rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); + ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); + ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); + ibond->dbond_index = btop_i; + jbond->dbond_index = btop_i; + ibond->sym_index = btop_j; + jbond->sym_index = btop_i; + ++btop_i; + Set_End_Index( j, btop_j+1, bonds ); + + bo_ij = &( ibond->bo_data ); + bo_ji = &( jbond->bo_data ); + bo_ji->BO = bo_ij->BO = BO; + bo_ji->BO_s = bo_ij->BO_s = BO_s; + bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; + bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; + + /* Bond Order page2-3, derivative of total bond order prime */ + Cln_BOp_s = twbp->p_bo2 * C12 / r2; + Cln_BOp_pi = twbp->p_bo4 * C34 / r2; + Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; + + /* Only dln_BOp_xx wrt. dr_i is stored here, note that + dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ + rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); + rvec_Scale(bo_ij->dln_BOp_pi2, + -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); + rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); + rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); + rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); + + /* Only dBOp wrt. 
dr_i is stored here, note that + dBOp/dr_i = -dBOp/dr_j and all others are 0 */ + rvec_Scale( bo_ij->dBOp, + -(bo_ij->BO_s * Cln_BOp_s + + bo_ij->BO_pi * Cln_BOp_pi + + bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); + rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); + + rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); + rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); + + bo_ij->BO_s -= control->bo_cut; + bo_ij->BO -= control->bo_cut; + bo_ji->BO_s -= control->bo_cut; + bo_ji->BO -= control->bo_cut; + workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp + workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp + bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; + bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; + + Set_End_Index( j, btop_j+1, bonds ); + } + } + } + } + + H->entries[Htop].j = i; + H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; + ++Htop; + + Set_End_Index( i, btop_i, bonds ); + if( ihb == 1 ) + Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); + } + + // mark the end of j list + H->start[i] = Htop; + /* validate lists - decide if reallocation is required! */ + Validate_Lists( workspace, lists, + data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", + data->step, Htop, num_bonds, num_hbonds ); + //Print_Bonds( system, bonds, "sbonds.out" ); + //Print_Bond_List2( system, bonds, "sbonds.out" ); + //Print_Sparse_Matrix2( H, "H.out" ); +#endif +} + + +void Estimate_Storage_Sizes( reax_system *system, control_params *control, + list **lists, int *Htop, int *hb_top, + int *bond_top, int *num_3body ) { + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + int ihb, jhb; + real r_ij, r2; + real C12, C34, C56; + real BO, BO_s, BO_pi, BO_pi2; + real p_boc1, p_boc2; + list *far_nbrs; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + reax_atom *atom_i, *atom_j; + + far_nbrs = *lists + FAR_NBRS; + p_boc1 = system->reaxprm.gp.l[0]; + p_boc2 = system->reaxprm.gp.l[1]; + + for( i = 0; i < system->N; ++i ) { + atom_i = &(system->atoms[i]); + type_i = atom_i->type; + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + sbp_i = &(system->reaxprm.sbp[type_i]); + ihb = sbp_i->p_hbond; + + for( pj = start_i; pj < end_i; ++pj ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + atom_j = &(system->atoms[j]); + type_j = atom_j->type; + sbp_j = &(system->reaxprm.sbp[type_j]); + twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); + + if( nbr_pj->d <= control->r_cut ) { + ++(*Htop); + + /* hydrogen bond lists */ + if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && + nbr_pj->d <= control->hb_cut ) { + jhb = sbp_j->p_hbond; + if( ihb == 1 && jhb == 2 ) + ++hb_top[i]; + else if( ihb == 2 && jhb == 1 ) + ++hb_top[j]; + } + + /* uncorrected bond orders */ + if( nbr_pj->d <= control->nbr_cut ) { + r_ij = nbr_pj->d; + r2 = SQR(r_ij); + + if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { + C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); + BO_s = (1.0 + control->bo_cut) * EXP( C12 ); + } + else BO_s = C12 = 0.0; + + if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { + C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); + BO_pi = EXP( C34 ); + } + else BO_pi = C34 = 0.0; + + if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { + C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); + BO_pi2= EXP( C56 ); + } + 
else BO_pi2 = C56 = 0.0; + + /* Initially BO values are the uncorrected ones, page 1 */ + BO = BO_s + BO_pi + BO_pi2; + + if( BO >= control->bo_cut ) { + ++bond_top[i]; + ++bond_top[j]; + } + } + } + } + } + + *Htop += system->N; + *Htop *= SAFE_ZONE; + + for( i = 0; i < system->N; ++i ) { + hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); + *num_3body += SQR(bond_top[i]); + bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); + } + *num_3body *= SAFE_ZONE; +} + + +void Compute_Forces( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list** lists, output_controls *out_control ) +{ + real t_start, t_elapsed; + + t_start = Get_Time( ); + if( !control->tabulate ) + Init_Forces( system, control, data, workspace, lists, out_control ); + else Init_Forces_Tab( system, control, data, workspace, lists, out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.init_forces += t_elapsed; + +#if defined(DEBUG_FOCUS) + print_sparse_matrix (system, workspace); + fprintf( stderr, "init_forces - "); +#endif + + + //analyze_hbonds (system, workspace, lists); + + t_start = Get_Time( ); + Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.bonded += t_elapsed; + + //print_bond_list (system, workspace, lists); + //exit (0); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "bonded_forces - "); +#endif + + t_start = Get_Time( ); + Compute_NonBonded_Forces( system, control, data, workspace, + lists, out_control ); + t_elapsed = Get_Timing_Info( t_start ); + data->timing.nonb += t_elapsed; + +#ifdef __DEBUG_CUDA__ + fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed); +#endif + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "nonbondeds - "); +#endif + + Compute_Total_Force( system, control, data, workspace, lists ); + //Print_Total_Force( system, control, data, workspace, lists, out_control ); +#if defined(DEBUG_FOCUS) + fprintf( stderr, "totalforces - "); + //Print_Total_Force( system, control, data, workspace, lists, out_control ); +#endif + +#ifdef TEST_FORCES + Print_Total_Force( system, control, data, workspace, lists, out_control ); + Compare_Total_Forces( system, control, data, workspace, lists, out_control ); +#endif +#if defined(DEBUG_FOCUS) + fprintf( stderr, "forces - "); +#endif +} diff --git a/PuReMD-GPU/src/forces.cu b/PuReMD-GPU/src/forces.cu deleted file mode 100644 index e8e1e2917b34eefa913f25cab67b1123d08f892e..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/forces.cu +++ /dev/null @@ -1,2880 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. 
- ----------------------------------------------------------------------*/ - -#include "forces.h" -#include "box.h" -#include "bond_orders.h" -#include "single_body_interactions.h" -#include "two_body_interactions.h" -#include "three_body_interactions.h" -#include "four_body_interactions.h" -#include "list.h" -#include "print_utils.h" -#include "system_props.h" -#include "QEq.h" -#include "vector.h" - -#include "index_utils.h" -#include "cuda_utils.h" -#include "cuda_init.h" -#include "reduction.h" -//#include "matrix.h" - -#include "validation.h" - -#include "cudaProfiler.h" - - -void Dummy_Interaction( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ -} - - -void Init_Bonded_Force_Functions( control_params *control ) -{ - Interaction_Functions[0] = Calculate_Bond_Orders; - Interaction_Functions[1] = Bond_Energy; //*/Dummy_Interaction; - Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy; - //*/Dummy_Interaction; - Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction; - Interaction_Functions[4] = Four_Body_Interactions; //*/Dummy_Interaction; - if( control->hb_cut > 0 ) - Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction; - else Interaction_Functions[5] = Dummy_Interaction; - Interaction_Functions[6] = Dummy_Interaction; //empty - Interaction_Functions[7] = Dummy_Interaction; //empty - Interaction_Functions[8] = Dummy_Interaction; //empty - Interaction_Functions[9] = Dummy_Interaction; //empty -} - - -void Compute_Bonded_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - - int i; - real t_start, t_elapsed; - -#ifdef TEST_ENERGY - /* Mark beginning of a new timestep in each energy file */ - fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", - data->step, "atom1", "atom2", "bo", "ebond", "total" ); - fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", - data->step, "atom", "nlp", "elp", "total" ); - fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", - data->step, "atom", "eov", "total" ); - fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", - data->step, "atom", "eun", "total" ); - fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "angle", "bo(12)", "bo(23)", "eval", "epen", "total" ); - fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "angle", "bo(12)", "bo(23)", "epen", "total" ); - fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "angle", "bo(12)", "bo(23)", "ecoa", "total" ); - fprintf( out_control->ehb, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", - "r(23)", "angle", "bo(12)", "ehb", "total" ); - fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", "atom4", - "phi", "bo(23)", "etor", "total" ); - fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "atom3", "atom4", - "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" ); -#endif - - /* Implement all the function calls as function pointers */ - for( i = 0; i < NO_OF_INTERACTIONS; i++ ) { - //for( i = 0; i < 5; i++ ) { - t_start = Get_Time (); - (Interaction_Functions[i])(system, control, data, 
workspace, - lists, out_control); - t_elapsed = Get_Timing_Info ( t_start ); - -#ifdef __DEBUG_CUDA__ - fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed ); -#endif - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "f%d-", i ); -#endif -#ifdef TEST_FORCES - (Print_Interactions[i])(system, control, data, workspace, - lists, out_control); -#endif - } - } - - void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - real t_start, t_elapsed; - real *spad = (real *)scratch; - rvec *rvec_spad; - - //Compute the bonded for interaction here. - //Step 1. -#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); - fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE); -#endif - - Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, - *dev_workspace, system->reaxprm.num_atom_types, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, - system->reaxprm.d_tbp, *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), - system->reaxprm.num_atom_types, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>> - (*dev_workspace, *(dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, - *dev_workspace, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n"); -#endif - - - - //Step 2. -#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); -#endif - //cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH ); - cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH ); - - Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp, - (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), - system->N, system->reaxprm.num_atom_types, spad ); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_BE - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - //(spad + system->N, spad + system->N + 16, 16); - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "Cuda_Bond_Energy Done... \n"); -#endif - - //Step 3. 
-#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); -#endif - cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH ); - - test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - *dev_workspace, (simulation_data *)data->d_simulation_data, - *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, - spad, spad + 2 * system->N, spad + 4*system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - *dev_workspace, (simulation_data *)data->d_simulation_data, - *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, - spad, spad + 2 * system->N, spad + 4*system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - test_LonePair_Postprocess <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - *dev_workspace, (simulation_data *)data->d_simulation_data, - *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types); - cudaThreadSynchronize (); - cudaCheckError (); - - - //Reduction for E_Lp - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Ov - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Un - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 4*system->N, spad + 5*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "test_LonePair_postprocess Done... \n"); -#endif - - //Step 4. -#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); -#endif - - cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH); - Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, - (control_params *)control->d_control, - *(dev_lists + BONDS), - system->N, (int *)spad); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Three_Body_Estimate... 
return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed ); -#endif - - int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs); - memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs); - copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH); - - int total_3body = thbody [0] * SAFE_ZONE; - for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) { - total_3body += thbody [x]*SAFE_ZONE; - thbody [x] += thbody [x-1]; - } - system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1]; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs); -#endif - - if (!system->init_thblist) - { - system->init_thblist = true; - if(!Make_List((dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { - fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" ); - exit( INIT_ERR ); - } -#ifdef __CUDA_MEM__ - fprintf (stderr, "Device memory allocated: three body list = %d MB\n", - sizeof (three_body_interaction_data) * total_3body / (1024*1024)); -#endif - } else { - if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) { - int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs); - - /*Delete Three-body list*/ - Delete_List( dev_lists + THREE_BODIES, TYP_DEVICE ); - -#ifdef __CUDA_MEM__ - fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", - data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies); -#endif - /*Recreate Three-body list */ - if(!Make_List(size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { - fprintf( stderr, "Problem in initializing three-body list. 
Terminating!\n" ); - exit( INIT_ERR ); - } - } - } - - //copy the indexes into the thb list; - copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), - cudaMemcpyHostToDevice, LIST_INDEX); - copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), - cudaMemcpyHostToDevice, LIST_END_INDEX); - - free (thbody ); - -#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); -#endif - - cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); - - Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, - system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), - system->N, system->reaxprm.num_atom_types, - spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); - cudaThreadSynchronize (); - cudaCheckError (); - - //Not necessary to validate three-body list anymore, - // Estimate is already done at the beginning which makes sure that - // we have sufficient size for this list - //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step ); - - //Reduction for E_Ang - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Pen - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Coa - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 4*system->N, spad + 5*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 6*system->N); - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - real t_1, t_2; - t_1 = Get_Time (); - //Sum up the f vector for each atom and collect the CdDelta from all the bonds - Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, - (control_params *)control->d_control, - *dev_workspace, - *(dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - t_2 = Get_Timing_Info (t_1); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2); - fprintf 
(stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed ); - fprintf (stderr, "Three_Body_Interactions Done... \n"); -#endif - - //Step 5. -#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); -#endif - - cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH ); - Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>> - //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>> - ( system->d_atoms, - system->reaxprm.d_gp, - system->reaxprm.d_fbp, - (control_params *)control->d_control, - *(dev_lists + BONDS), *(dev_lists + THREE_BODIES), - (simulation_box *)system->d_box, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - system->N, system->reaxprm.num_atom_types, - spad, spad + 2*system->N, (rvec *) (spad + 4*system->N)); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Tor - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for E_Con - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 4*system->N); - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Post process here - Four_Body_Postprocess <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, - *dev_workspace, - *(dev_lists + BONDS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, " Four_Body_ Done... \n"); -#endif - - - //Step 6. - if (control->hb_cut > 0) { - -#ifdef __DEBUG_CUDA__ - t_start = Get_Time( ); -#endif - cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH ); - - /* - Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>> - ( system->d_atoms, - system->reaxprm.d_sbp, - system->reaxprm.d_hbp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->reaxprm.num_atom_types, - spad, (rvec *) (spad + 2*system->N), NULL); - cudaThreadSynchronize (); - cudaCheckError (); - */ - -#ifdef __DEBUG_CUDA__ - real test1,test2; - test1 = Get_Time (); -#endif - - int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + - (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 
0 : 1); - Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE ) >>> - ( system->d_atoms, - system->reaxprm.d_sbp, - system->reaxprm.d_hbp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *dev_workspace, - *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->reaxprm.num_atom_types, - spad, (rvec *) (spad + 2*system->N), NULL); - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - test2 = Get_Timing_Info (test1); - fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2); -#endif - - //Reduction for E_HB - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - - //Reduction for ext_pres - rvec_spad = (rvec *) (spad + 2*system->N); - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //Post process here -#ifdef __DEBUG_CUDA__ - real t_1, t_2; - t_1 = Get_Time (); -#endif - Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>> - ( system->d_atoms, - system->reaxprm.d_sbp, - *dev_workspace, - *(dev_lists + BONDS), - *(dev_lists + HBONDS), - *(dev_lists + FAR_NBRS), - system->N, - spad); //this is for the fix to use the shared memory - cudaThreadSynchronize (); - cudaCheckError (); - -#ifdef __DEBUG_CUDA__ - t_2 = Get_Timing_Info ( t_1 ); - fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); - t_1 = Get_Time (); -#endif - - //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>> - Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>> - ( system->d_atoms, - system->reaxprm.d_sbp, - *dev_workspace, - *(dev_lists + BONDS), - *(dev_lists + HBONDS), - *(dev_lists + FAR_NBRS), - system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - t_2 = Get_Timing_Info ( t_1 ); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2); - t_elapsed = Get_Timing_Info( t_start ); - fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed ); - fprintf (stderr, "Hydrogen_Bond Done... 
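Hydrogen_Bonds_HB above dedicates HBONDS_THREADS_PER_ATOM threads to each atom so the group can stride one atom's interaction slice cooperatively, which is why the grid size hbs is a ceiling division of N * HBONDS_THREADS_PER_ATOM by the block size. A distilled sketch of the indexing only; HB_K stands in for the constant, and the real kernel additionally stages partial forces in shared memory:

// Group-per-atom indexing as used by the hbond kernels above. HB_K
// must be a power of two so the lane mask works; start/end delimit
// each atom's interaction slice.
#define HB_K 32

__global__ void per_atom_groups( const int *start, const int *end, int n_atoms )
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int atom = tid / HB_K;            /* which atom this group serves */
    int lane = tid & (HB_K - 1);      /* this thread's slot in the group */

    if ( atom >= n_atoms )
        return;

    /* lanes stride the atom's slice together */
    for ( int pj = start[atom] + lane; pj < end[atom]; pj += HB_K )
    {
        /* ... process interaction pj for 'atom', then combine the
         * lanes' partial results in shared memory ... */
    }
}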
\n"); -#endif - } - return; - } - - void Compute_NonBonded_Forces( reax_system *system, control_params *control, - simulation_data *data,static_storage *workspace, - list** lists, output_controls *out_control ) - { - real t_start, t_elapsed; -#ifdef TEST_ENERGY - fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n", - data->step, "atom1", "atom2", "r12", "evdw", "total" ); - fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n", - data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" ); -#endif - - t_start = Get_Time( ); - QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.QEq += t_elapsed; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "qeq - " ); -#endif - - if ( control->tabulate == 0) - vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control ); - else - Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, - lists, out_control ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nonb forces - " ); -#endif - -#ifdef TEST_FORCES - Print_vdW_Coulomb_Forces( system, control, data, workspace, - lists, out_control ); -#endif - } - - void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, - simulation_data *data,static_storage *workspace, - list** lists, output_controls *out_control ) - { - real t_start, t_elapsed; - real t1 = 0, t2 = 0; - real *spad = (real *) scratch; - rvec *rvec_spad; - int cblks; - - t_start = Get_Time( ); - Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); - t_elapsed = Get_Timing_Info( t_start ); - d_timing.QEq += t_elapsed; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed ); -#endif - cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH ); - - t_start = Get_Time (); - if ( control->tabulate == 0) - { - cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + - ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1); - Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>> - ( system->d_atoms, - system->reaxprm.d_tbp, - system->reaxprm.d_gp, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *(dev_lists + FAR_NBRS), - spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), - system->reaxprm.num_atom_types, - system->N ) ; - cudaThreadSynchronize (); - cudaCheckError (); - } - else - { - cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + - ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1); - Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>> - ( (reax_atom *)system->d_atoms, - (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, - *(dev_lists + FAR_NBRS), - spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), - d_LR, - system->reaxprm.num_atom_types, - out_control->energy_update_freq, - system->N ) ; - - cudaThreadSynchronize (); - cudaCheckError (); - } - - t_elapsed = Get_Timing_Info (t_start ); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2)); - fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... 
%lf \n", (t_elapsed)); -#endif - - //Reduction on E_vdW - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (spad, spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - //reduction on E_Ele - Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (spad + 2*system->N, spad + 3*system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> - (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - rvec_spad = (rvec *) (spad + 4*system->N); - - //reduction on ext_press - Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> - (rvec_spad, rvec_spad + system->N, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> - (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - } - - - /* This version of Compute_Total_Force computes forces from coefficients - accumulated by all interaction functions. Saves enormous time & space! */ - void Compute_Total_Force( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists ) - { - int i, pj; - list *bonds = (*lists) + BONDS; - - for( i = 0; i < system->N; ++i ) - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) - Add_dBond_to_Forces( i, pj, system, data, workspace, lists ); - else - Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists ); - } - } - - - void Validate_Lists( static_storage *workspace, list **lists, int step, int n, - int Hmax, int Htop, int num_bonds, int num_hbonds ) - { - int i, flag; - list *bonds, *hbonds; - - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - /* far neighbors */ - if( Htop > Hmax * DANGER_ZONE ) { - workspace->realloc.Htop = Htop; - if( Htop > Hmax ) { - fprintf( stderr, - "step%d - ran out of space on H matrix: Htop=%d, max = %d", - step, Htop, Hmax ); - exit(INSUFFICIENT_SPACE); - } - } - - /* bond list */ - flag = -1; - workspace->realloc.num_bonds = num_bonds; - for( i = 0; i < n-1; ++i ) - if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) { - workspace->realloc.bonds = 1; - if( End_Index(i, bonds) > Start_Index(i+1, bonds) ) - flag = i; - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", - step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) ); - exit(INSUFFICIENT_SPACE); - } - - if( End_Index(i, bonds) >= bonds->num_intrs-2 ) { - workspace->realloc.bonds = 1; - - if( End_Index(i, bonds) > bonds->num_intrs ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", - step, flag, End_Index(i,bonds), bonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - - - /* hbonds list */ - if( workspace->num_H > 0 ) { - flag = -1; - workspace->realloc.num_hbonds = num_hbonds; - for( i = 0; i < workspace->num_H-1; ++i ) - if( Num_Entries(i, hbonds) >= - (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) 
{ - workspace->realloc.hbonds = 1; - if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) ) - flag = i; - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n", - step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) ); - exit(INSUFFICIENT_SPACE); - } - - if( Num_Entries(i,hbonds) >= - (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - - if( End_Index(i, hbonds) > hbonds->num_intrs ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", - step, flag, End_Index(i,hbonds), hbonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - } - } - - - void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n, - int num_bonds, int num_hbonds ) - { - int i, flag; - list *bonds, *hbonds, *thblist; - int *bonds_start, *bonds_end; - int *hbonds_start, *hbonds_end; - int *mat_start, *mat_end; - int max_sparse_entries = 0; - - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - bonds_start = (int *) calloc (bonds->n, INT_SIZE); - bonds_end = (int *) calloc (bonds->n, INT_SIZE); - - hbonds_start = (int *) calloc (hbonds->n, INT_SIZE ); - hbonds_end = (int *) calloc (hbonds->n, INT_SIZE ); - - mat_start = (int *) calloc (workspace->H.n, INT_SIZE ); - mat_end = (int *) calloc (workspace->H.n, INT_SIZE ); - - copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - - copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - - copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - - /* Sparse Matrix entries */ - -#ifdef __CUDA_TEST__ - /* - workspace->realloc.Htop = 0; - for (i = 0; i < workspace->H.n-1; i++) { - if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){ - workspace->realloc.Htop = mat_end[i] - mat_start[i]; - } - } - */ -#endif - - flag = -1; - workspace->realloc.Htop = 0; - for ( i = 0; i < n-1; i ++){ - - if( (mat_end[i] - mat_start[i]) > - (system->max_sparse_matrix_entries * DANGER_ZONE )) { - //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", - // step, i, mat_start[i], mat_end[i]); - if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) - workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; - } - - if ( (mat_end[i] > mat_start[i+1]) ){ - fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n", - step, flag, mat_end[i], mat_start[i+1]); - exit(INSUFFICIENT_SPACE); - } - } - - if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) { - if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])) - workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ; - //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d) -- %d \n", - // step, i, mat_start[i], mat_end[i], - // (int) (system->max_sparse_matrix_entries * DANGER_ZONE)); - - if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) { - fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n", - step, flag, mat_end[i], system->N * 
system->max_sparse_matrix_entries); - exit(INSUFFICIENT_SPACE); - } - } - - - /* bond list */ -#ifdef __CUDA_TEST__ - //workspace->realloc.bonds = 1; -#endif - flag = -1; - workspace->realloc.num_bonds = 0; - for( i = 0; i < n-1; ++i ) { - workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); - if( bonds_end[i] >= bonds_start[i+1]-2 ) { - workspace->realloc.bonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", - // step, i, bonds_start [i], bonds_end[i]); - if( bonds_end[i] > bonds_start[i+1] ) - flag = i; - } - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n", - step, flag, bonds_end[flag], bonds_start[flag+1] ); - exit(INSUFFICIENT_SPACE); - } - - workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS ); - if( bonds_end[i] >= bonds->num_intrs-2 ) { - workspace->realloc.bonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", - // step, i, bonds_start [i], bonds_end[i]); - - if( bonds_end[i] > bonds->num_intrs ) { - fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n", - step, flag, bonds_end[i], bonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - - //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds); - - /* hbonds list */ - if( workspace->num_H > 0 ) { -#ifdef __CUDA_TEST__ - //workspace->realloc.hbonds = 1; -#endif - flag = -1; - workspace->realloc.num_hbonds = 0; - for( i = 0; i < workspace->num_H-1; ++i ) { - workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); - - if( (hbonds_end[i] - hbonds_start[i]) >= - (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", - // step, i, hbonds_start [i], hbonds_end[i]); - if( hbonds_end[i] > hbonds_start[i+1] ) - flag = i; - } - } - - if( flag > -1 ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n", - step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] ); - exit(INSUFFICIENT_SPACE); - } - - workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS ); - if( (hbonds_end[i] - hbonds_start[i]) >= - (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) { - workspace->realloc.hbonds = 1; - //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", - // step, i, hbonds_start [i], hbonds_end[i]); - - if( hbonds_end[i] > hbonds->num_intrs ) { - fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n", - step, flag, hbonds_end[i], hbonds->num_intrs ); - exit(INSUFFICIENT_SPACE); - } - } - } - - //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds); - - free (bonds_start); - free (bonds_end ); - - free (hbonds_start ); - free (hbonds_end ); - - free (mat_start ); - free (mat_end ); - } - - void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step ) - { - int *thb_start, *thb_end; - int i, flag; - - thb_start = (int *) calloc (thblist->n, INT_SIZE); - thb_end = (int *) calloc (thblist->n, INT_SIZE ); - - copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ ); - 
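Cuda_Validate_Lists above recomputes capacities on the host from the copied index/end_index arrays: the next per-atom bond allocation is at least MIN_BONDS and otherwise twice the current count, and a reallocation is flagged once an atom's used slice reaches the DANGER_ZONE fraction of the room before the next atom's slice begins. A condensed sketch of that policy for one list, with illustrative names:

#define MAX_OF(a,b) ((a) > (b) ? (a) : (b))

/* Returns 1 if the list must be reallocated and writes the new total
 * capacity estimate to *new_total. start/end are the host copies of
 * the device index/end_index arrays. */
int check_list_capacity( int n, const int *start, const int *end,
        double danger_zone, int min_per_atom, int *new_total )
{
    int i, used, room, need_realloc = 0;

    *new_total = 0;
    for ( i = 0; i < n; ++i )
    {
        used = end[i] - start[i];
        room = (i < n - 1) ? (start[i + 1] - start[i]) : used;

        *new_total += MAX_OF( 2 * used, min_per_atom );
        if ( used >= room * danger_zone )
            need_realloc = 1;
    }

    return need_realloc;
}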
-    /*three_body list*/
-    flag = -1;
-    workspace->realloc.num_3body = 0;
-    for( i = 0; i < thblist->n-1; ++i ){
-        if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) {
-            workspace->realloc.thbody = 1;
-            if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) {
-                flag = i;
-                break;
-            }
-        }
-    }
-
-    if( flag > -1 ) {
-        //fprintf( stderr, "step%d-thbchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-        //    step, flag, thb_end[flag], thb_start[flag+1] );
-        fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-            step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs );
-        fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-            step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs );
-        exit(INSUFFICIENT_SPACE);
-    }
-
-    if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) {
-        workspace->realloc.thbody = 1;
-
-        if( thb_end[i] > thblist->num_intrs ) {
-            fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-                step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs );
-            fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-                step, i, thb_start[i], thb_end[i], thblist->num_intrs );
-            exit(INSUFFICIENT_SPACE);
-        }
-    }
-
-    free (thb_start);
-    free (thb_end);
-}
-
-
-void Init_Forces( reax_system *system, control_params *control,
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) {
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-    int ihb, jhb, ihb_top, jhb_top;
-    int flag;
-    real r_ij, r2, self_coef;
-    real dr3gamij_1, dr3gamij_3, Tap;
-    //real val, dif, base;
-    real C12, C34, C56;
-    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-    real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;
-    sparse_matrix *H;
-    list *far_nbrs, *bonds, *hbonds;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    //LR_lookup_table *t;
-    reax_atom *atom_i, *atom_j;
-    bond_data *ibond, *jbond;
-    bond_order_data *bo_ij, *bo_ji;
-
-    far_nbrs = *lists + FAR_NBRS;
-    bonds = *lists + BONDS;
-    hbonds = *lists + HBONDS;
-
-    H = &workspace->H;
-    Htop = 0;
-    num_bonds = 0;
-    num_hbonds = 0;
-    btop_i = btop_j = 0;
-    p_boc1 = system->reaxprm.gp.l[0];
-    p_boc2 = system->reaxprm.gp.l[1];
-
-    for( i = 0; i < system->N; ++i ) {
-        atom_i = &(system->atoms[i]);
-        type_i = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i = End_Index(i, far_nbrs);
-        H->start[i] = Htop;
-        btop_i = End_Index( i, bonds );
-        sbp_i = &(system->reaxprm.sbp[type_i]);
-        ihb = ihb_top = -1;
-        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
-            ihb_top = End_Index( workspace->hbond_index[i], hbonds );
-
-        for( pj = start_i; pj < end_i; ++pj ) {
-            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(system->atoms[j]);
-
-            flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) {
-                if( nbr_pj->d <= control->r_cut)
-                    flag = 1;
-                else flag = 0;
-            }
-            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                nbr_pj->dvec))<=SQR(control->r_cut)){
-                nbr_pj->d = sqrt(nbr_pj->d);
-                flag = 1;
-            }
-
-            if( flag ){
-                type_j = system->atoms[j].type;
-                r_ij = nbr_pj->d;
-                sbp_j = &(system->reaxprm.sbp[type_j]);
-                twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-                self_coef = (i == j) ?
0.5 : 1.0; - - /* H matrix entry */ - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; - ++Htop; - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) { - hbonds->select.hbond_list[ihb_top].nbr = j; - hbonds->select.hbond_list[ihb_top].scl = 1; - hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; - ++ihb_top; - ++num_hbonds; - } - else if( ihb == 2 && jhb == 1 ) { - jhb_top = End_Index( workspace->hbond_index[j], hbonds ); - hbonds->select.hbond_list[jhb_top].nbr = i; - hbonds->select.hbond_list[jhb_top].scl = -1; - hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; - Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - num_bonds += 2; - /****** bonds i-j and j-i ******/ - ibond = &( bonds->select.bond_list[btop_i] ); - btop_j = End_Index( j, bonds ); - jbond = &(bonds->select.bond_list[btop_j]); - - ibond->nbr = j; - jbond->nbr = i; - ibond->d = r_ij; - jbond->d = r_ij; - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - ibond->dbond_index = btop_i; - jbond->dbond_index = btop_i; - ibond->sym_index = btop_j; - jbond->sym_index = btop_i; - ++btop_i; - Set_End_Index( j, btop_j+1, bonds ); - - bo_ij = &( ibond->bo_data ); - bo_ji = &( jbond->bo_data ); - bo_ji->BO = bo_ij->BO = BO; - bo_ji->BO_s = bo_ij->BO_s = BO_s; - bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; - bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - /* Only dln_BOp_xx wrt. dr_i is stored here, note that - dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); - - /* Only dBOp wrt. 
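The Tap chain above is Horner's rule applied to the degree-7 taper polynomial that smoothly switches the Coulomb term off at the cutoff; each assignment folds in one coefficient, from Tap7 down to Tap0. The same computation as a loop, assuming the coefficients control->Tap0 .. control->Tap7 are gathered into an array (an assumption of this sketch):

/* Horner evaluation of Tap(r) = Tap7*r^7 + ... + Tap1*r + Tap0,
 * matching the unrolled assignments above. c[k] is the degree-k
 * coefficient. */
double taper( double r, const double c[8] )
{
    double tap = c[7];
    int k;

    for ( k = 6; k >= 0; --k )
        tap = tap * r + c[k];

    return tap;
}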
dr_i is stored here, note that - dBOp/dr_i = -dBOp/dr_j and all others are 0 */ - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); - - rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); - rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp - workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - - /*fprintf( stderr, "%d %d %g %g %g\n", - i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/ - - /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", - Cln_BOp_s, twbp->p_bo2, C12 ); - fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", - Cln_BOp_pi, twbp->p_bo4, C34 ); - fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n", - Cln_BOp_pi2, twbp->p_bo6, C56 );*/ - /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2); - fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4); - fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6); - fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", - twbp->r_s, twbp->r_p, twbp->r_pp ); - fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/ - - /*fprintf( stderr, "\tfactors: %g %g %g\n", - -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pp), - -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/ - /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", - bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] ); - fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", - bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], - bo_ij->dln_BOp_pi[2] ); - fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n", - bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], - bo_ij->dln_BOp_pi2[2] );*/ - - Set_End_Index( j, btop_j+1, bonds ); - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; - ++Htop; - - Set_End_Index( i, btop_i, bonds ); - if( ihb == 1 ) - Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); - //fprintf( stderr, "%d bonds start: %d, end: %d\n", - // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); - } - - // mark the end of j list - H->start[i] = Htop; - /* validate lists - decide if reallocation is required! 
*/ - Validate_Lists( workspace, lists, - data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", - data->step, Htop, num_bonds, num_hbonds ); - -#endif - } - - - GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, - simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) { - - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop; - int flag; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - int temp; - - Htop = 0; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - indices[i] = Htop; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(atoms[j]); - - //CHANGE ORIGINAL - //if (i < j) continue; - //CHANGE ORIGINAL - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if( nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec)) <= - SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - - if( flag ){ - ++Htop; - } - } - - ++Htop; - - // mark the end of j list - indices[i] = Htop; - } - - - - - GLOBAL void Init_Forces( reax_atom *atoms, global_parameters g_params, control_params *control, - single_body_parameters *sbp, two_body_parameters *tbp, - simulation_data *data, simulation_box *box, static_storage workspace, - list far_nbrs, list bonds, list hbonds, - int N, int max_sparse_entries, int num_atom_types ) - { - - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real dr3gamij_1, dr3gamij_3, Tap; - //real val, dif, base; - real C12, C34, C56; - real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - sparse_matrix *H; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - //LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - H = &( workspace.H ); - //CHANGE ORIGINAL - //Htop = 0; - Htop = i * max_sparse_entries; - //CHANGE ORIGINAL - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - //for( i = 0; i < system->N; ++i ) - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - - H->start[i] = Htop; - H->end[i] = Htop; - - btop_i = End_Index( i, &bonds ); - sbp_i = &(sbp[type_i]); - ihb = ihb_top = -1; - - ihb = sbp_i->p_hbond; - - if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) - ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if( nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if (i > j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } else if 
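Because threads fill the charge matrix concurrently, the kernel above replaces the serial running Htop with a fixed-stride layout: row i owns the slice starting at i * max_sparse_entries, and H->start[i] / H->end[i] record how much of the slice was actually used, so no shared counter or atomics are needed. A sketch of that layout with simplified types:

// Fixed-stride row layout for concurrent sparse-matrix fill: thread i
// appends only inside its reserved slice; the unused tail of each
// slice is skipped by later passes via row_end.
struct h_entry { int j; double val; };

__global__ void fill_rows( struct h_entry *entries, int *row_start,
        int *row_end, int max_per_row, int n )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int top;

    if ( i >= n )
        return;

    top = i * max_per_row;       /* base of row i's private slice */
    row_start[i] = top;

    /* ... for each kept neighbor j of i:
     *     entries[top].j = j; entries[top].val = ...; ++top;  ... */

    row_end[i] = top;            /* one past the last used entry */
}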
(i < j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } - - if( flag ){ - - type_j = atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(sbp[type_j]); - twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]); - self_coef = (i == j) ? 0.5 : 1.0; - - /* H matrix entry */ - - //CHANGE ORIGINAL - //if (i > j) { - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3; - - ++Htop; - //} - //CHANGE ORIGINAL - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - - if (ihb == 1 && jhb == 2) { - if (i > j) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } else { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } else if (ihb == 2 && jhb == 1) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - //TODO - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - - if( BO >= control->bo_cut ) { - //CHANGE ORIGINAL - num_bonds += 1; - //CHANGE ORIGINAL - - /****** bonds i-j and j-i ******/ - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - - if (i > j) - { - ibond = &( bonds.select.bond_list[btop_i] ); - ibond->nbr = j; - ibond->d = r_ij; - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - - //ibond->dbond_index = btop_i; - //ibond->sym_index = btop_j; - ++btop_i; - - bo_ij = &( ibond->bo_data ); - bo_ij->BO = BO; - bo_ij->BO_s = BO_s; - bo_ij->BO_pi = BO_pi; - bo_ij->BO_pi2 = 
BO_pi2; - - //Auxilary data structures - ibond->scratch = 0; - ibond->CdDelta_ij = 0; - rvec_MakeZero (ibond->f); - - ibond->l = -1; - ibond->CdDelta_jk = 0; - ibond->Cdbo_kl = 0; - rvec_MakeZero (ibond->i_f); - rvec_MakeZero (ibond->k_f); - - rvec_MakeZero (ibond->h_f); - - rvec_MakeZero (ibond->t_f); - - // Only dln_BOp_xx wrt. dr_i is stored here, note that - // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - - // Only dBOp wrt. dr_i is stored here, note that - // dBOp/dr_i = -dBOp/dr_j and all others are 0 - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - - rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp - - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - - - } else if ( i < j ) - { - rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; - rvec dBOp; - - btop_j = btop_i; - - jbond = &(bonds.select.bond_list[btop_j]); - jbond->nbr = j; - jbond->d = r_ij; - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - - btop_i ++; - //jbond->dbond_index = btop_i; - //jbond->sym_index = btop_i; - - bo_ji = &( jbond->bo_data ); - bo_ji->BO = BO; - bo_ji->BO_s = BO_s; - bo_ji->BO_pi = BO_pi; - bo_ji->BO_pi2 = BO_pi2; - - //Auxilary data structures - jbond->scratch = 0; - jbond->CdDelta_ij = 0; - rvec_MakeZero (jbond->f); - - jbond->l = -1; - jbond->CdDelta_jk = 0; - jbond->Cdbo_kl = 0; - rvec_MakeZero (jbond->i_f); - rvec_MakeZero (jbond->k_f); - - rvec_MakeZero (jbond->h_f); - - rvec_MakeZero (jbond->t_f); - - // Only dln_BOp_xx wrt. dr_i is stored here, note that - // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 - rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi2, - -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); - - rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); - - // Only dBOp wrt. dr_i is stored here, note that - // dBOp/dr_i = -dBOp/dr_j and all others are 0 - rvec_Scale( dBOp, - -(BO_s * Cln_BOp_s + - BO_pi * Cln_BOp_pi + - BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec ); - rvec_Scale( bo_ji->dBOp, -1., dBOp ); - - rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp ); - - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp - - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - - } - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = sbp[type_i].eta; - ++Htop; - - H->end[i] = Htop; - - Set_End_Index( i, btop_i, &bonds ); - if( ihb == 1 || ihb == 2) - Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); - - //fprintf( stderr, "%d bonds start: %d, end: %d\n", - // i, Start_Index( i, bonds ), End_Index( i, bonds ) ); - //} - - // mark the end of j list - //H->start[i] = Htop; - /* validate lists - decide if reallocation is required! 
*/ - //Validate_Lists( workspace, lists, - // data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); -} - -GLOBAL void Init_Forces_Tab ( reax_atom *atoms, global_parameters g_params, control_params *control, - single_body_parameters *sbp, two_body_parameters *tbp, - simulation_data *data, simulation_box *box, static_storage workspace, - list far_nbrs, list bonds, list hbonds, - int N, int max_sparse_entries, int num_atom_types, - LR_lookup_table *d_LR) -{ - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int tmin, tmax, r; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real val, dif, base; - real C12, C34, C56; - real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - sparse_matrix *H; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - H = &(workspace.H); - //CHANGE ORIGINAL - Htop = i * max_sparse_entries; - //CHANGE ORIGINAL - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = g_params.l[0]; - p_boc2 = g_params.l[1]; - - //for( i = 0; i < system->N; ++i ) - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - H->start[i] = Htop; - H->end[i] = Htop; - btop_i = End_Index( i, &bonds ); - sbp_i = &(sbp[type_i]); - ihb = ihb_top = -1; - - ihb = sbp_i->p_hbond; - - if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) - ihb_top = End_Index( workspace.hbond_index[i], &hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if(nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if (i > j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } - else if ( i < j) { - if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - } - - if( flag ){ - type_j = atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(sbp[type_j]); - twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]); - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]); - - /* cubic spline interpolation */ - //CHANGE ORIGINAL - //if (i > j) { - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - val *= EV_to_KCALpMOL / C_ele; - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * val; - //H->j [Htop] = j; - //H->val [Htop] = self_coef * val; - ++Htop; - //} - //CHANGE ORIGINAL - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - - if ( ihb == 1 && jhb == 2 ) { - if (i > j) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } else { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = -1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } else if (ihb == 2 && jhb == 1) { - hbonds.select.hbond_list[ihb_top].nbr = j; - hbonds.select.hbond_list[ihb_top].scl = 1; - hbonds.select.hbond_list[ihb_top].ptr = nbr_pj; - - //Auxilary data structures - rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f); - hbonds.select.hbond_list[ihb_top].sym_index= -1; - ++ihb_top; - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - - //CHANGE ORIGINAL - num_bonds += 1; - //CHANGE ORIGINAL - - /****** bonds i-j and j-i ******/ - if ( i > j ) - { - ibond = &( bonds.select.bond_list[btop_i] ); - ibond->nbr = j; - ibond->d = r_ij; - - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - - //ibond->dbond_index = btop_i; - //ibond->sym_index = btop_j; - - ++btop_i; - - bo_ij = &( ibond->bo_data ); - bo_ij->BO = BO; - bo_ij->BO_s = BO_s; - bo_ij->BO_pi = BO_pi; - bo_ij->BO_pi2 = BO_pi2; - - //Auxilary data strucutres to resolve dependencies - ibond->scratch = 0; - ibond->CdDelta_ij = 0; - rvec_MakeZero (ibond->f); - - ibond->l = -1; - ibond->CdDelta_jk = 0; - ibond->Cdbo_kl = 0; - rvec_MakeZero (ibond->i_f); - rvec_MakeZero (ibond->k_f); - - rvec_MakeZero (ibond->h_f); - - rvec_MakeZero (ibond->t_f); - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = 
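The tabulated path swaps the analytic kernel for a cubic-spline lookup: r_ij * inv_dx selects a table segment, dif is the offset from that segment's base, and the value is the nested Horner form ((d*dif + c)*dif + b)*dif + a. The same interpolation as a standalone function; the struct mirrors the t->ele[r].a .. d usage above, while the project's LR_lookup_table actually carries several such tables per atom-type pair:

typedef struct { double a, b, c, d; } cubic_coef;

/* Cubic-spline table lookup in the style above: ele holds one
 * coefficient set per segment of width dx, inv_dx = 1/dx, and the
 * r == 0 bump mirrors the guard in the original. */
double spline_eval( const cubic_coef *ele, double dx, double inv_dx,
        double r_ij )
{
    int r = (int)(r_ij * inv_dx);
    double base, dif;

    if ( r == 0 )
        ++r;
    base = (double)(r + 1) * dx;
    dif = r_ij - base;

    return ((ele[r].d * dif + ele[r].c) * dif + ele[r].b) * dif + ele[r].a;
}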
twbp->p_bo6 * C56 / r2; - - /* Only dln_BOp_xx wrt. dr_i is stored here, note that - dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - - /* Only dBOp wrt. dr_i is stored here, note that - dBOp/dr_i = -dBOp/dr_j and all others are 0 */ - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - - rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - - workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp - - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - } - else { - rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; - rvec dBOp; - - btop_j = btop_i; - - jbond = &( bonds.select.bond_list[btop_j] ); - jbond->nbr = j; - jbond->d = r_ij; - - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - - //jbond->dbond_index = btop_i; - //jbond->sym_index = btop_i; - - ++btop_i; - - bo_ji = &( jbond->bo_data ); - - bo_ji->BO = BO; - bo_ji->BO_s = BO_s; - bo_ji->BO_pi = BO_pi; - bo_ji->BO_pi2 = BO_pi2; - - // Auxilary data structures to resolve dependencies - jbond->scratch = 0; - jbond->CdDelta_ij = 0; - rvec_MakeZero (jbond->f); - - jbond->l = -1; - jbond->CdDelta_jk = 0; - jbond->Cdbo_kl = 0; - rvec_MakeZero (jbond->i_f); - rvec_MakeZero (jbond->k_f); - - rvec_MakeZero (jbond->h_f); - - rvec_MakeZero (jbond->t_f); - - // Bond Order page2-3, derivative of total bond order prime - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - // Only dln_BOp_xx wrt. dr_i is stored here, note that - // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 - - rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec); - rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec); - - rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 ); - - // Only dBOp wrt. 
dr_i is stored here, note that - // dBOp/dr_i = -dBOp/dr_j and all others are 0 - //CHANGE ORIGINAL - rvec_Scale( dBOp, - -(BO_s * Cln_BOp_s + - BO_pi * Cln_BOp_pi + - BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec); - rvec_Scale( bo_ji->dBOp, -1., dBOp); - //CHANGE ORIGINAL - - rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp ); - - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - - workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp - - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - } - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = sbp[type_i].eta; - - //H->j [Htop] = i; - //H->val [Htop] = sbp[type_i].eta; - - ++Htop; - - H->end[i] = Htop; - Set_End_Index( i, btop_i, &bonds ); - if( ihb == 1 || ihb == 2) - Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds ); -} - -GLOBAL void fix_sym_dbond_indices (list pbonds, int N) -{ - int i, nbr; - bond_data *ibond, *jbond; - int atom_j; - - list *bonds = &pbonds; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) - { - ibond = &( bonds->select.bond_list [j] ); - nbr = ibond->nbr; - - for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++) - { - jbond = &( bonds->select.bond_list[ k ] ); - atom_j = jbond->nbr; - - if ( (atom_j == i) ) - { - if (i > nbr) { - ibond->dbond_index = j; - jbond->dbond_index = j; - - ibond->sym_index = k; - jbond->sym_index = j; - } - } - } - } -} - - -GLOBAL void fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N) -{ - static_storage *workspace = &p_workspace; - hbond_data *ihbond, *jhbond; - int nbr; - - //int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4; - int i = (blockIdx.x); - int start = Start_Index (workspace->hbond_index[i], &hbonds); - int end = End_Index (workspace->hbond_index[i], &hbonds); - //int j = start + threadIdx.x; - //int j = start + (threadIdx.x % 16); - - //for (int j = Start_Index (workspace->hbond_index[i], &hbonds); - // j < End_Index (workspace->hbond_index[i], &hbonds); j++) - int j = start + threadIdx.x; - while (j < end) - //for (int j = start; j < end; j++) - { - ihbond = &( hbonds.select.hbond_list [j] ); - nbr = ihbond->nbr; - - int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); - int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds); - - for (int k = nbrstart; k < nbrend; k++) - //k = nbrstart + threadIdx.x; - //while (k < nbrend) - { - jhbond = &( hbonds.select.hbond_list [k] ); - - if (jhbond->nbr == i){ - ihbond->sym_index = k; - jhbond->sym_index = j; - break; - } - - //k += blockDim.x; - } - - j += 32; - } -} - -GLOBAL void New_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N ) -{ - - static_storage *workspace = &p_workspace; - hbond_data *ihbond, *jhbond; - - int __THREADS_PER_ATOM__ = HBONDS_SYM_THREADS_PER_ATOM; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id >= N) return; - - int i = warp_id; - int nbr; - int k; - int start = Start_Index (workspace->hbond_index[i], &hbonds); - int end = End_Index (workspace->hbond_index[i], &hbonds); - int j = start + lane_id; - //for (int j = start; j < end; j++) - while (j < end) - { - ihbond = &( hbonds.select.hbond_list [j] ); - nbr = ihbond->nbr; - - int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds); - int nbrend = 
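fix_sym_dbond_indices and both hbond variants above solve one problem: lists are built per atom in isolation, so each directed entry (i to j) must afterwards locate its mirror (j to i) and record it in sym_index for the derivative passes. The New_ version gives each atom a group of HBONDS_SYM_THREADS_PER_ATOM lanes that stride the atom's entries and scan the neighbor's slice for the entry pointing back. A distilled sketch; SYM_K and the flat arrays are assumptions of this sketch:

// Mirror-entry pairing, lane-strided as in New_fix_sym_hbond_indices.
// nbr[e] is the target atom of directed entry e; sym[] receives the
// index of the reverse entry. The duplicate write that occurs when the
// other atom's group processes its own entries stores the same values.
#define SYM_K 16

__global__ void pair_sym_indices( const int *start, const int *end,
        const int *nbr, int *sym, int n_atoms )
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int atom = tid / SYM_K;
    int lane = tid & (SYM_K - 1);

    if ( atom >= n_atoms )
        return;

    for ( int e = start[atom] + lane; e < end[atom]; e += SYM_K )
    {
        int other = nbr[e];

        for ( int k = start[other]; k < end[other]; ++k )
            if ( nbr[k] == atom )
            {
                sym[e] = k;     /* found the (other -> atom) mirror */
                sym[k] = e;
                break;
            }
    }
}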
End_Index (workspace->hbond_index[nbr], &hbonds); - - //k = nbrstart + lane_id; - //if (lane_id == 0) found [my_bucket] = 0; - //while (k < nbrend) - for (k = nbrstart; k < nbrend; k++) - { - jhbond = &( hbonds.select.hbond_list [k] ); - - if (jhbond->nbr == i){ - ihbond->sym_index = k; - jhbond->sym_index = j; - break; - } - } - - j += __THREADS_PER_ATOM__; - } -} - - -void Init_Forces_Tab( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int Htop, btop_i, btop_j, num_bonds, num_hbonds; - int tmin, tmax, r; - int ihb, jhb, ihb_top, jhb_top; - int flag; - real r_ij, r2, self_coef; - real val, dif, base; - real C12, C34, C56; - real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - sparse_matrix *H; - list *far_nbrs, *bonds, *hbonds; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; - bond_order_data *bo_ij, *bo_ji; - - far_nbrs = *lists + FAR_NBRS; - bonds = *lists + BONDS; - hbonds = *lists + HBONDS; - - H = &workspace->H; - Htop = 0; - num_bonds = 0; - num_hbonds = 0; - btop_i = btop_j = 0; - p_boc1 = system->reaxprm.gp.l[0]; - p_boc2 = system->reaxprm.gp.l[1]; - - for( i = 0; i < system->N; ++i ) { - atom_i = &(system->atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - H->start[i] = Htop; - btop_i = End_Index( i, bonds ); - sbp_i = &(system->reaxprm.sbp[type_i]); - ihb = ihb_top = -1; - if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) - ihb_top = End_Index( workspace->hbond_index[i], hbonds ); - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(system->atoms[j]); - - flag = 0; - if((data->step-data->prev_steps) % control->reneighbor == 0) { - if(nbr_pj->d <= control->r_cut) - flag = 1; - else flag = 0; - } - else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box), - nbr_pj->dvec))<=SQR(control->r_cut)){ - nbr_pj->d = sqrt(nbr_pj->d); - flag = 1; - } - - if( flag ){ - type_j = system->atoms[j].type; - r_ij = nbr_pj->d; - sbp_j = &(system->reaxprm.sbp[type_j]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); - - /* cubic spline interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - val *= EV_to_KCALpMOL / C_ele; - - H->entries[Htop].j = j; - H->entries[Htop].val = self_coef * val; - ++Htop; - - /* hydrogen bond lists */ - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) { - hbonds->select.hbond_list[ihb_top].nbr = j; - hbonds->select.hbond_list[ihb_top].scl = 1; - hbonds->select.hbond_list[ihb_top].ptr = nbr_pj; - ++ihb_top; - ++num_hbonds; - } - else if( ihb == 2 && jhb == 1 ) { - jhb_top = End_Index( workspace->hbond_index[j], hbonds ); - hbonds->select.hbond_list[jhb_top].nbr = i; - hbonds->select.hbond_list[jhb_top].scl = -1; - hbonds->select.hbond_list[jhb_top].ptr = nbr_pj; - Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds ); - ++num_hbonds; - } - } - - /* uncorrected bond orders */ - if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) { - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - num_bonds += 2; - /****** bonds i-j and j-i ******/ - ibond = &( bonds->select.bond_list[btop_i] ); - btop_j = End_Index( j, bonds ); - jbond = &(bonds->select.bond_list[btop_j]); - - ibond->nbr = j; - jbond->nbr = i; - ibond->d = r_ij; - jbond->d = r_ij; - rvec_Copy( ibond->dvec, nbr_pj->dvec ); - rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); - ivec_Copy( ibond->rel_box, nbr_pj->rel_box ); - ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box ); - ibond->dbond_index = btop_i; - jbond->dbond_index = btop_i; - ibond->sym_index = btop_j; - jbond->sym_index = btop_i; - ++btop_i; - Set_End_Index( j, btop_j+1, bonds ); - - bo_ij = &( ibond->bo_data ); - bo_ji = &( jbond->bo_data ); - bo_ji->BO = bo_ij->BO = BO; - bo_ji->BO_s = bo_ij->BO_s = BO_s; - bo_ji->BO_pi = bo_ij->BO_pi = BO_pi; - bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2; - - /* Bond Order page2-3, derivative of total bond order prime */ - Cln_BOp_s = twbp->p_bo2 * C12 / r2; - Cln_BOp_pi = twbp->p_bo4 * C34 / r2; - Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2; - - /* Only dln_BOp_xx wrt. dr_i is stored here, note that - dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */ - rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec); - rvec_Scale(bo_ij->dln_BOp_pi2, - -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec); - rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s); - rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi ); - rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 ); - - /* Only dBOp wrt. 
dr_i is stored here, note that - dBOp/dr_i = -dBOp/dr_j and all others are 0 */ - rvec_Scale( bo_ij->dBOp, - -(bo_ij->BO_s * Cln_BOp_s + - bo_ij->BO_pi * Cln_BOp_pi + - bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec ); - rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp ); - - rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp ); - rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp ); - - bo_ij->BO_s -= control->bo_cut; - bo_ij->BO -= control->bo_cut; - bo_ji->BO_s -= control->bo_cut; - bo_ji->BO -= control->bo_cut; - workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp - workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp - bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0; - bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0; - - Set_End_Index( j, btop_j+1, bonds ); - } - } - } - } - - H->entries[Htop].j = i; - H->entries[Htop].val = system->reaxprm.sbp[type_i].eta; - ++Htop; - - Set_End_Index( i, btop_i, bonds ); - if( ihb == 1 ) - Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds ); - } - - // mark the end of j list - H->start[i] = Htop; - /* validate lists - decide if reallocation is required! */ - Validate_Lists( workspace, lists, - data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", - data->step, Htop, num_bonds, num_hbonds ); - //Print_Bonds( system, bonds, "sbonds.out" ); - //Print_Bond_List2( system, bonds, "sbonds.out" ); - //Print_Sparse_Matrix2( H, "H.out" ); -#endif -} - -void Estimate_Storage_Sizes( reax_system *system, control_params *control, - list **lists, int *Htop, int *hb_top, - int *bond_top, int *num_3body ) { - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int ihb, jhb; - real r_ij, r2; - real C12, C34, C56; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - list *far_nbrs; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - far_nbrs = *lists + FAR_NBRS; - p_boc1 = system->reaxprm.gp.l[0]; - p_boc2 = system->reaxprm.gp.l[1]; - - for( i = 0; i < system->N; ++i ) { - atom_i = &(system->atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - sbp_i = &(system->reaxprm.sbp[type_i]); - ihb = sbp_i->p_hbond; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &(system->atoms[j]); - type_j = atom_j->type; - sbp_j = &(system->reaxprm.sbp[type_j]); - twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]); - - if( nbr_pj->d <= control->r_cut ) { - ++(*Htop); - - /* hydrogen bond lists */ - if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) - ++hb_top[i]; - else if( ihb == 2 && jhb == 1 ) - ++hb_top[j]; - } - - /* uncorrected bond orders */ - if( nbr_pj->d <= control->nbr_cut ) { - r_ij = nbr_pj->d; - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = 
C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - ++bond_top[i]; - ++bond_top[j]; - } - } - } - } - } - - *Htop += system->N; - *Htop *= SAFE_ZONE; - - for( i = 0; i < system->N; ++i ) { - hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); - *num_3body += SQR(bond_top[i]); - bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); - } - *num_3body *= SAFE_ZONE; -} - -void Cuda_Estimate_Storage_Sizes (reax_system *system, control_params *control, int *output) -{ - int *Htop, *num_3body, input_size; - int *hb_top, *bond_top; - int *input = (int *) scratch; - int max_3body = 0; - - Htop = 0; - num_3body = 0; - input_size = INT_SIZE * (2 * system->N + 1 + 1); - - //cuda_malloc ((void **) &input, input_size, 1, __LINE__); - cuda_memset (input, 0, input_size, RES_SCRATCH ); - - Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>> - (system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, - system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), - system->reaxprm.num_atom_types, input); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ ); - - Htop = &output[0]; - num_3body = &output[1]; - hb_top = &output[ 2 ]; - bond_top = &output[ 2 + system->N ]; - - *Htop += system->N; - *Htop *= SAFE_ZONE; - - for( int i = 0; i < system->N; ++i ) { - hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS ); - - if (max_3body <= SQR (bond_top[i])) - max_3body = SQR (bond_top[i]); - - *num_3body += SQR(bond_top[i]); - bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS ); - } - - *num_3body = max_3body * SAFE_ZONE; -} - - -GLOBAL void Estimate_Storage_Sizes (reax_atom *atoms, - int N, - single_body_parameters *sbp, - two_body_parameters *tbp, - global_parameters gp, - control_params *control, - list far_nbrs, - int num_atom_types, int *results) -{ - int *Htop = &results[0]; - int *num_3body = &results[1]; - int *hb_top = &results [ 2 ]; - int *bond_top = &results [ 2 + N ]; - - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - int ihb, jhb; - real r_ij, r2; - real C12, C34, C56; - real BO, BO_s, BO_pi, BO_pi2; - real p_boc1, p_boc2; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - reax_atom *atom_i, *atom_j; - - p_boc1 = gp.l[0]; - p_boc2 = gp.l[1]; - - //for( i = 0; i < N; ++i ) { - i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= N ) return ; - - atom_i = &(atoms[i]); - type_i = atom_i->type; - start_i = Start_Index(i, &far_nbrs); - end_i = End_Index(i, &far_nbrs); - sbp_i = &(sbp[type_i]); - ihb = sbp_i->p_hbond; - - for( pj = start_i; pj < end_i; ++pj ) { - nbr_pj = &( far_nbrs.select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - atom_j = &( atoms[j] ); - type_j = atom_j->type; - sbp_j = &( sbp[type_j] ); - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); - - - if( nbr_pj->d <= control->r_cut ) { - //++(*Htop); - atomicAdd (Htop, 1); - - /* hydrogen bond lists */ - //TODO - CHANGE ORIGINAL - if( control->hb_cut > 0 && (ihb==1 || ihb==2) && - nbr_pj->d <= control->hb_cut ) { - jhb = sbp_j->p_hbond; - if( ihb == 1 && jhb == 2 ) - //++hb_top[i]; - atomicAdd (&hb_top[i], 1); - else if( ihb == 2 && jhb == 1 ) - //++hb_top[j]; - //atomicAdd (&hb_top[j], 1); - atomicAdd (&hb_top[i], 1); - } - //TODO -- CHANGE ORIGINAL - - //CHANGE ORIGINAL - if (i < j) continue; - //CHANGE ORIGINAL - - - /* uncorrected bond orders */ - if( nbr_pj->d 
<= control->nbr_cut ) { - r_ij = nbr_pj->d; - r2 = SQR(r_ij); - - if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) { - C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 ); - BO_s = (1.0 + control->bo_cut) * EXP( C12 ); - } - else BO_s = C12 = 0.0; - - if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) { - C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 ); - BO_pi = EXP( C34 ); - } - else BO_pi = C34 = 0.0; - - if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) { - C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 ); - BO_pi2= EXP( C56 ); - } - else BO_pi2 = C56 = 0.0; - - /* Initially BO values are the uncorrected ones, page 1 */ - BO = BO_s + BO_pi + BO_pi2; - - if( BO >= control->bo_cut ) { - //++bond_top[i]; - //++bond_top[j]; - atomicAdd (&bond_top[i], 1); - atomicAdd (&bond_top[j], 1); - } - } - } - } - //} -} - -void Cuda_Compute_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list** lists, output_controls *out_control ) -{ - real t_start, t_elapsed; - real t_1, t_2; - int *indices; - int *Htop; - int max_sparse_entries = 0; - list *far_nbrs = dev_lists + FAR_NBRS; - int hblocks; - - t_start = Get_Time (); - if ( !control->tabulate ) { - Init_Forces <<<BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, - *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); - cudaThreadSynchronize (); - cudaCheckError (); - } - else - { - Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>> - ( system->d_atoms, system->reaxprm.d_gp, (control_params *)control->d_control, - system->reaxprm.d_sbp, system->reaxprm.d_tbp, - (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace, - *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), - system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, - d_LR ); - cudaThreadSynchronize (); - cudaCheckError (); - } - - /*This is for bonds processing to fix dbond and sym_indexes */ - t_1 = Get_Time (); - fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - t_2 = Get_Timing_Info ( t_1 ); - - //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds. - if ((control->hb_cut > 0) && (dev_workspace->num_H > 0)) - { - - hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + - ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1); - t_1 = Get_Time (); - /* - int bs = system->N; - int ss = 32; - fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); - */ - New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N); - cudaThreadSynchronize (); - cudaCheckError (); - } - t_2 = Get_Timing_Info ( t_1 ); - - t_elapsed = Get_Timing_Info (t_start); - d_timing.init_forces+= t_elapsed; - - Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N, - system->num_bonds, system->num_hbonds ); -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done with Cuda List Validation \n"); -#endif - - //Bonded Force Calculations here. 
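Every device phase in Cuda_Compute_Forces follows the same host-side launch/synchronize/time idiom. A minimal sketch of that pattern, assuming only the timing helpers this codebase already provides (Some_Kernel and its argument list are placeholders, not real symbols in the source):

    real t_start, t_elapsed;

    t_start = Get_Time( );
    Some_Kernel <<< BLOCKS, BLOCK_SIZE >>> ( /* device pointers */ );
    /* kernel launches are asynchronous: synchronize before reading the
       clock, or t_elapsed only measures the launch overhead */
    cudaDeviceSynchronize( );
    cudaCheckError( );
    t_elapsed = Get_Timing_Info( t_start );
    d_timing.bonded += t_elapsed;

The cudaThreadSynchronize() calls used throughout this file behave the same way, but that entry point is deprecated in favor of cudaDeviceSynchronize() in current CUDA releases.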
- t_start = Get_Time (); - Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info (t_start); - d_timing.bonded += t_elapsed; - - //Compute the Non Bonded Forces here. - t_start = Get_Time (); - Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info (t_start); - d_timing.nonb += t_elapsed; - - //Compute Total Forces here - Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, - *(dev_lists + BONDS), control->ensemble, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, - *(dev_lists + BONDS), control->ensemble, system->N); - cudaThreadSynchronize (); - cudaCheckError (); -} - -void Compute_Forces( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list** lists, output_controls *out_control ) -{ - real t_start, t_elapsed; - - t_start = Get_Time( ); - if( !control->tabulate ) - Init_Forces( system, control, data, workspace, lists, out_control ); - else Init_Forces_Tab( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.init_forces += t_elapsed; - -#if defined(DEBUG_FOCUS) - print_sparse_matrix (system, workspace); - fprintf( stderr, "init_forces - "); -#endif - - - //analyze_hbonds (system, workspace, lists); - - t_start = Get_Time( ); - Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.bonded += t_elapsed; - - //print_bond_list (system, workspace, lists); - //exit (0); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "bonded_forces - "); -#endif - - t_start = Get_Time( ); - Compute_NonBonded_Forces( system, control, data, workspace, - lists, out_control ); - t_elapsed = Get_Timing_Info( t_start ); - data->timing.nonb += t_elapsed; - -#ifdef __DEBUG_CUDA__ - fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed); -#endif - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "nonbondeds - "); -#endif - - Compute_Total_Force( system, control, data, workspace, lists ); - //Print_Total_Force( system, control, data, workspace, lists, out_control ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "totalforces - "); - //Print_Total_Force( system, control, data, workspace, lists, out_control ); -#endif - -#ifdef TEST_FORCES - Print_Total_Force( system, control, data, workspace, lists, out_control ); - Compare_Total_Forces( system, control, data, workspace, lists, out_control ); -#endif -#if defined(DEBUG_FOCUS) - fprintf( stderr, "forces - "); -#endif -} - - -bool validate_device (reax_system *system, simulation_data *data, static_storage *workspace, list **lists ) -{ - bool retval = false; - -#ifdef __BUILD_DEBUG__ - - retval |= validate_neighbors (system, lists); - retval |= validate_sym_dbond_indices (system, workspace, lists); - retval |= validate_bonds (system, workspace, lists); - retval |= validate_sparse_matrix (system, workspace); - retval |= validate_three_bodies (system, workspace, lists ); - retval |= validate_hbonds (system, workspace, lists); - retval |= validate_workspace (system, workspace, lists); - retval |= validate_data (system, data); - retval |= validate_atoms (system, lists); - //analyze_hbonds (system, workspace, lists); - - if (!retval) { - 
fprintf (stderr, "Results *DO NOT* match between device and host \n"); - } -#endif - - return retval; -} diff --git a/PuReMD-GPU/src/forces.h b/PuReMD-GPU/src/forces.h index 10ac0ee9db54e411b9527429b9da6733d1d4072d..73323f0419baf383d6bf671158ef85584a710728 100644 --- a/PuReMD-GPU/src/forces.h +++ b/PuReMD-GPU/src/forces.h @@ -28,21 +28,7 @@ void Init_Bonded_Force_Functions( control_params* ); void Compute_Forces( reax_system*, control_params*, simulation_data*, static_storage*, list**, output_controls* ); -void Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); - void Estimate_Storage_Sizes( reax_system*, control_params*, list**, int*, int*, int*, int* ); -//Cuda -void Cuda_Estimate_Storage_Sizes (reax_system *, control_params *, int *); - -GLOBAL void Estimate_Storage_Sizes (reax_atom *, int , single_body_parameters *, - two_body_parameters *, global_parameters , - control_params *, list , int , int *); -GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *, control_params *, - simulation_data *, simulation_box *, list , int , int *); - -void Cuda_Threebody_List( reax_system *, static_storage *, list *, int ); -bool validate_device (reax_system *, simulation_data *, static_storage *, list **); #endif diff --git a/PuReMD-GPU/src/four_body_interactions.c b/PuReMD-GPU/src/four_body_interactions.c new file mode 100644 index 0000000000000000000000000000000000000000..c51601fa991203a77ec4840c10e74e15cfa42c87 --- /dev/null +++ b/PuReMD-GPU/src/four_body_interactions.c @@ -0,0 +1,677 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "four_body_interactions.h" + +#include "bond_orders.h" +#include "box.h" +#include "list.h" +#include "lookup.h" +#include "vector.h" +#include "math.h" +#include "index_utils.h" + + +real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk, + rvec dvec_kl, real r_kl, rvec dvec_li, real r_li, + three_body_interaction_data *p_ijk, + three_body_interaction_data *p_jkl, + rvec dcos_omega_di, rvec dcos_omega_dj, + rvec dcos_omega_dk, rvec dcos_omega_dl, + output_controls *out_control ) +{ + real unnorm_cos_omega, unnorm_sin_omega, omega; + real sin_ijk, cos_ijk, sin_jkl, cos_jkl; + real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe; + real arg, poem, tel; + rvec cross_jk_kl; + + sin_ijk = SIN( p_ijk->theta ); + cos_ijk = COS( p_ijk->theta ); + sin_jkl = SIN( p_jkl->theta ); + cos_jkl = COS( p_jkl->theta ); + + /* omega */ + unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) + + SQR( r_jk ) * rvec_Dot( dvec_ij,dvec_kl ); + rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl ); + unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl ); + omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); + + /* derivatives */ + /* coef for adjustments to cos_theta's */ + /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li; + coshd = cos_ijk, coshe = cos_jkl; + sinhd = sin_ijk, sinhe = sin_jkl; */ + htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk ); + htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl; + htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk ); + hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl ); + hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk ); + hnra = r_kl * sin_ijk * sin_jkl; + hnrc = r_ij * sin_ijk * sin_jkl; + hnhd = r_ij * r_kl * cos_ijk * sin_jkl; + hnhe = r_ij * r_kl * sin_ijk * cos_jkl; + + poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl; + if( poem < 1e-20 ) poem = 1e-20; + + tel = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - + 2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + + r_jk * r_kl * cos_jkl ); + + arg = tel / poem; + if( arg > 1.0 ) + { + arg = 1.0; + } + if( arg < -1.0 ) + { + arg = -1.0; + } + + /*fprintf( out_control->etor, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", + r_li, dvec_li[0], dvec_li[1], dvec_li[2] ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", + r_ij, r_jk, r_kl, r_li ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", + cos_ijk, cos_jkl, sin_ijk, sin_jkl ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + poem, tel, arg );*/ + /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -p_ijk->dcos_dk[0]/sin_ijk, + -p_ijk->dcos_dk[1]/sin_ijk, + -p_ijk->dcos_dk[2]/sin_ijk ); + fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n", + -p_jkl->dcos_dk[0]/sin_jkl, + -p_jkl->dcos_dk[1]/sin_jkl, + -p_jkl->dcos_dk[2]/sin_jkl );*/ + + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) + { + sin_ijk = MIN_SINE; + } + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) + { + sin_ijk = -MIN_SINE; + } + if( sin_jkl >= 0 && 
sin_jkl <= MIN_SINE ) + { + sin_jkl = MIN_SINE; + } + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) + { + sin_jkl = -MIN_SINE; + } + + // dcos_omega_di + rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li ); + rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk ); + rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di ); + + // dcos_omega_dj + rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, + -htrb / r_jk, dvec_jk ); + rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj ); + rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di ); + rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj ); + + // dcos_omega_dk + rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl, + htrb / r_jk, dvec_jk ); + rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di ); + rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj ); + rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk ); + + // dcos_omega_dl + rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li ); + rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk ); + rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl ); + + return omega; + //return arg; +} + + +void Four_Body_Interactions( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, k, l, pi, pj, pk, pl, pij, plk; + int type_i, type_j, type_k, type_l; + int start_j, end_j, start_k, end_k; + int start_pj, end_pj, start_pk, end_pk; + int num_frb_intrs = 0; + + real Delta_j, Delta_k; + real r_ij, r_jk, r_kl, r_li; + real BOA_ij, BOA_jk, BOA_kl; + + real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl; + real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv; + real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl; + real fn10, f11_DjDk, dfn11, fn12; + + real theta_ijk, theta_jkl; + real sin_ijk, sin_jkl; + real cos_ijk, cos_jkl; + real tan_ijk_i, tan_jkl_i; + + real omega, cos_omega, cos2omega, cos3omega; + rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl; + + real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4; + real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9; + real Cconj, CEconj1, CEconj2, CEconj3; + real CEconj4, CEconj5, CEconj6; + + real e_tor, e_con; + rvec dvec_li; + rvec force, ext_press; + ivec rel_box_jl; + // rtensor total_rtensor, temp_rtensor; + + four_body_header *fbh; + four_body_parameters *fbp; + bond_data *pbond_ij, *pbond_jk, *pbond_kl; + bond_order_data *bo_ij, *bo_jk, *bo_kl; + three_body_interaction_data *p_ijk, *p_jkl; + + real p_tor2 = system->reaxprm.gp.l[23]; + real p_tor3 = system->reaxprm.gp.l[24]; + real p_tor4 = system->reaxprm.gp.l[25]; + real p_cot2 = system->reaxprm.gp.l[27]; + + list *bonds = (*lists) + BONDS; + list *thb_intrs = (*lists) + THREE_BODIES; + + + for( j = 0; j < system->N; ++j ) { + type_j = system->atoms[j].type; + Delta_j = workspace->Delta_boc[j]; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + + for( pk = start_j; pk < end_j; ++pk ) { + pbond_jk = &( bonds->select.bond_list[pk] ); + k = pbond_jk->nbr; + bo_jk = &( pbond_jk->bo_data ); + BOA_jk = bo_jk->BO - control->thb_cut; + + /* see if there are any 3-body interactions involving j&k + where j is the central atom. 
Otherwise there is no point in + trying to form a 4-body interaction out of this neighborhood */ + if( j < k && bo_jk->BO > control->thb_cut/*0*/ && + Num_Entries(pk, thb_intrs) ) { + start_k = Start_Index(k, bonds); + end_k = End_Index(k, bonds); + pj = pbond_jk->sym_index; // pj points to j on k's list + + /* do the same check as above: are there any 3-body interactions + involving k&j where k is the central atom */ + if( Num_Entries(pj, thb_intrs) ) { + type_k = system->atoms[k].type; + Delta_k = workspace->Delta_boc[k]; + r_jk = pbond_jk->d; + + start_pk = Start_Index(pk, thb_intrs ); + end_pk = End_Index(pk, thb_intrs ); + start_pj = Start_Index(pj, thb_intrs ); + end_pj = End_Index(pj, thb_intrs ); + + exp_tor2_jk = EXP( -p_tor2 * BOA_jk ); + exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) ); + exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) ); + exp_tor4_DjDk = EXP( p_tor4 * (Delta_j + Delta_k) ); + exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk); + f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv; + + + /* pick i up from j-k interaction where j is the centre atom */ + for( pi = start_pk; pi < end_pk; ++pi ) { + p_ijk = &( thb_intrs->select.three_body_list[pi] ); + pij = p_ijk->pthb; // pij is pointer to i on j's bond_list + pbond_ij = &( bonds->select.bond_list[pij] ); + bo_ij = &( pbond_ij->bo_data ); + + + if( bo_ij->BO > control->thb_cut/*0*/ ) { + i = p_ijk->thb; + type_i = system->atoms[i].type; + r_ij = pbond_ij->d; + BOA_ij = bo_ij->BO - control->thb_cut; + + theta_ijk = p_ijk->theta; + sin_ijk = SIN( theta_ijk ); + cos_ijk = COS( theta_ijk ); + //tan_ijk_i = 1. / TAN( theta_ijk ); + if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) + tan_ijk_i = cos_ijk / MIN_SINE; + else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) + tan_ijk_i = cos_ijk / -MIN_SINE; + else tan_ijk_i = cos_ijk / sin_ijk; + + exp_tor2_ij = EXP( -p_tor2 * BOA_ij ); + exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) ); + + /* pick l up from j-k intr. where k is the centre */ + for( pl = start_pj; pl < end_pj; ++pl ) { + p_jkl = &( thb_intrs->select.three_body_list[pl] ); + l = p_jkl->thb; + plk = p_jkl->pthb; //pointer to l on k's bond_list! + pbond_kl = &( bonds->select.bond_list[plk] ); + bo_kl = &( pbond_kl->bo_data ); + type_l = system->atoms[l].type; + fbh = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types ) ]); + fbp = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types )].prm[0]); + + if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ && + bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){ + ++num_frb_intrs; + r_kl = pbond_kl->d; + BOA_kl = bo_kl->BO - control->thb_cut; + + theta_jkl = p_jkl->theta; + sin_jkl = SIN( theta_jkl ); + cos_jkl = COS( theta_jkl ); + //tan_jkl_i = 1. / TAN( theta_jkl ); + if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) + tan_jkl_i = cos_jkl / MIN_SINE; + else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) + tan_jkl_i = cos_jkl / -MIN_SINE; + else tan_jkl_i = cos_jkl /sin_jkl; + + Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, + &(system->box), dvec_li ); + r_li = rvec_Norm( dvec_li ); + + + /* omega and its derivative */ + //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, + omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, + r_jk, pbond_kl->dvec, r_kl, + dvec_li, r_li, p_ijk, p_jkl, + dcos_omega_di, dcos_omega_dj, + dcos_omega_dk, dcos_omega_dl, + out_control); + cos_omega = COS( omega ); + cos2omega = COS( 2. * omega ); + cos3omega = COS( 3. 
* omega ); + /* end omega calculations */ + + /* torsion energy */ + exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk)); + exp_tor2_kl = EXP( -p_tor2 * BOA_kl ); + exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) ); + fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * + (1.0 - exp_tor2_kl); + + CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + + fbp->V2 * exp_tor1 * (1.0 - cos2omega) + + fbp->V3 * (1.0 + cos3omega) ); + //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + + // fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) + + // fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega); + + data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV; + + dfn11 = (-p_tor3 * exp_tor3_DjDk + + (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) * + (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv; + + CEtors1 = sin_ijk * sin_jkl * CV; + + CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * + (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * + sin_ijk * sin_jkl; + + CEtors3 = CEtors2 * dfn11; + + CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * + (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl); + + CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl); + + CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl * + (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk); + + cmn = -fn10 * CV; + CEtors7 = cmn * sin_jkl * tan_ijk_i; + CEtors8 = cmn * sin_ijk * tan_jkl_i; + CEtors9 = fn10 * sin_ijk * sin_jkl * + (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + 1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega))); + //cmn = -fn10 * CV; + //CEtors7 = cmn * sin_jkl * cos_ijk; + //CEtors8 = cmn * sin_ijk * cos_jkl; + //CEtors9 = fn10 * sin_ijk * sin_jkl * + // (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega + + // fbp->V3 * (6*SQR(cos_omega) - 1.50)); + /* end of torsion energy */ + + + /* 4-body conjugation energy */ + fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl; + data->E_Con += e_con = fbp->p_cot1 * fn12 * + (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl); + + Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * + (1. + (SQR(cos_omega)-1.) 
* sin_ijk*sin_jkl); + + CEconj1 = Cconj * (BOA_ij - 1.5e0); + CEconj2 = Cconj * (BOA_jk - 1.5e0); + CEconj3 = Cconj * (BOA_kl - 1.5e0); + + CEconj4 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i; + CEconj5 = -fbp->p_cot1 * fn12 * + (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i; + //CEconj4 = -fbp->p_cot1 * fn12 * + // (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk; + //CEconj5 = -fbp->p_cot1 * fn12 * + // (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl; + CEconj6 = 2.0 * fbp->p_cot1 * fn12 * + cos_omega * sin_ijk * sin_jkl; + /* end 4-body conjugation energy */ + + //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ", + // workspace->orig_id[i], workspace->orig_id[j], + // workspace->orig_id[k], workspace->orig_id[l], + // omega, cos_omega, cos2omega, cos3omega ); + //fprintf(stdout, + // "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + // CEtors2, CEtors3, CEtors4, CEtors5, + // CEtors6, CEtors7, CEtors8, CEtors9 ); + //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + // theta_ijk, theta_jkl, sin_ijk, + // sin_jkl, cos_jkl, tan_jkl_i ); + + /* forces */ + bo_jk->Cdbopi += CEtors2; + workspace->CdDelta[j] += CEtors3; + workspace->CdDelta[k] += CEtors3; + bo_ij->Cdbo += (CEtors4 + CEconj1); + bo_jk->Cdbo += (CEtors5 + CEconj2); + + bo_kl->Cdbo += (CEtors6 + CEconj3); + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + /* dcos_theta_ijk */ + rvec_ScaledAdd( system->atoms[i].f, + CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_ScaledAdd( system->atoms[j].f, + CEtors7 + CEconj4, p_ijk->dcos_dj ); + rvec_ScaledAdd( system->atoms[k].f, + CEtors7 + CEconj4, p_ijk->dcos_di ); + + /* dcos_theta_jkl */ + rvec_ScaledAdd( system->atoms[j].f, + CEtors8 + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( system->atoms[k].f, + CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_ScaledAdd( system->atoms[l].f, + CEtors8 + CEconj5, p_jkl->dcos_dk ); + + /* dcos_omega */ + rvec_ScaledAdd( system->atoms[i].f, + CEtors9 + CEconj6, dcos_omega_di ); + rvec_ScaledAdd( system->atoms[j].f, + CEtors9 + CEconj6, dcos_omega_dj ); + rvec_ScaledAdd( system->atoms[k].f, + CEtors9 + CEconj6, dcos_omega_dk ); + rvec_ScaledAdd( system->atoms[l].f, + CEtors9 + CEconj6, dcos_omega_dl ); + } + else { + ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box); + + /* dcos_theta_ijk */ + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk ); + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, + CEtors7 + CEconj4, p_ijk->dcos_dj ); + + rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* dcos_theta_jkl */ + rvec_ScaledAdd( system->atoms[j].f, + CEtors8 + CEconj5, p_jkl->dcos_di ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk ); + rvec_Add( system->atoms[l].f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* dcos_omega */ + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di ); + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + 
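+ /* A sketch of the pattern repeated for every pressure-coupled term in
+    this branch, with coef, dcos, n, and rel_box standing in for the
+    CEtors/CEconj sums, the matching derivative vector, the target atom,
+    and the bond's periodic-image vector:
+
+        rvec_Scale( force, coef, dcos );              // force contribution
+        rvec_Add( system->atoms[n].f, force );        // apply it to atom n
+        rvec_iMultiply( ext_press, rel_box, force );  // weight by the image vector
+        rvec_Add( data->ext_press, ext_press );       // accumulate the virial term
+
+    Terms acting on the central atom j skip the virial update (j's relative
+    box vector is zero) and call rvec_ScaledAdd directly, as immediately
+    below. */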
rvec_ScaledAdd( system->atoms[j].f, + CEtors9 + CEconj6, dcos_omega_dj ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl ); + rvec_Add( system->atoms[l].f, force ); + rvec_iMultiply( ext_press, rel_box_jl, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* This part is intended for a fully-flexible box */ + /* rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_dk, // i + CEtors9 + CEconj6, dcos_omega_di ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[i].x ); + rtensor_Copy( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_dj, // j + CEtors8 + CEconj5, p_jkl->dcos_di ); + rvec_ScaledAdd( temp_rvec, + CEtors9 + CEconj6, dcos_omega_dj ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[j].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors7 + CEconj4, p_ijk->dcos_di, // k + CEtors8 + CEconj5, p_jkl->dcos_dj ); + rvec_ScaledAdd( temp_rvec, + CEtors9 + CEconj6, dcos_omega_dk ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[k].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, + CEtors8 + CEconj5, p_jkl->dcos_dk, // l + CEtors9 + CEconj6, dcos_omega_dl ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[l].x ); + rtensor_Copy( total_rtensor, temp_rtensor ); + + if( pbond_ij->imaginary || pbond_jk->imaginary || + pbond_kl->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } + +#ifdef TEST_ENERGY + /*fprintf( out_control->etor, + //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + //r_ij, r_jk, r_kl, + "%12.8f%12.8f%12.8f%12.8f\n", + cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/ + // fprintf( out_control->etor, "%12.8f\n", dfn11 ); + fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", + fn10, cos_omega, CV ); + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEtors2, CEtors3, CEtors4, CEtors5, + CEtors6, CEtors7, CEtors8, CEtors9 ); + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */ + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n", + CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); + /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n", + fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/ + + fprintf( out_control->etor, + //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", + "%6d%6d%6d%6d%12.8f%12.8f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], workspace->orig_id[l], + e_tor, e_con ); + //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor ); + + fprintf( out_control->econ, + "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], workspace->orig_id[l], + RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, + e_con,data->E_Con ); + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], + (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], + (CEtors7 + CEconj4)*p_ijk->dcos_dk[2], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], + (CEtors7 + CEconj4)*p_ijk->dcos_dj[2], + 
(CEtors7 + CEconj4)*p_ijk->dcos_di[0], + (CEtors7 + CEconj4)*p_ijk->dcos_di[1], + (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */ + + + /* fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + (CEtors8 + CEconj5)*p_jkl->dcos_di[0], + (CEtors8 + CEconj5)*p_jkl->dcos_di[1], + (CEtors8 + CEconj5)*p_jkl->dcos_di[2], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], + (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], + (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */ + + fprintf( out_control->etor, + "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n", + dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], + dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], + dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2], + dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] ); +#endif + +#ifdef TEST_FORCES + // Torsion Forces + Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., + workspace->f_tor, workspace->f_tor); + Add_dDelta( system, lists, j, CEtors3, workspace->f_tor ); + Add_dDelta( system, lists, k, CEtors3, workspace->f_tor ); + Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor ); + Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor ); + Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor ); + + rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk); + rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj); + rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di); + + rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di); + rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj); + rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk); + + rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl ); + + // Conjugation Forces + Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con ); + Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con ); + Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con ); + + rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk); + rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj); + rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di); + + rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di); + rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj); + rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk); + + rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di ); + rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj ); + rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk ); + rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl ); +#endif + } // pl check ends + } // pl loop ends + } // pi check ends + } // pi loop ends + } // k-j neighbor check ends + } // j<k && j-k neighbor check ends + } // pk loop ends + } // j loop + + /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ + +#ifdef TEST_FORCES + fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs ); + fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", + data->E_Tor, data->E_Con ); +#endif +} diff --git a/PuReMD-GPU/src/four_body_interactions.h b/PuReMD-GPU/src/four_body_interactions.h index 
402ebe7dc807afd0d14541bf5686a0a674b43b09..8e8dd7c0991a747000e77b2d460711e433db52ef 100644 --- a/PuReMD-GPU/src/four_body_interactions.h +++ b/PuReMD-GPU/src/four_body_interactions.h @@ -23,20 +23,10 @@ #include "mytypes.h" +#define MIN_SINE 1e-10 + + void Four_Body_Interactions( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); - -GLOBAL void Four_Body_Interactions ( reax_atom *, - global_parameters , - four_body_header *, - control_params *, - list , list , - simulation_box *, - simulation_data *, - static_storage , - int , int , real *, real *, rvec *); - -GLOBAL void Four_Body_Postprocess (reax_atom *, - static_storage, - list , int ); + static_storage*, list**, output_controls* ); + #endif diff --git a/PuReMD-GPU/src/grid.cu b/PuReMD-GPU/src/grid.c similarity index 93% rename from PuReMD-GPU/src/grid.cu rename to PuReMD-GPU/src/grid.c index 00e638f4b3ae3828e4e0e208f307d51283d8c2d5..fb09b409194a84b1646da3b779aad8b547ff9db3 100644 --- a/PuReMD-GPU/src/grid.cu +++ b/PuReMD-GPU/src/grid.c @@ -19,12 +19,11 @@ ----------------------------------------------------------------------*/ #include "grid.h" + #include "reset_utils.h" #include "vector.h" #include "index_utils.h" -#include "cuda_utils.h" - int Estimate_GCell_Population( reax_system* system ) { @@ -361,23 +360,6 @@ void Bin_Atoms( reax_system* system, static_storage *workspace ) workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); } -void Cuda_Bin_Atoms (reax_system *system, static_storage *workspace ) -{ - Cuda_Reset_Grid ( &system->d_g); - - Bin_Atoms ( system, workspace ); - - dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms; -} - -void Cuda_Bin_Atoms_Sync (reax_system *system) -{ - copy_host_device (system->g.top, system->d_g.top, - INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP); - - copy_host_device (system->g.atoms, system->d_g.atoms, - INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS); -} inline void reax_atom_Copy( reax_atom *dest, reax_atom *src ) { @@ -396,11 +378,11 @@ void Copy_Storage( reax_system *system, static_storage *workspace, int i; for( i = 0; i < RESTART+1; ++i ) - v[ index_wkspace_sys (i,top, system) ] = workspace->v[ index_wkspace_sys (i,old_id, system) ]; + v[ index_wkspace_sys (i,top, system->N) ] = workspace->v[ index_wkspace_sys (i,old_id, system->N) ]; for( i = 0; i < 3; ++i ) { - s[ index_wkspace_sys (i,top, system) ] = workspace->s[ index_wkspace_sys (i,old_id, system) ]; - t[ index_wkspace_sys (i,top, system) ] = workspace->t[ index_wkspace_sys (i,old_id, system) ]; + s[ index_wkspace_sys (i,top, system->N) ] = workspace->s[ index_wkspace_sys (i,old_id, system->N) ]; + t[ index_wkspace_sys (i,top, system->N) ] = workspace->t[ index_wkspace_sys (i,old_id, system->N) ]; } orig_id[top] = workspace->orig_id[old_id]; diff --git a/PuReMD-GPU/src/grid.h b/PuReMD-GPU/src/grid.h index b524fb76ea394c38dd5a2f7fa94e3b668b43a744..8f26f018e947bc2ebe1789736d8203dc435addfd 100644 --- a/PuReMD-GPU/src/grid.h +++ b/PuReMD-GPU/src/grid.h @@ -23,6 +23,11 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Setup_Grid( reax_system* ); void Update_Grid( reax_system* ); @@ -32,9 +37,12 @@ int Shift( int, int, int, grid* ); void Cluster_Atoms( reax_system*, static_storage* ); void Bin_Atoms( reax_system*, static_storage* ); -void Cuda_Bin_Atoms( reax_system*, static_storage* ); -void 
Cuda_Bin_Atoms_Sync (reax_system *); void Reset_Marks( grid*, ivec*, int ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PuReMD-GPU/src/index_utils.h b/PuReMD-GPU/src/index_utils.h index b856e5efe79eb7e2ca96163d9bd67744b44f9a00..cbd81cadc4113ecf8804d6bd28299336aa4705e6 100644 --- a/PuReMD-GPU/src/index_utils.h +++ b/PuReMD-GPU/src/index_utils.h @@ -23,101 +23,74 @@ #include "mytypes.h" -extern inline HOST_DEVICE int index_grid_3d (int i, int j, int k, grid *g) -{ - return (i * g->ncell[1] * g->ncell[2]) + - (j * g->ncell[2]) + - k; -} -extern inline HOST_DEVICE int index_grid_nbrs (int i, int j, int k, int l, grid *g) +static inline HOST_DEVICE int index_grid_3d( int i, int j, int k, grid *g ) { - return (i * g->ncell[1] * g->ncell[2] * g->max_nbrs) + - (j * g->ncell[2] * g->max_nbrs) + - (k * g->max_nbrs) + - l; + return (i * g->ncell[1] * g->ncell[2]) + (j * g->ncell[2]) + k; } -extern inline HOST_DEVICE int index_grid_atoms (int i, int j, int k, int l, grid *g) + +static inline HOST_DEVICE int index_grid_nbrs( int i, int j, int k, int l, grid *g ) { - return (i * g->ncell[1] * g->ncell[2] * g->max_atoms) + - (j * g->ncell[2] * g->max_atoms) + - (k * g->max_atoms) + - l; + return (i * g->ncell[1] * g->ncell[2] * g->max_nbrs) + + (j * g->ncell[2] * g->max_nbrs) + + (k * g->max_nbrs) + + l; } -extern inline HOST_DEVICE int index_wkspace_sys (int i, int j, reax_system *system) + +static inline HOST_DEVICE int index_grid_atoms( int i, int j, int k, int l, grid *g ) { - return (i * system->N) + j; + return (i * g->ncell[1] * g->ncell[2] * g->max_atoms) + + (j * g->ncell[2] * g->max_atoms) + + (k * g->max_atoms) + + l; } -extern inline HOST_DEVICE int index_wkspace_sys (int i, int j, int N) + +static inline HOST_DEVICE int index_wkspace_sys( int i, int j, int N ) { return (i * N) + j; } -extern inline HOST_DEVICE int index_wkspace_res (int i, int j ) + +static inline HOST_DEVICE int index_wkspace_res( int i, int j ) { return (i * (RESTART + 1)) + j; } -extern inline HOST_DEVICE int index_tbp (int i, int j, reax_interaction *reax) -{ - return (i * reax->num_atom_types) + j; -} -extern inline HOST_DEVICE int index_tbp (int i, int j, int num_atom_types) +static inline HOST_DEVICE int index_tbp( int i, int j, int num_atom_types ) { return (i * num_atom_types) + j; } -extern inline HOST_DEVICE int index_thbp (int i, int j, int k, reax_interaction *reax) -{ - return (i * reax->num_atom_types * reax->num_atom_types ) + - (j * reax->num_atom_types ) + - k; -} -extern inline HOST_DEVICE int index_thbp (int i, int j, int k, int num_atom_types) +static inline HOST_DEVICE int index_thbp( int i, int j, int k, int num_atom_types ) { - return (i * num_atom_types * num_atom_types ) + - (j * num_atom_types ) + - k; + return (i * num_atom_types * num_atom_types ) + (j * num_atom_types ) + k; } -extern inline HOST_DEVICE int index_hbp (int i, int j, int k, reax_interaction *reax) -{ - return (i * reax->num_atom_types * reax->num_atom_types ) + - (j * reax->num_atom_types ) + - k; -} -extern inline HOST_DEVICE int index_hbp (int i, int j, int k, int num_atom_types) +static inline HOST_DEVICE int index_hbp( int i, int j, int k, int num_atom_types ) { - return (i * num_atom_types * num_atom_types ) + - (j * num_atom_types ) + - k; + return (i * num_atom_types * num_atom_types ) + (j * num_atom_types ) + k; } -extern inline HOST_DEVICE int index_fbp (int i, int j, int k, int l, reax_interaction *reax) -{ - return (i * reax->num_atom_types * reax->num_atom_types * reax->num_atom_types ) + - (j * 
reax->num_atom_types * reax->num_atom_types ) + - (k * reax->num_atom_types ) + - l; -} -extern inline HOST_DEVICE int index_fbp (int i, int j, int k, int l, int num_atom_types) +static inline HOST_DEVICE int index_fbp( int i, int j, int k, int l, int num_atom_types ) { - return (i * num_atom_types * num_atom_types * num_atom_types ) + - (j * num_atom_types * num_atom_types ) + - (k * num_atom_types ) + - l; + return (i * num_atom_types * num_atom_types * num_atom_types ) + + (j * num_atom_types * num_atom_types ) + + (k * num_atom_types ) + + l; } -extern inline HOST_DEVICE int index_lr (int i, int j, int num_atom_types ) + +static inline HOST_DEVICE int index_lr( int i, int j, int num_atom_types ) { return (i * num_atom_types) + j; } + #endif diff --git a/PuReMD-GPU/src/init_md.c b/PuReMD-GPU/src/init_md.c new file mode 100644 index 0000000000000000000000000000000000000000..2a2ce1270e2c694722e489b9a3f38f8dd48177a1 --- /dev/null +++ b/PuReMD-GPU/src/init_md.c @@ -0,0 +1,879 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "init_md.h" + +#include "allocate.h" +#include "box.h" +#include "forces.h" +#include "grid.h" +#include "index_utils.h" +#include "lin_alg.h" +#include "integrate.h" +#include "neighbors.h" +#include "list.h" +#include "lookup.h" +#include "print_utils.h" +#include "reset_utils.h" +#include "system_props.h" +#include "traj.h" +#include "vector.h" + + +void Generate_Initial_Velocities(reax_system *system, real T ) +{ + int i; + real scale, norm; + + + if( T <= 0.1 ) + { + for ( i = 0; i < system->N; i++ ) + { + rvec_MakeZero( system->atoms[i].v ); + } + +#if defined(DEBUG) + fprintf( stderr, "no random velocities...\n" ); +#endif + } + else + { + for( i = 0; i < system->N; i++ ) + { + rvec_Random( system->atoms[i].v ); + + norm = rvec_Norm_Sqr( system->atoms[i].v ); + scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * + norm / (3.0 * K_B * T) ); + + rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v ); + + /* + fprintf( stderr, "v = %f %f %f\n", + system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); + fprintf( stderr, "scale = %f\n", scale ); + fprintf( stderr, "v = %f %f %f\n", + system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); + */ + } + } +} + + +void Init_System( reax_system *system, control_params *control, + simulation_data *data ) +{ + int i; + rvec dx; + + if( !control->restart ) + { + Reset_Atoms( system ); + } + + Compute_Total_Mass( system, data ); + + Compute_Center_of_Mass( system, data, stderr ); + + /* reposition atoms */ + // just fit the atoms to the periodic box + if( control->reposition_atoms == 0 ) + { + rvec_MakeZero( dx ); + } + // put the center of mass to the center of the box + else if( control->reposition_atoms == 1 ) + { + rvec_Scale( dx, 0.5, system->box.box_norms ); + rvec_ScaledAdd( dx, -1., data->xcm ); + } + // put the center of mass to the origin + else if( control->reposition_atoms == 2 ) { + rvec_Scale( dx, -1., data->xcm ); + } + else { + fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. 
Terminating...\n" ); + exit( UNKNOWN_OPTION ); + } + + for( i = 0; i < system->N; ++i ) { + Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); + /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", + i, system->atoms[i].type, + system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/ + } + + /* Initialize velocities so that desired init T can be attained */ + if( !control->restart || (control->restart && control->random_vel) ) { + Generate_Initial_Velocities( system, control->T_init ); + } + + Setup_Grid( system ); +} + + +void Init_Simulation_Data( reax_system *system, control_params *control, + simulation_data *data, output_controls *out_control, + evolve_function *Evolve ) +{ + + Reset_Simulation_Data( data ); + + if( !control->restart ) + data->step = data->prev_steps = 0; + + switch( control->ensemble ) { + case NVE: + data->N_f = 3 * system->N; + *Evolve = Velocity_Verlet_NVE; + break; + + + case NVT: + data->N_f = 3 * system->N + 1; + //control->Tau_T = 100 * data->N_f * K_B * control->T_final; + if( !control->restart || (control->restart && control->random_vel) ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->therm.v_xi_old = 0; + data->therm.xi = 0; +#if defined(DEBUG_FOCUS) + fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", + data->therm.G_xi, control->Tau_T, data->E_Kin, + data->N_f, data->therm.v_xi ); +#endif + } + + *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein; + break; + + + case NPT: // Anisotropic NPT + fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); + exit( UNKNOWN_OPTION ); + data->N_f = 3 * system->N + 9; + if( !control->restart ) { + data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - + data->N_f * K_B * control->T ); + data->therm.v_xi = data->therm.G_xi * control->dt; + data->iso_bar.eps = 0.33333 * log(system->box.volume); + //data->inv_W = 1. 
/ (data->N_f*K_B*control->T*SQR(control->Tau_P)); + //Compute_Pressure( system, data, workspace ); + } + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + + case sNPT: // Semi-Isotropic NPT + data->N_f = 3 * system->N + 4; + *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; + break; + + + case iNPT: // Isotropic NPT + data->N_f = 3 * system->N + 2; + *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; + break; + + case bNVT: // Berendsen NVT + data->N_f = 3 * system->N + 1; + *Evolve = Velocity_Verlet_Berendsen_NVT; + break; + + default: + break; + } + + Compute_Kinetic_Energy( system, data ); + + /* init timing info for the host */ + data->timing.start = Get_Time( ); + data->timing.total = data->timing.start; + data->timing.nbrs = 0; + data->timing.init_forces = 0; + data->timing.bonded = 0; + data->timing.nonb = 0; + data->timing.QEq = 0; + data->timing.matvecs = 0; +} + + +void Init_Workspace( reax_system *system, control_params *control, + static_storage *workspace ) +{ + int i; + + /* Allocate space for hydrogen bond list */ + workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) ); + + /* bond order related storage */ + workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) ); + workspace->Deltap = (real *) malloc( system->N * sizeof( real ) ); + workspace->Deltap_boc = (real *) malloc( system->N * sizeof( real ) ); + workspace->dDeltap_self = (rvec *) malloc( system->N * sizeof( rvec ) ); + + workspace->Delta = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_lp = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); + workspace->dDelta_lp = (real *) malloc( system->N * sizeof( real ) ); + workspace->dDelta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_e = (real *) malloc( system->N * sizeof( real ) ); + workspace->Delta_boc = (real *) malloc( system->N * sizeof( real ) ); + workspace->nlp = (real *) malloc( system->N * sizeof( real ) ); + workspace->nlp_temp = (real *) malloc( system->N * sizeof( real ) ); + workspace->Clp = (real *) malloc( system->N * sizeof( real ) ); + workspace->CdDelta = (real *) malloc( system->N * sizeof( real ) ); + workspace->vlpex = (real *) malloc( system->N * sizeof( real ) ); + + /* QEq storage */ + //workspace->H = NULL; + //workspace->L = NULL; + //workspace->U = NULL; + // + workspace->H.start = NULL; + workspace->L.start = NULL; + workspace->U.start = NULL; + + workspace->H.entries = NULL; + workspace->L.entries = NULL; + workspace->U.entries = NULL; + + workspace->droptol = (real *) calloc( system->N, sizeof( real ) ); + workspace->w = (real *) calloc( system->N, sizeof( real ) ); + workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ); + workspace->b = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->b_s = (real *) calloc( system->N, sizeof( real ) ); + workspace->b_t = (real *) calloc( system->N, sizeof( real ) ); + workspace->b_prc = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->b_prm = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->s_t = (real *) calloc( system->N * 2, sizeof( real ) ); + workspace->s = (real *) calloc( 5 * system->N, sizeof( real ) ); + workspace->t = (real *) calloc( 5 * system->N, sizeof( real ) ); + // workspace->s_old = (real *) calloc( system->N, sizeof( real ) ); + // workspace->t_old = (real *) calloc( system->N, sizeof( real ) ); + // workspace->s_oldest = (real *) calloc( system->N, sizeof( 
real ) ); + // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) ); + + for( i = 0; i < system->N; ++i ) { + workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta; + workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; + workspace->b_t[i] = -1.0; + + workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; + workspace->b[i+system->N] = -1.0; + } + + /* GMRES storage */ + workspace->y = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->z = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->g = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->hs = (real *) calloc( RESTART+1, sizeof( real ) ); + workspace->hc = (real *) calloc( RESTART+1, sizeof( real ) ); + + workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) ); + workspace->v = (real *) calloc( (RESTART+1)*system->N, sizeof( real) ); + workspace->h = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) ); + + /* CG storage */ + workspace->r = (real *) calloc( system->N, sizeof( real ) ); + workspace->d = (real *) calloc( system->N, sizeof( real ) ); + workspace->q = (real *) calloc( system->N, sizeof( real ) ); + workspace->p = (real *) calloc( system->N, sizeof( real ) ); + + /* integrator storage */ + workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) ); + + + /* storage for analysis */ + if( control->molec_anal || control->diffusion_coef ) + { + workspace->mark = (int *) calloc( system->N, sizeof(int) ); + workspace->old_mark = (int *) calloc( system->N, sizeof(int) ); + } + else + workspace->mark = workspace->old_mark = NULL; + + if( control->diffusion_coef ) + workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) ); + else workspace->x_old = NULL; + + +#ifdef TEST_FORCES + workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) ); + workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) ); +#endif + + workspace->realloc.num_far = -1; + workspace->realloc.Htop = -1; + workspace->realloc.hbonds = -1; + workspace->realloc.bonds = -1; + workspace->realloc.num_3body = -1; + workspace->realloc.gcell_atoms = -1; + + Reset_Workspace( system, workspace ); +} + +void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N) +{ + int index = 0; + int count = 0; + int jicount = 0; + int i, j, end_index, gpu_index, gpu_end, k; + far_neighbor_data gpu, cpu; + + /* + for (int i = 0; i < N ; i++ ) + { + if (test[i] != start[i]) { + fprintf (stderr, "start index does not match \n"); + exit (0); + } + + if (test[i+1] != (end[i]) ){ + 
fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]); + exit (0); + } + } + */ + + + for (i = 0; i < N; i++){ + index = Start_Index (i, slist); + //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]); + + + for (j = start[i]; j < end[i]; j++){ + gpu = data[j]; + + if (i < data[j].nbr) continue; + /* + if (i < data[j].nbr) { + //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j); + int src = data[j].nbr; + int dest = i; + int x; + + + for (x = start[src]; x < end[src]; x++) { + if (data[x].nbr != dest) continue; + + gpu = data[x]; + cpu = data[j]; + + if ( (gpu.d != cpu.d) || + (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, + data[j].d, + data[j].rel_box[0], + data[j].rel_box[1], + data[j].rel_box[2], + data[j].dvec[0], + data[j].dvec[1], + data[j].dvec[2] + ); + fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr, + data[x].d, + data[x].rel_box[0], + data[x].rel_box[1], + data[x].rel_box[2], + data[x].dvec[0], + data[x].dvec[1], + data[x].dvec[2] + ); + jicount++; + } + break; + } + + if (x >= end[src]) { + fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); + exit (0); + } + + continue; + } + */ + + cpu = slist->select.far_nbr_list[index]; + //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){ + //if ( (gpu->d != cpu->d) ){ + if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) || + (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || + (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { + //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) || + // (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) { + //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){ + + fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); + fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index); + + /* + fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); + fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); + fprintf (stdout, "d %f , %f \n", slist->select.far_nbr_list[index].d, data[j].d); + fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", + cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], + gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); + + fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", + cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], + gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); + + */ + count ++; + } + + //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d); + index ++; + } + + if (index != End_Index (i, slist)) + { + fprintf( stderr, + "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", + i, index, Start_Index (i, slist), End_Index(i, slist), + start[i], end[i]); + exit( 10 ); + } + } + + fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d reverse %d \n", count, jicount); + + /* + for (int i = 0; i < N; i++) + { + index = Start_Index (i, slist); + end_index = End_Index (i, slist); + + 
gpu_index = start[i]; + gpu_end = end[i]; + for (int j = index; j < end_index; j++) + { + far_neighbor_data *cpu = &slist->select.far_nbr_list[j]; + far_neighbor_data *gpu; + + for (k = gpu_index; k < gpu_end; k++) { + gpu = &data[k]; + if (gpu->nbr == cpu->nbr) break; + } + + if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); } + + if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) || + ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) || + ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) { + + fprintf (stderr, "Far neighbors does not match atom: %d \n", i ); + fprintf (stderr, "neighbor %d , %d \n", cpu->nbr, gpu->nbr); + fprintf (stderr, "d %d , %d \n", cpu->d, gpu->d); + fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", + cpu->dvec[0], cpu->dvec[1], cpu->dvec[2], + gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] ); + + fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", + cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2], + gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] ); + fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end ); + + exit (1); + } + } + } + + */ +} + + +void Init_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; + int *hb_top, *bond_top; + + real t_start, t_elapsed; + + num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists ); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs); +#endif + + if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS ) ) { + fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); + exit( INIT_ERR ); + } +#if defined(DEBUG_FOCUS) + fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", + num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); +#endif + + t_start = Get_Time (); + Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control); + t_elapsed = Get_Timing_Info ( t_start ); + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed ); +#endif + + Htop = 0; + hb_top = (int*) calloc( system->N, sizeof(int) ); + bond_top = (int*) calloc( system->N, sizeof(int) ); + num_3body = 0; + Estimate_Storage_Sizes( system, control, lists, + &Htop, hb_top, bond_top, &num_3body ); + + Allocate_Matrix( &(workspace->H), system->N, Htop ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "estimated storage - Htop: %d\n", Htop ); + fprintf( stderr, "memory allocated: H = %ldMB\n", + Htop * sizeof(sparse_matrix_entry) / (1024*1024) ); +#endif + + workspace->num_H = 0; + if( control->hb_cut > 0 ) { + /* init H indexes */ + for( i = 0; i < system->N; ++i ) + if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom + workspace->hbond_index[i] = workspace->num_H++; + else workspace->hbond_index[i] = -1; + + Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, + hb_top, (*lists)+HBONDS ); + num_hbonds = hb_top[system->N-1]; + +#ifdef __DEBUG_CUDA__ + fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds ); +#endif + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds ); + fprintf( stderr, "memory allocated: hbonds = %ldMB\n", + num_hbonds * sizeof(hbond_data) / (1024*1024) ); +#endif + } + + /* bonds list */ + Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS ); + num_bonds = bond_top[system->N-1]; + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds ); + fprintf( stderr, "memory allocated: bonds = %ldMB\n", + num_bonds * sizeof(bond_data) / (1024*1024) ); +#endif + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " host num_3body : %d \n", num_3body); + fprintf (stderr, " host num_bonds : %d \n", num_bonds); +#endif + + /* 3bodies list */ + if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES )) { + fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); + exit( INIT_ERR ); + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body ); + fprintf( stderr, "memory allocated: 3-body = %ldMB\n", + num_3body * sizeof(three_body_interaction_data) / (1024*1024) ); +#endif + +#ifdef TEST_FORCES + if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) { + fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" ); + exit( INIT_ERR ); + } + + if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) { + fprintf( stderr, "Problem in initializing dBO list. 
Terminating!\n" ); + exit( INIT_ERR ); + } +#endif + + free( hb_top ); + free( bond_top ); +} + + +void Init_Out_Controls(reax_system *system, control_params *control, + static_storage *workspace, output_controls *out_control) +{ + char temp[1000]; + + /* Init trajectory file */ + if( out_control->write_steps > 0 ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".trj" ); + out_control->trj = fopen( temp, "w" ); + out_control->write_header( system, control, workspace, out_control ); + } + + if( out_control->energy_update_freq > 0 ) { + /* Init out file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".out" ); + out_control->out = fopen( temp, "w" ); + fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n", + "step", "total energy", "poten. energy", "kin. energy", + "temp.", "target", "volume", "press.", "target" ); + fflush( out_control->out ); + + /* Init potentials file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".pot" ); + out_control->pot = fopen( temp, "w" ); + fprintf( out_control->pot, + "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n", + "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", + "etor", "econj", "evdw","ecoul", "epol" ); + fflush( out_control->pot ); + + /* Init log file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".log" ); + out_control->log = fopen( temp, "w" ); + fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", + "step", "total", "neighbors", "init", "bonded", + "nonbonded", "QEq", "matvec" ); + } + + /* Init pressure file */ + if( control->ensemble == NPT || + control->ensemble == iNPT || + control->ensemble == sNPT ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".prs" ); + out_control->prs = fopen( temp, "w" ); + fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n", + "step", "norm_x", "norm_y", "norm_z", + "press_x", "press_y", "press_z", "target_p", "volume" ); + fflush( out_control->prs ); + } + + /* Init molecular analysis file */ + if( control->molec_anal ) { + sprintf( temp, "%s.mol", control->sim_name ); + out_control->mol = fopen( temp, "w" ); + if( control->num_ignored ) { + sprintf( temp, "%s.ign", control->sim_name ); + out_control->ign = fopen( temp, "w" ); + } + } + + /* Init electric dipole moment analysis file */ + if( control->dipole_anal ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".dpl" ); + out_control->dpl = fopen( temp, "w" ); + fprintf( out_control->dpl, + "Step Molecule Count Avg. 
Dipole Moment Norm\n" ); + fflush( out_control->dpl ); + } + + /* Init diffusion coef analysis file */ + if( control->diffusion_coef ) { + strcpy( temp, control->sim_name ); + strcat( temp, ".drft" ); + out_control->drft = fopen( temp, "w" ); + fprintf( out_control->drft, "Step Type Count Avg Squared Disp\n" ); + fflush( out_control->drft ); + } + +#ifdef TEST_ENERGY + /* open bond energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ebond" ); + out_control->ebond = fopen( temp, "w" ); + + /* open lone-pair energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".elp" ); + out_control->elp = fopen( temp, "w" ); + + /* open overcoordination energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".eov" ); + out_control->eov = fopen( temp, "w" ); + + /* open undercoordination energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".eun" ); + out_control->eun = fopen( temp, "w" ); + + /* open angle energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".eval" ); + out_control->eval = fopen( temp, "w" ); + + /* open penalty energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".epen" ); + out_control->epen = fopen( temp, "w" ); + + /* open coalition energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ecoa" ); + out_control->ecoa = fopen( temp, "w" ); + + /* open hydrogen bond energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ehb" ); + out_control->ehb = fopen( temp, "w" ); + + /* open torsion energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".etor" ); + out_control->etor = fopen( temp, "w" ); + + /* open conjugation energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".econ" ); + out_control->econ = fopen( temp, "w" ); + + /* open vdWaals energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".evdw" ); + out_control->evdw = fopen( temp, "w" ); + + /* open coulomb energy file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ecou" ); + out_control->ecou = fopen( temp, "w" ); +#endif + +#ifdef TEST_FORCES + /* open bond orders file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fbo" ); + out_control->fbo = fopen( temp, "w" ); + + /* open bond orders derivatives file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fdbo" ); + out_control->fdbo = fopen( temp, "w" ); + + /* open bond forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fbond" ); + out_control->fbond = fopen( temp, "w" ); + + /* open lone-pair forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".flp" ); + out_control->flp = fopen( temp, "w" ); + + /* open overcoordination forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fatom" ); + out_control->fatom = fopen( temp, "w" ); + + /* open angle forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".f3body" ); + out_control->f3body = fopen( temp, "w" ); + + /* open hydrogen bond forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fhb" ); + out_control->fhb = fopen( temp, "w" ); + + /* open torsion forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".f4body" ); + out_control->f4body = fopen( temp, "w" ); + + /* open nonbonded forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".fnonb" ); + out_control->fnonb = fopen( temp, "w" ); + + /* open total force file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ftot" ); + out_control->ftot = fopen( temp, "w" ); + + /* 
open coulomb forces file */ + strcpy( temp, control->sim_name ); + strcat( temp, ".ftot2" ); + out_control->ftot2 = fopen( temp, "w" ); +#endif + + /* Error handling */ + /* if ( out_control->out == NULL || out_control->pot == NULL || + out_control->log == NULL || out_control->mol == NULL || + out_control->dpl == NULL || out_control->drft == NULL || + out_control->pdb == NULL ) + { + fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." ); + exit( CANNOT_OPEN_OUTFILE ); + }*/ +} + + +void Initialize(reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, list **lists, + output_controls *out_control, evolve_function *Evolve) +{ + Randomize(); + + Init_System( system, control, data ); + + Init_Simulation_Data( system, control, data, out_control, Evolve ); + + Init_Workspace( system, control, workspace ); + + Init_Lists( system, control, data, workspace, lists, out_control ); + + Init_Out_Controls( system, control, workspace, out_control ); + + /* These are done in forces.c, as only forces.c can see all those functions */ + Init_Bonded_Force_Functions( control ); + +#ifdef TEST_FORCES + Init_Force_Test_Functions( ); +#endif + + if( control->tabulate ) + Make_LR_Lookup_Table( system, control ); + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "data structures have been initialized...\n" ); +#endif +} diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu deleted file mode 100644 index e1912d3c2bd139e2dfcda43b23c77adfcd325782..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/init_md.cu +++ /dev/null @@ -1,1361 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. 
- ----------------------------------------------------------------------*/ - -#include "init_md.h" -#include "allocate.h" -#include "box.h" -#include "forces.h" -#include "grid.h" -#include "GMRES.h" -#include "integrate.h" -#include "neighbors.h" -#include "list.h" -#include "lookup.h" -#include "print_utils.h" -#include "reset_utils.h" -#include "system_props.h" -#include "traj.h" -#include "vector.h" - - -#include "cuda_init.h" -#include "cuda_copy.h" -#include "cuda_utils.h" -#include "helpers.h" -#include "reduction.h" - -#include "index_utils.h" - -#include "validation.h" - -void Generate_Initial_Velocities(reax_system *system, real T ) -{ - int i; - real scale, norm; - - - if( T <= 0.1 ) { - for (i=0; i < system->N; i++) - rvec_MakeZero( system->atoms[i].v ); -#if defined(DEBUG) - fprintf( stderr, "no random velocities...\n" ); -#endif - } - else { - for( i = 0; i < system->N; i++ ) { - rvec_Random( system->atoms[i].v ); - - norm = rvec_Norm_Sqr( system->atoms[i].v ); - scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * - norm / (3.0 * K_B * T) ); - - rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v ); - - /* - fprintf( stderr, "v = %f %f %f\n", - system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); - fprintf( stderr, "scale = %f\n", scale ); - fprintf( stderr, "v = %f %f %f\n", - system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]); - */ - } - } -} - - -void Init_System( reax_system *system, control_params *control, - simulation_data *data ) -{ - int i; - rvec dx; - - if( !control->restart ) - Reset_Atoms( system ); - - Compute_Total_Mass( system, data ); - - Compute_Center_of_Mass( system, data, stderr ); - - /* reposition atoms */ - // just fit the atoms to the periodic box - if( control->reposition_atoms == 0 ) { - rvec_MakeZero( dx ); - } - // put the center of mass to the center of the box - else if( control->reposition_atoms == 1 ) { - rvec_Scale( dx, 0.5, system->box.box_norms ); - rvec_ScaledAdd( dx, -1., data->xcm ); - } - // put the center of mass to the origin - else if( control->reposition_atoms == 2 ) { - rvec_Scale( dx, -1., data->xcm ); - } - else { - fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. 
Terminating...\n" ); - exit( UNKNOWN_OPTION ); - } - - for( i = 0; i < system->N; ++i ) { - Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); - /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", - i, system->atoms[i].type, - system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/ - } - - /* Initialize velocities so that desired init T can be attained */ - if( !control->restart || (control->restart && control->random_vel) ) { - Generate_Initial_Velocities( system, control->T_init ); - } - - Setup_Grid( system ); -} - - -void Cuda_Init_System( reax_system *system, control_params *control, - simulation_data *data ) -{ - int i; - rvec dx; - - if( !control->restart ) - Cuda_Reset_Atoms( system ); - - Cuda_Compute_Total_Mass( system, data ); - - Cuda_Compute_Center_of_Mass( system, data, stderr ); - - /* reposition atoms */ - // just fit the atoms to the periodic box - if( control->reposition_atoms == 0 ) { - rvec_MakeZero( dx ); - } - // put the center of mass to the center of the box - else if( control->reposition_atoms == 1 ) { - rvec_Scale( dx, 0.5, system->box.box_norms ); - rvec_ScaledAdd( dx, -1., data->xcm ); - } - // put the center of mass to the origin - else if( control->reposition_atoms == 2 ) { - rvec_Scale( dx, -1., data->xcm ); - } - else { - fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" ); - exit( UNKNOWN_OPTION ); - } - - compute_Inc_on_T3 <<<BLOCKS_POW_2, BLOCK_SIZE>>> - (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]); - cudaThreadSynchronize (); - cudaCheckError (); - - //copy back the atoms from device to the host - copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , - cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS ); - - /* Initialize velocities so that desired init T can be attained */ - if( !control->restart || (control->restart && control->random_vel) ) { - Generate_Initial_Velocities( system, control->T_init ); - } - - Setup_Grid( system ); -} - - - -void Init_Simulation_Data( reax_system *system, control_params *control, - simulation_data *data, output_controls *out_control, - evolve_function *Evolve ) -{ - - Reset_Simulation_Data( data ); - - if( !control->restart ) - data->step = data->prev_steps = 0; - - switch( control->ensemble ) { - case NVE: - data->N_f = 3 * system->N; - *Evolve = Velocity_Verlet_NVE; - break; - - - case NVT: - data->N_f = 3 * system->N + 1; - //control->Tau_T = 100 * data->N_f * K_B * control->T_final; - if( !control->restart || (control->restart && control->random_vel) ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->therm.v_xi_old = 0; - data->therm.xi = 0; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", - data->therm.G_xi, control->Tau_T, data->E_Kin, - data->N_f, data->therm.v_xi ); -#endif - } - - *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein; - break; - - - case NPT: // Anisotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 9; - if( !control->restart ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->iso_bar.eps = 0.33333 * log(system->box.volume); - //data->inv_W = 1. 
/ (data->N_f*K_B*control->T*SQR(control->Tau_P)); - //Compute_Pressure( system, data, workspace ); - } - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - - case sNPT: // Semi-Isotropic NPT - data->N_f = 3 * system->N + 4; - *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; - break; - - - case iNPT: // Isotropic NPT - data->N_f = 3 * system->N + 2; - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - case bNVT: //berendensen NVT - data->N_f = 3 * system->N + 1; - *Evolve = Velocity_Verlet_Berendsen_NVT; - break; - - default: - break; - } - - Compute_Kinetic_Energy( system, data ); - - /* init timing info for the host*/ - data->timing.start = Get_Time( ); - data->timing.total = data->timing.start; - data->timing.nbrs = 0; - data->timing.init_forces = 0; - data->timing.bonded = 0; - data->timing.nonb = 0; - data->timing.QEq = 0; - data->timing.matvecs = 0; -} - - -void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, - simulation_data *data, output_controls *out_control, - evolve_function *Evolve ) -{ - - Reset_Simulation_Data( data ); - - if( !control->restart ) - data->step = data->prev_steps = 0; - - switch( control->ensemble ) { - case NVE: - data->N_f = 3 * system->N; - *Evolve = Cuda_Velocity_Verlet_NVE; - break; - - - case NVT: - data->N_f = 3 * system->N + 1; - //control->Tau_T = 100 * data->N_f * K_B * control->T_final; - if( !control->restart || (control->restart && control->random_vel) ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->therm.v_xi_old = 0; - data->therm.xi = 0; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n", - data->therm.G_xi, control->Tau_T, data->E_Kin, - data->N_f, data->therm.v_xi ); -#endif - } - - *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein; - break; - - - case NPT: // Anisotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 9; - if( !control->restart ) { - data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - - data->N_f * K_B * control->T ); - data->therm.v_xi = data->therm.G_xi * control->dt; - data->iso_bar.eps = 0.33333 * log(system->box.volume); - //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P)); - //Compute_Pressure( system, data, workspace ); - } - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - - case sNPT: // Semi-Isotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 4; - *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT; - break; - - - case iNPT: // Isotropic NPT - fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" ); - exit( UNKNOWN_OPTION ); - data->N_f = 3 * system->N + 2; - *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT; - break; - - case bNVT: //berendensen NVT - data->N_f = 3 * system->N + 1; - *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT; - break; - - default: - break; - } - - Cuda_Compute_Kinetic_Energy (system, data); - -#ifdef __BUILD_DEBUG__ - real t_E_Kin = 0; - t_E_Kin = data->E_Kin; -#endif - - copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, - REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - data->therm.T = (2. 
* data->E_Kin) / (data->N_f * K_B); - if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! - data->therm.T = ALMOST_ZERO; - -#ifdef __BUILD_DEBUG__ - if (check_zero (t_E_Kin, data->E_Kin)){ - fprintf (stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin ); - exit (1); - } - //validate_data ( system, data ); -#endif - - /* init timing info for the host*/ - data->timing.start = Get_Time( ); - data->timing.total = data->timing.start; - data->timing.nbrs = 0; - data->timing.init_forces = 0; - data->timing.bonded = 0; - data->timing.nonb = 0; - data->timing.QEq = 0; - data->timing.matvecs = 0; - - /* init timing info for the device */ - d_timing.start = Get_Time( ); - d_timing.total = data->timing.start; - d_timing.nbrs = 0; - d_timing.init_forces = 0; - d_timing.bonded = 0; - d_timing.nonb = 0; - d_timing.QEq = 0; - d_timing.matvecs = 0; -} - - -void Init_Workspace( reax_system *system, control_params *control, - static_storage *workspace ) -{ - int i; - - /* Allocate space for hydrogen bond list */ - workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) ); - - /* bond order related storage */ - workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) ); - workspace->Deltap = (real *) malloc( system->N * sizeof( real ) ); - workspace->Deltap_boc = (real *) malloc( system->N * sizeof( real ) ); - workspace->dDeltap_self = (rvec *) malloc( system->N * sizeof( rvec ) ); - - workspace->Delta = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_lp = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); - workspace->dDelta_lp = (real *) malloc( system->N * sizeof( real ) ); - workspace->dDelta_lp_temp = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_e = (real *) malloc( system->N * sizeof( real ) ); - workspace->Delta_boc = (real *) malloc( system->N * sizeof( real ) ); - workspace->nlp = (real *) malloc( system->N * sizeof( real ) ); - workspace->nlp_temp = (real *) malloc( system->N * sizeof( real ) ); - workspace->Clp = (real *) malloc( system->N * sizeof( real ) ); - workspace->CdDelta = (real *) malloc( system->N * sizeof( real ) ); - workspace->vlpex = (real *) malloc( system->N * sizeof( real ) ); - - /* QEq storage */ - //workspace->H = NULL; - //workspace->L = NULL; - //workspace->U = NULL; - // - workspace->H.start = NULL; - workspace->L.start = NULL; - workspace->U.start = NULL; - - workspace->H.entries = NULL; - workspace->L.entries = NULL; - workspace->U.entries = NULL; - - workspace->droptol = (real *) calloc( system->N, sizeof( real ) ); - workspace->w = (real *) calloc( system->N, sizeof( real ) ); - workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) ); - workspace->b = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->b_s = (real *) calloc( system->N, sizeof( real ) ); - workspace->b_t = (real *) calloc( system->N, sizeof( real ) ); - workspace->b_prc = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->b_prm = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->s_t = (real *) calloc( system->N * 2, sizeof( real ) ); - workspace->s = (real *) calloc( 5 * system->N, sizeof( real ) ); - workspace->t = (real *) calloc( 5 * system->N, sizeof( real ) ); - // workspace->s_old = (real *) calloc( system->N, sizeof( real ) ); - // workspace->t_old = (real *) calloc( system->N, sizeof( real ) ); - // workspace->s_oldest = (real *) calloc( 
system->N, sizeof( real ) ); - // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) ); - - for( i = 0; i < system->N; ++i ) { - workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta; - workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; - workspace->b_t[i] = -1.0; - - workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; - workspace->b[i+system->N] = -1.0; - } - - /* GMRES storage */ - workspace->y = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->z = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->g = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->hs = (real *) calloc( RESTART+1, sizeof( real ) ); - workspace->hc = (real *) calloc( RESTART+1, sizeof( real ) ); - - workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) ); - workspace->v = (real *) calloc( (RESTART+1)*system->N, sizeof( real) ); - workspace->h = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) ); - - /* CG storage */ - workspace->r = (real *) calloc( system->N, sizeof( real ) ); - workspace->d = (real *) calloc( system->N, sizeof( real ) ); - workspace->q = (real *) calloc( system->N, sizeof( real ) ); - workspace->p = (real *) calloc( system->N, sizeof( real ) ); - - - /* integrator storage */ - workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) ); - - - /* storage for analysis */ - if( control->molec_anal || control->diffusion_coef ) - { - workspace->mark = (int *) calloc( system->N, sizeof(int) ); - workspace->old_mark = (int *) calloc( system->N, sizeof(int) ); - } - else - workspace->mark = workspace->old_mark = NULL; - - if( control->diffusion_coef ) - workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) ); - else workspace->x_old = NULL; - - -#ifdef TEST_FORCES - workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) ); - workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) ); -#endif - - workspace->realloc.num_far = -1; - workspace->realloc.Htop = -1; - workspace->realloc.hbonds = -1; - workspace->realloc.bonds = -1; - workspace->realloc.num_3body = -1; - workspace->realloc.gcell_atoms = -1; - - Reset_Workspace( system, workspace ); -} - -void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N) -{ - int index = 0; - int count = 0; - int jicount = 0; - int end_index, gpu_index, gpu_end, k; - far_neighbor_data gpu, cpu; - - /* - for (int i = 0; i < N ; i++ ) - { - if (test[i] != start[i]) { - fprintf (stderr, "start index does not match \n"); - exit (0); - } - - if (test[i+1] != 
(end[i]) ){ - fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]); - exit (0); - } - } - */ - - - for (int i = 0; i < N; i++){ - index = Start_Index (i, slist); - //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]); - - - for (int j = start[i]; j < end[i]; j++){ - gpu = data[j]; - - if (i < data[j].nbr) continue; - /* - if (i < data[j].nbr) { - //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j); - int src = data[j].nbr; - int dest = i; - int x; - - - for (x = start[src]; x < end[src]; x++) { - if (data[x].nbr != dest) continue; - - gpu = data[x]; - cpu = data[j]; - - if ( (gpu.d != cpu.d) || - (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, - data[j].d, - data[j].rel_box[0], - data[j].rel_box[1], - data[j].rel_box[2], - data[j].dvec[0], - data[j].dvec[1], - data[j].dvec[2] - ); - fprintf (stderr, " atom %d neighbor %d (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr, - data[x].d, - data[x].rel_box[0], - data[x].rel_box[1], - data[x].rel_box[2], - data[x].dvec[0], - data[x].dvec[1], - data[x].dvec[2] - ); - jicount++; - } - break; - } - - if (x >= end[src]) { - fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src ); - exit (0); - } - - continue; - } - */ - - cpu = slist->select.far_nbr_list[index]; - //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){ - //if ( (gpu->d != cpu->d) ){ - if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) || - (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) || - (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { - //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) || - // (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) { - //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){ - - fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); - fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index); - - /* - fprintf (stdout, "Far neighbors does not match atom: %d \n", i ); - fprintf (stdout, "neighbor %d , %d \n", cpu.nbr, gpu.nbr); - fprintf (stdout, "d %f , %f \n", slist->select.far_nbr_list[index].d, data[j].d); - fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", - cpu.dvec[0], cpu.dvec[1], cpu.dvec[2], - gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] ); - - fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", - cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2], - gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] ); - - */ - count ++; - } - - //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d); - index ++; - } - - if (index != End_Index (i, slist)) - { - fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, slist), End_Index(i, slist), - start[i], end[i]); - exit (10); - } - } - - fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d reverse %d \n", count, jicount); - - /* - for (int i = 0; i < N; i++) - { - index = Start_Index (i, slist); - end_index = 
End_Index (i, slist); - - gpu_index = start[i]; - gpu_end = end[i]; - for (int j = index; j < end_index; j++) - { - far_neighbor_data *cpu = &slist->select.far_nbr_list[j]; - far_neighbor_data *gpu; - - for (k = gpu_index; k < gpu_end; k++) { - gpu = &data[k]; - if (gpu->nbr == cpu->nbr) break; - } - - if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); } - - if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) || - ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) || - ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) { - - fprintf (stderr, "Far neighbors does not match atom: %d \n", i ); - fprintf (stderr, "neighbor %d , %d \n", cpu->nbr, gpu->nbr); - fprintf (stderr, "d %d , %d \n", cpu->d, gpu->d); - fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", - cpu->dvec[0], cpu->dvec[1], cpu->dvec[2], - gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] ); - - fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", - cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2], - gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] ); - fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end ); - - exit (1); - } - } - } - - */ - } - - int Estimate_Device_Matrix (reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int *indices, *Htop; - list *far_nbrs = dev_lists + FAR_NBRS; - int max_sparse_entries = 0; - real t1, t2; - - indices = (int *) scratch; - cuda_memset ( indices, 0, INT_SIZE * system->N, RES_SCRATCH ); - - t1 = Get_Time (); - - Estimate_Sparse_Matrix_Entries <<<BLOCKS, BLOCK_SIZE>>> - ( system->d_atoms, (control_params *)control->d_control, - (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, - *far_nbrs, system->N, indices ); - cudaThreadSynchronize (); - cudaCheckError (); - - t2 = Get_Timing_Info ( t1 ); - - //fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 ); - - Htop = (int *) malloc (INT_SIZE * (system->N + 1)); - memset (Htop, 0, INT_SIZE * (system->N + 1)); - copy_host_device (Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__); - - for (int i = 0; i < system->N; i++) - { - if (max_sparse_entries < Htop[i]) { - max_sparse_entries = Htop[i]; - } - } - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Max sparse entries for this run are ---> %d \n", max_sparse_entries ); -#endif - - return max_sparse_entries * SAFE_ZONE; - //return max_sparse_entries; - } - - void Allocate_Device_Matrix (reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - - //Allocate space for the sparse Matrix entries here. 
- system->max_sparse_matrix_entries = - Estimate_Device_Matrix (system, control, data, workspace, lists, out_control ); - dev_workspace->H.n = system->N ; - dev_workspace->H.m = system->N * system->max_sparse_matrix_entries; - Cuda_Init_Sparse_Matrix (&dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N ); - -#ifdef __CUDA_MEM__ - fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", - system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) ); -#endif - } - - void Cuda_Init_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; - int *hb_top, *bond_top; - - real t_start, t_elapsed; - - grid *g = &( system->g ); - int *d_indices = (int *) scratch; - int total = g->ncell[0] * g->ncell[1] * g->ncell[2]; - - cuda_memset ( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); - -#ifdef __BUILD_DEBUG__ - for (int i = 0; i < g->max_nbrs; i ++) { - if ((g->nbrs[i][0] >= g->ncell[0]) || - (g->nbrs[i][1] >= g->ncell[1]) || - (g->nbrs[i][2] >= g->ncell[2]) ) { - fprintf (stderr, " Grid Incorrectly built.... \n"); - exit (1); - } - - } -#endif - - dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); - dim3 threadsperblock (system->g.max_atoms); - -#ifdef __BUILD_DEBUG__ - fprintf (stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); - fprintf (stderr, "Estimate Num Neighbors with threads per block as %d \n", system->d_g.max_atoms); - fprintf (stderr, "Max nbrs %d \n", system->d_g.max_nbrs); -#endif - - - //First Bin atoms and they sync the host and the device for the grid. - //This will copy the atoms from host to device. - Cuda_Bin_Atoms (system, workspace); - Sync_Host_Device (&system->g, &system->d_g, cudaMemcpyHostToDevice ); - - Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, d_indices); - cudaThreadSynchronize (); - cudaCheckError (); - - int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); - memset (nbrs_indices , 0, INT_SIZE * (system->N + 1)); - - nbrs_indices [0] = 0; - copy_host_device (&nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - - for (int i = 1; i <= system->N; i++) - nbrs_indices [i] += nbrs_indices [i-1]; - - num_nbrs = nbrs_indices [system->N] ; - system->num_nbrs = num_nbrs; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]); - fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs); -#endif - - - list *far_nbrs = (dev_lists + FAR_NBRS); - if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE) ) { - fprintf(stderr, "Problem in initializing far nbrs list. 
Terminating!\n"); - exit( INIT_ERR ); - } - -#ifdef __CUDA_MEM__ - fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", - num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); -#endif - - copy_host_device (nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ ); - copy_host_device (nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ ); - Cuda_Generate_Neighbor_Lists (system, workspace, control, false); - -#ifdef __BUILD_DEBUG__ - - int *end = (int *)malloc (sizeof (int) * system->N); - int *start = (int *) malloc (sizeof (int) * system->N ); - - copy_host_device (start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0); - copy_host_device (end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0); - - far_neighbor_data *far_data = (far_neighbor_data *) - malloc (FAR_NEIGHBOR_SIZE * num_nbrs); - copy_host_device (far_data, far_nbrs->select.far_nbr_list, - FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0); - - compare_far_neighbors (nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N); - - free (start); - free (end); -#endif - - int *output, size; - size = INT_SIZE * 2 * system->N + 2; - output = (int *) malloc (size); - Cuda_Estimate_Storage_Sizes (system, control, output); - - Htop = output[0]; - num_3body = output[1]; - hb_top = &output[ 2 ]; - bond_top = &output[ 2 + system->N ]; - -#ifdef __DEBUG_CUDA__ - int max_hbonds = 0; - int min_hbonds = 1000; - int max_bonds = 0; - int min_bonds = 1000; - for (int i = 0; i < system->N; i++) { - if ( max_hbonds < hb_top[i]) - max_hbonds = hb_top[i]; - if (min_hbonds > hb_top[i]) - min_hbonds = hb_top[i]; - - if (max_bonds < bond_top [i]) - max_bonds = bond_top[i]; - if (min_bonds > bond_top[i]) - min_bonds = bond_top[i]; - } - - fprintf (stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds ); - fprintf (stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds ); - fprintf (stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body ); -#endif - - Allocate_Device_Matrix (system, control, data, workspace, lists, out_control ); - - dev_workspace->num_H = 0; - - if( control->hb_cut > 0 ) { - - int *hbond_index = (int *) malloc ( INT_SIZE * system->N ); - // init H indexes - num_hbonds = 0; - for( i = 0; i < system->N; ++i ) - if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || - system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2 ) // H atom - //hbond_index[i] = workspace->num_H++; - hbond_index[i] = num_hbonds ++; - else - hbond_index[i] = -1; - - copy_host_device (hbond_index, dev_workspace->hbond_index, - system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX ); - dev_workspace->num_H = num_hbonds; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Device num_H --> %d \n", dev_workspace->num_H ); -#endif - - Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index, - hb_top, (dev_lists+HBONDS) ); - num_hbonds = hb_top[system->N-1]; - system->num_hbonds = num_hbonds; - -#ifdef __CUDA_MEM__ - fprintf (stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", - sizeof (hbond_data) * num_hbonds / (1024*1024)); -#endif - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Device Total number of HBonds --> %d \n", num_hbonds ); -#endif - - free (hbond_index); - } - - // bonds list - Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS ); - num_bonds = bond_top[system->N-1]; - system->num_bonds = num_bonds; - -#ifdef 
__CUDA_MEM__ - fprintf (stderr, "Device memory allocated: Bonds list: %ld (MB) \n", - sizeof (bond_data) * num_bonds / (1024*1024)); -#endif - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Device Total Bonds --> %d \n", num_bonds ); -#endif - - // system->max_thb_intrs = num_3body; - // 3bodies list - //if(!Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) { - // fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); - // exit( INIT_ERR ); - //} - - //fprintf( stderr, "***memory allocated: three_body = %ldMB\n", - // num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) ); - //fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data)); - - //Free local resources - free (output); - free (nbrs_indices); - } - - - void Init_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop; - int *hb_top, *bond_top; - - real t_start, t_elapsed; - - num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists ); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs); -#endif - - if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) { - fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n"); - exit( INIT_ERR ); - } -#if defined(DEBUG_FOCUS) - fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", - num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); -#endif - - t_start = Get_Time (); - Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control); - t_elapsed = Get_Timing_Info ( t_start ); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed ); -#endif - - Htop = 0; - hb_top = (int*) calloc( system->N, sizeof(int) ); - bond_top = (int*) calloc( system->N, sizeof(int) ); - num_3body = 0; - Estimate_Storage_Sizes( system, control, lists, - &Htop, hb_top, bond_top, &num_3body ); - - Allocate_Matrix( &(workspace->H), system->N, Htop ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - Htop: %d\n", Htop ); - fprintf( stderr, "memory allocated: H = %ldMB\n", - Htop * sizeof(sparse_matrix_entry) / (1024*1024) ); -#endif - - workspace->num_H = 0; - if( control->hb_cut > 0 ) { - /* init H indexes */ - for( i = 0; i < system->N; ++i ) - if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom - workspace->hbond_index[i] = workspace->num_H++; - else workspace->hbond_index[i] = -1; - - Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, - hb_top, (*lists)+HBONDS ); - num_hbonds = hb_top[system->N-1]; - -#ifdef __DEBUG_CUDA__ - fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds ); -#endif - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds ); - fprintf( stderr, "memory allocated: hbonds = %ldMB\n", - num_hbonds * sizeof(hbond_data) / (1024*1024) ); -#endif - } - - /* bonds list */ - Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS ); - num_bonds = bond_top[system->N-1]; -#if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds ); - fprintf( stderr, "memory allocated: bonds = %ldMB\n", - num_bonds * sizeof(bond_data) / (1024*1024) ); -#endif - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " host num_3body : %d \n", num_3body); - fprintf (stderr, " host 
num_bonds : %d \n", num_bonds); -#endif - - /* 3bodies list */ - if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) { - fprintf( stderr, "Problem in initializing angles list. Terminating!\n" ); - exit( INIT_ERR ); - } -#if defined(DEBUG_FOCUS) - fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body ); - fprintf( stderr, "memory allocated: 3-body = %ldMB\n", - num_3body * sizeof(three_body_interaction_data) / (1024*1024) ); -#endif -#ifdef TEST_FORCES - if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) { - fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" ); - exit( INIT_ERR ); - } - - if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) { - fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" ); - exit( INIT_ERR ); - } -#endif - - free( hb_top ); - free( bond_top ); - } - - - void Init_Out_Controls(reax_system *system, control_params *control, - static_storage *workspace, output_controls *out_control) - { - char temp[1000]; - - /* Init trajectory file */ - if( out_control->write_steps > 0 ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".trj" ); - out_control->trj = fopen( temp, "w" ); - out_control->write_header( system, control, workspace, out_control ); - } - - if( out_control->energy_update_freq > 0 ) { - /* Init out file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".out" ); - out_control->out = fopen( temp, "w" ); - fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n", - "step", "total energy", "poten. energy", "kin. energy", - "temp.", "target", "volume", "press.", "target" ); - fflush( out_control->out ); - - /* Init potentials file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".pot" ); - out_control->pot = fopen( temp, "w" ); - fprintf( out_control->pot, - "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n", - "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", - "etor", "econj", "evdw","ecoul", "epol" ); - fflush( out_control->pot ); - - /* Init log file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".log" ); - out_control->log = fopen( temp, "w" ); - fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", - "step", "total", "neighbors", "init", "bonded", - "nonbonded", "QEq", "matvec" ); - } - - /* Init pressure file */ - if( control->ensemble == NPT || - control->ensemble == iNPT || - control->ensemble == sNPT ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".prs" ); - out_control->prs = fopen( temp, "w" ); - fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n", - "step", "norm_x", "norm_y", "norm_z", - "press_x", "press_y", "press_z", "target_p", "volume" ); - fflush( out_control->prs ); - } - - /* Init molecular analysis file */ - if( control->molec_anal ) { - sprintf( temp, "%s.mol", control->sim_name ); - out_control->mol = fopen( temp, "w" ); - if( control->num_ignored ) { - sprintf( temp, "%s.ign", control->sim_name ); - out_control->ign = fopen( temp, "w" ); - } - } - - /* Init electric dipole moment analysis file */ - if( control->dipole_anal ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".dpl" ); - out_control->dpl = fopen( temp, "w" ); - fprintf( out_control->dpl, - "Step Molecule Count Avg. 
Dipole Moment Norm\n" ); - fflush( out_control->dpl ); - } - - /* Init diffusion coef analysis file */ - if( control->diffusion_coef ) { - strcpy( temp, control->sim_name ); - strcat( temp, ".drft" ); - out_control->drft = fopen( temp, "w" ); - fprintf( out_control->drft, "Step Type Count Avg Squared Disp\n" ); - fflush( out_control->drft ); - } - - -#ifdef TEST_ENERGY - /* open bond energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ebond" ); - out_control->ebond = fopen( temp, "w" ); - - /* open lone-pair energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".elp" ); - out_control->elp = fopen( temp, "w" ); - - /* open overcoordination energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".eov" ); - out_control->eov = fopen( temp, "w" ); - - /* open undercoordination energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".eun" ); - out_control->eun = fopen( temp, "w" ); - - /* open angle energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".eval" ); - out_control->eval = fopen( temp, "w" ); - - /* open penalty energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".epen" ); - out_control->epen = fopen( temp, "w" ); - - /* open coalition energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ecoa" ); - out_control->ecoa = fopen( temp, "w" ); - - /* open hydrogen bond energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ehb" ); - out_control->ehb = fopen( temp, "w" ); - - /* open torsion energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".etor" ); - out_control->etor = fopen( temp, "w" ); - - /* open conjugation energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".econ" ); - out_control->econ = fopen( temp, "w" ); - - /* open vdWaals energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".evdw" ); - out_control->evdw = fopen( temp, "w" ); - - /* open coulomb energy file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ecou" ); - out_control->ecou = fopen( temp, "w" ); -#endif - - -#ifdef TEST_FORCES - /* open bond orders file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fbo" ); - out_control->fbo = fopen( temp, "w" ); - - /* open bond orders derivatives file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fdbo" ); - out_control->fdbo = fopen( temp, "w" ); - - /* open bond forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fbond" ); - out_control->fbond = fopen( temp, "w" ); - - /* open lone-pair forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".flp" ); - out_control->flp = fopen( temp, "w" ); - - /* open overcoordination forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fatom" ); - out_control->fatom = fopen( temp, "w" ); - - /* open angle forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".f3body" ); - out_control->f3body = fopen( temp, "w" ); - - /* open hydrogen bond forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fhb" ); - out_control->fhb = fopen( temp, "w" ); - - /* open torsion forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".f4body" ); - out_control->f4body = fopen( temp, "w" ); - - /* open nonbonded forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".fnonb" ); - out_control->fnonb = fopen( temp, "w" ); - - /* open total force file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ftot" ); - out_control->ftot = fopen( temp, "w" ); - - 
/* open coulomb forces file */ - strcpy( temp, control->sim_name ); - strcat( temp, ".ftot2" ); - out_control->ftot2 = fopen( temp, "w" ); -#endif - - - /* Error handling */ - /* if ( out_control->out == NULL || out_control->pot == NULL || - out_control->log == NULL || out_control->mol == NULL || - out_control->dpl == NULL || out_control->drft == NULL || - out_control->pdb == NULL ) - { - fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." ); - exit( CANNOT_OPEN_OUTFILE ); - }*/ - } - - - void Initialize(reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists, - output_controls *out_control, evolve_function *Evolve) - { - Randomize(); - - Init_System( system, control, data ); - - Init_Simulation_Data( system, control, data, out_control, Evolve ); - - Init_Workspace( system, control, workspace ); - - Init_Lists( system, control, data, workspace, lists, out_control ); - - Init_Out_Controls( system, control, workspace, out_control ); - - /* These are done in forces.c, only forces.c can see all those functions */ - Init_Bonded_Force_Functions( control ); -#ifdef TEST_FORCES - Init_Force_Test_Functions( ); -#endif - - if( control->tabulate ) - Make_LR_Lookup_Table( system, control ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "data structures have been initialized...\n" ); -#endif - } - - void Cuda_Initialize(reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, list **lists, - output_controls *out_control, evolve_function *Evolve) - { - Randomize (); - - Cuda_Init_Scratch (); - - //System - Cuda_Init_System (system); - Sync_Host_Device ( system, cudaMemcpyHostToDevice ); - Cuda_Init_System (system, control, data ); - - //Simulation Data - copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , - cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS ); - Cuda_Init_Simulation_Data (data); - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); - Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve ); - Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); - - //static storage - Cuda_Init_Workspace_System ( system, dev_workspace ); - Cuda_Init_Workspace ( system, control, dev_workspace ); - Cuda_Init_Workspace_Device (workspace); - - //control - Cuda_Init_Control (control); - - //Grid - Cuda_Init_Grid (&system->g, &system->d_g ); - - //lists - Cuda_Init_Lists (system, control, data, workspace, lists, out_control ); - - Init_Out_Controls( system, control, workspace, out_control ); - - if( control->tabulate ) { - real start, end; - start = Get_Time (); - Make_LR_Lookup_Table( system, control ); - copy_LR_table_to_device (system, control ); - end = Get_Timing_Info ( start ); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done copying the LR table to the device ---> %f \n", end ); -#endif - } - } diff --git a/PuReMD-GPU/src/init_md.h b/PuReMD-GPU/src/init_md.h index 65e12348eef4830a6eede11c881b9f46ffa283e8..8c23806594a8f2b107ddb884efbf68e7b5fe27ff 100644 --- a/PuReMD-GPU/src/init_md.h +++ b/PuReMD-GPU/src/init_md.h @@ -23,10 +23,22 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Initialize( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls*, evolve_function* ); + static_storage*, list**, output_controls*, evolve_function* ); + +void Generate_Initial_Velocities(reax_system *, real ); + +void Init_Out_Controls(reax_system *, 
control_params *, static_storage *, + output_controls *); + +#ifdef __cplusplus +} +#endif -void Cuda_Initialize( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls*, evolve_function* ); #endif diff --git a/PuReMD-GPU/src/integrate.cu b/PuReMD-GPU/src/integrate.c similarity index 65% rename from PuReMD-GPU/src/integrate.cu rename to PuReMD-GPU/src/integrate.c index d079028653d79dfeb5117fa6d95f33b700dee5b1..482a9c89a302c052e9ac44ae2de446c61b1c6a3e 100644 --- a/PuReMD-GPU/src/integrate.cu +++ b/PuReMD-GPU/src/integrate.c @@ -19,6 +19,7 @@ ----------------------------------------------------------------------*/ #include "integrate.h" + #include "allocate.h" #include "box.h" #include "forces.h" @@ -32,10 +33,6 @@ #include "vector.h" #include "list.h" -#include "cuda_utils.h" -#include "reduction.h" -#include "validation.h" - void Velocity_Verlet_NVE(reax_system* system, control_params* control, simulation_data *data, static_storage *workspace, @@ -49,6 +46,7 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control, dt_sqr = SQR(dt); steps = data->step - data->prev_steps; renbr = (steps % control->reneighbor == 0); + #if defined(DEBUG_FOCUS) fprintf( stderr, "step%d: ", data->step ); #endif @@ -63,6 +61,7 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control, rvec_ScaledAdd( system->atoms[i].v, 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); } + #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet1 - "); #endif @@ -70,106 +69,25 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control, Reallocate( system, workspace, lists, renbr ); Reset( system, control, data, workspace, lists ); if( renbr ) + { Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control ); + } Compute_Forces( system, control, data, workspace, lists, out_control ); - for( i = 0; i < system->N; i++ ) { + for( i = 0; i < system->N; i++ ) + { inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; rvec_ScaledAdd( system->atoms[i].v, 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f ); } -#if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet2\n"); -#endif -} - -/////////////////////////////////////////////////////////////////// -//Cuda Function -- Velocity Verlet NVE -/////////////////////////////////////////////////////////////////// - -GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, - single_body_parameters *sbp, - simulation_box *box, - int N, real dt) -{ - real inv_m, dt_sqr; - rvec dx; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - dt_sqr = SQR(dt); - //for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - - rvec_ScaledSum( dx, dt, atoms[i].v, - 0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f ); - Inc_on_T3( atoms[i].x, dx, box ); - - rvec_ScaledAdd( atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); - //} -} - -GLOBAL void Cuda_Velocity_Verlet_NVE_atoms2 (reax_atom *atoms, single_body_parameters *sbp, int N, real dt) -{ - real inv_m; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - //for( i = 0; i < system->N; i++ ) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - rvec_ScaledAdd( atoms[i].v, - 0.5 * dt * -F_CONV * inv_m, atoms[i].f ); - //} -} - -void Cuda_Velocity_Verlet_NVE(reax_system* system, control_params* control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, steps, renbr; - real inv_m, dt, dt_sqr; - rvec dx; - int blocks, block_size; - - dt = control->dt; - 
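/* Like the host Velocity_Verlet_NVE above, the device path implements the standard velocity Verlet scheme: x(t+dt) = x(t) + dt*v(t) + (dt^2/2)*a(t), then v(t+dt) = v(t) + (dt/2)*[a(t) + a(t+dt)], with a = -F_CONV * f / m. Kernel Cuda_Velocity_Verlet_NVE_atoms1 performs the position update plus the first half-kick, the force recomputation supplies a(t+dt), and Cuda_Velocity_Verlet_NVE_atoms2 applies the second half-kick. */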
dt_sqr = SQR(dt); - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: ", data->step ); -#endif - - compute_blocks (&blocks, &block_size, system->N); - Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>> - (system->d_atoms, system->reaxprm.d_sbp, - (simulation_box *)system->d_box, system->N, dt); - cudaThreadSynchronize (); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - "); -#endif - - Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); - Cuda_Reset( system, control, data, workspace, lists ); - - if( renbr ) { - Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true); - } - - Cuda_Compute_Forces( system, control, data, workspace, lists, out_control ); - - Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>> - (system->d_atoms, system->reaxprm.d_sbp, system->N, dt); - cudaThreadSynchronize (); #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet2\n"); #endif } + void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, control_params* control, simulation_data *data, @@ -188,6 +106,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, therm = &( data->therm ); steps = data->step - data->prev_steps; renbr = (steps % control->reneighbor == 0); + #if defined(DEBUG_FOCUS) fprintf( stderr, "step%d: ", data->step ); #endif @@ -197,7 +116,8 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, #endif /* Compute x(t + dt) and copy old forces */ - for (i=0; i < system->N; i++) { + for (i=0; i < system->N; i++) + { inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v, @@ -209,6 +129,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, } /* Compute xi(t + dt) */ therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); + #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet1 - " ); #endif @@ -217,14 +138,17 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, Reset( system, control, data, workspace, lists ); if( renbr ) + { Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control ); + } /* Calculate Forces at time (t + dt) */ Compute_Forces( system,control,data, workspace, lists, out_control ); /* Compute iteration constants for each atom's velocity */ - for( i = 0; i < system->N; ++i ) { + for( i = 0; i < system->N; ++i ) + { inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; rvec_Scale( workspace->v_const[i], @@ -241,7 +165,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, #endif } - v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; E_kin_new = G_xi_new = v_xi_old = 0; itr = 0; @@ -258,7 +181,8 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, //print_sys_atoms (system); #endif - for( i = 0; i < system->N; ++i ) { + for( i = 0; i < system->N; ++i ) + { rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] ); E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * @@ -272,6 +196,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - data->N_f * K_B * control->T ); v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); + #if defined(DEBUG) fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", itr, G_xi_new, v_xi_new, v_xi_old ); @@ -283,7 +208,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, fprintf (stderr, " Iteration 
Count in NVE --> %d \n", itr ); #endif - #ifndef __BUILD_DEBUG__ therm->v_xi_old = therm->v_xi; therm->v_xi = v_xi_new; @@ -296,215 +220,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, } - - -/////////////////////////////////////////////////////////////////// -//Cuda Function -- Velocity_Verlet_Nose_Hoover_NVT_Klein -/////////////////////////////////////////////////////////////////// - -GLOBAL void Compute_X_t_dt (real dt, real dt_sqr, thermostat p_therm, - reax_atom *atoms, single_body_parameters *sbp, - simulation_box *box, - static_storage p_workspace, int N) -{ - - real inv_m; - rvec dx; - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if (i >= N) return; - - static_storage *workspace = &p_workspace; - thermostat *therm = &p_therm; - - /* Compute x(t + dt) and copy old forces */ - //for (i=0; i < system->N; i++) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - - rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v, - 0.5 * dt_sqr * inv_m * -F_CONV, atoms[i].f ); - - Inc_on_T3( atoms[i].x, dx, box ); - - rvec_Copy( workspace->f_old[i], atoms[i].f ); - //} - -} - -GLOBAL void Update_Velocity (reax_atom *atoms, single_body_parameters *sbp, - static_storage p_workspace, real dt, thermostat p_therm, - int N) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - real inv_m; - static_storage *workspace = &p_workspace; - thermostat *therm = &p_therm; - - //for( i = 0; i < system->N; ++i ) { - inv_m = 1.0 / sbp[atoms[i].type].mass; - - rvec_Scale( workspace->v_const[i], - 1.0 - 0.5 * dt * therm->v_xi, atoms[i].v ); - rvec_ScaledAdd( workspace->v_const[i], - 0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] ); - rvec_ScaledAdd( workspace->v_const[i], - 0.5 * dt * inv_m * -F_CONV, atoms[i].f ); - //} -} - -GLOBAL void E_Kin_Reduction (reax_atom *atoms, static_storage p_workspace, - single_body_parameters *sbp, - real *per_block_results, real coef_v, const size_t n) -{ - extern __shared__ real sdata[]; - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - real x = 0; - static_storage *workspace = &p_workspace; - - if(i < n) - { - rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] ); - x = ( 0.5 * sbp[atoms[i].type].mass * - rvec_Dot( atoms[i].v, atoms[i].v ) ); - } - sdata[threadIdx.x] = x; - __syncthreads(); - - for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) - { - if(threadIdx.x < offset) - { - sdata[threadIdx.x] += sdata[threadIdx.x + offset]; - } - - __syncthreads(); - } - - if(threadIdx.x == 0) - { - per_block_results[blockIdx.x] = sdata[0]; - } -} - - -void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control ) -{ - int i, itr, steps, renbr; - real inv_m, coef_v, dt, dt_sqr; - real E_kin_new, G_xi_new, v_xi_new, v_xi_old; - rvec dx; - thermostat *therm; - - real *results = (real *)scratch; - - dt = control->dt; - dt_sqr = SQR( dt ); - therm = &( data->therm ); - steps = data->step - data->prev_steps; - renbr = (steps % control->reneighbor == 0); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old); -#endif - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: ", data->step ); -#endif - - Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>> - (dt, dt_sqr, data->therm, system->d_atoms, - system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N); - cudaThreadSynchronize (); - 
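/* The synchronize above blocks until the kernel finishes so that any asynchronous launch or runtime failure is surfaced by the error check that follows; cudaThreadSynchronize() is the deprecated pre-CUDA 4.0 spelling of cudaDeviceSynchronize(). */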
cudaCheckError (); - - /* Compute xi(t + dt) */ - therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi ); -#if defined(DEBUG_FOCUS) - fprintf( stderr, "verlet1 - " ); -#endif - - Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); - Cuda_Reset( system, control, data, workspace, lists ); - - if( renbr ) { - //generate_neighbor_lists here - Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true); - } - - /* Calculate Forces at time (t + dt) */ - Cuda_Compute_Forces( system,control,data, workspace, lists, out_control ); - - /* Compute iteration constants for each atom's velocity */ - Update_Velocity <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace, - dt, *therm, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - - v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi; - E_kin_new = G_xi_new = v_xi_old = 0; - itr = 0; - do { - itr++; - - /* new values become old in this iteration */ - v_xi_old = v_xi_new; - coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old); - E_kin_new = 0; - - /*reduction for the E_Kin_new here*/ -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old); -#endif - cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH ); - E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> - (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, - results, coef_v, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> - (results, results + BLOCKS_POW_2, BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); - - copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); - - G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - - data->N_f * K_B * control->T ); - v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new ); -#if defined(DEBUG) - fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n", - itr, G_xi_new, v_xi_new, v_xi_old ); -#endif - } - while( fabs(v_xi_new - v_xi_old ) > 1e-5 ); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " Iteration Count in NVE --> %d \n", itr ); -#endif - - therm->v_xi_old = therm->v_xi; - therm->v_xi = v_xi_new; - therm->G_xi = G_xi_new; -#if defined(DEBUG_FOCUS) - fprintf( stderr,"vel scale\n" ); -#endif -} - -/////////////////////////////////////////////////////////////////// -//Cuda Function -- Velocity_Verlet_Nose_Hoover_NVT_Klein -/////////////////////////////////////////////////////////////////// - - /* uses Berendsen-type coupling for both T and P. All box dimensions are scaled by the same amount, there is no change in the angles between axes. 
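Pressure is relaxed toward the set point with a first-order Berendsen correction: each step the three box lengths are multiplied by a common scaling factor, roughly mu = (1 + (dt/Tau_P)*(P - P_target))^(1/3), so the volume drifts to the target while the cell shape is preserved.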
*/ @@ -522,6 +237,7 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, dt = control->dt; steps = data->step - data->prev_steps; renbr = (steps % control->reneighbor == 0); + #if defined(DEBUG_FOCUS) //fprintf( out_control->prs, // "tau_t: %g tau_p: %g dt/tau_t: %g dt/tau_p: %g\n", @@ -530,7 +246,8 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, #endif /* velocity verlet, 1st part */ - for( i = 0; i < system->N; i++ ) { + for( i = 0; i < system->N; i++ ) + { inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; /* Compute x(t + dt) */ rvec_ScaledSum( dx, dt, system->atoms[i].v, @@ -546,6 +263,7 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */ } + #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet1 - " ); #endif @@ -574,6 +292,7 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, } //Compute_Kinetic_Energy( system, data ); Compute_Pressure_Isotropic( system, control, data, out_control ); + #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet2 - " ); #endif @@ -633,6 +352,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, dt = control->dt; steps = data->step - data->prev_steps; renbr = (steps % control->reneighbor == 0); + #if defined(DEBUG_FOCUS) //fprintf( out_control->prs, // "tau_t: %g tau_p: %g dt/tau_t: %g dt/tau_p: %g\n", @@ -657,6 +377,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */ } + #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet1 - " ); #endif @@ -685,6 +406,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, } //Compute_Kinetic_Energy( system, data ); Compute_Pressure_Isotropic( system, control, data, out_control ); + #if defined(DEBUG_FOCUS) fprintf( stderr, "verlet2 - " ); #endif @@ -730,14 +452,12 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, } - /************************************************/ /* BELOW FUNCTIONS ARE NOT BEING USED ANYMORE! */ /* */ /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/ /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/ /************************************************/ - #ifdef ANISOTROPIC void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, @@ -810,7 +530,6 @@ void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, } - void Velocity_Verlet_Isotropic_NPT( reax_system* system, control_params* control, simulation_data *data, @@ -983,8 +702,6 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system, #endif -//////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////// /* uses Berendsen-type coupling for both T and P. All box dimensions are scaled by the same amount, there is no change in the angles between axes. 
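For the Berendsen NVT variant below only the temperature coupling applies: once the kinetic energy at t+dt is known, all velocities are rescaled by lambda = sqrt(1 + (dt/Tau_T)*(T_target/T - 1)), with lambda clamped between MIN_dT and MAX_dT before the square root is taken.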
*/ @@ -1006,6 +723,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system, #if defined(DEBUG_FOCUS) fprintf( stderr, "step%d\n", data->step ); #endif + dt = control->dt; steps = data->step - data->prev_steps; renbr = (steps % control->reneighbor == 0); @@ -1020,6 +738,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system, /* Compute v(t + dt/2) */ rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); } + #if defined(DEBUG_FOCUS) fprintf(stderr, "step%d: verlet1 done\n", data->step); #endif @@ -1040,6 +759,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system, /* Compute v(t + dt) */ rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); } + #if defined(DEBUG_FOCUS) fprintf(stderr, "step%d: verlet2 done\n", data->step); #endif @@ -1065,182 +785,3 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system, data->step ); #endif } - -GLOBAL void ker_update_velocity_1 (reax_atom *atoms, - single_body_parameters *sbp, - real dt, - simulation_box *box, - int N) -{ - real inv_m; - rvec dx; - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - /* velocity verlet, 1st part */ - //for( i = 0; i < system->n; i++ ) { - atom = &(atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute x(t + dt) */ - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - - /* Metin's suggestion to rebox the atoms */ - /* bNVT fix */ - Inc_on_T3( atoms[i].x, dx, box ); - /* bNVT fix */ - - /* Compute v(t + dt/2) */ - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); - //} -} - -void bNVT_update_velocity_part1 (reax_system *system, simulation_box *box, real dt) -{ - ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>> - (system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N); - cudaThreadSynchronize (); - cudaCheckError (); -} - -GLOBAL void ker_update_velocity_2 (reax_atom *atoms, - single_body_parameters *sbp, - real dt, - int N) -{ - reax_atom *atom; - real inv_m; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - /* velocity verlet, 2nd part */ - //for( i = 0; i < system->n; i++ ) { - atom = &(atoms[i]); - inv_m = 1.0 / sbp[atom->type].mass; - /* Compute v(t + dt) */ - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); - //} -} - -void bNVT_update_velocity_part2 (reax_system *system, real dt) -{ - ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, system->reaxprm.d_sbp, dt, system->N); - cudaThreadSynchronize (); - cudaCheckError (); -} - -GLOBAL void ker_scale_velocities (reax_atom *atoms, real lambda, int N) -{ - reax_atom *atom; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - /* Scale velocities and positions at t+dt */ - //for( i = 0; i < system->n; ++i ) { - atom = &(atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); - //} -} - -void bNVT_scale_velocities (reax_system *system, real lambda) -{ - ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, lambda, system->N); - cudaThreadSynchronize (); - cudaCheckError (); -} - -void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, - control_params* control, - simulation_data *data, - static_storage *workspace, - list **lists, - output_controls *out_control - ) -{ - int i, steps, renbr; - real inv_m, dt, lambda; - rvec dx; - reax_atom *atom; - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d\n", data->step ); -#endif - dt = control->dt; - steps = data->step - data->prev_steps; - renbr = (steps % 
control->reneighbor == 0); - - /* velocity verlet, 1st part - for( i = 0; i < system->N; i++ ) { - atom = &(system->atoms[i]); - inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; - // Compute x(t + dt) - rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f ); - rvec_Add( atom->x, dx ); - // Compute v(t + dt/2) - rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f ); - } - */ - bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt); - -#if defined(DEBUG_FOCUS) - fprintf(stderr, "step%d: verlet1 done\n", data->step); -#endif - - Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step ); - Cuda_Reset( system, control, data, workspace, lists ); - - if( renbr ) { - Cuda_Generate_Neighbor_Lists( system, workspace, control, true); - } - - Cuda_Compute_Forces( system, control, data, workspace, - lists, out_control ); - - /* velocity verlet, 2nd part - for( i = 0; i < system->N; i++ ) { - atom = &(system->atoms[i]); - inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass; - // Compute v(t + dt) - rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f ); - } - */ - bNVT_update_velocity_part2 (system, dt); -#if defined(DEBUG_FOCUS) - fprintf(stderr, "step%d: verlet2 done\n", data->step); -#endif - - /* temperature scaler */ - Cuda_Compute_Kinetic_Energy( system, data ); - //get the latest temperature from the device to the host. - copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm, - sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA ); - - lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); - if( lambda < MIN_dT ) - lambda = MIN_dT; - else if (lambda > MAX_dT ) - lambda = MAX_dT; - lambda = SQRT( lambda ); - - //fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda); - - /* Scale velocities and positions at t+dt - for( i = 0; i < system->N; ++i ) { - atom = &(system->atoms[i]); - rvec_Scale( atom->v, lambda, atom->v ); - } - */ - bNVT_scale_velocities (system, lambda); - Cuda_Compute_Kinetic_Energy( system, data ); - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "step%d: scaled velocities\n", - data->step ); -#endif - -} - - diff --git a/PuReMD-GPU/src/integrate.h b/PuReMD-GPU/src/integrate.h index 945b29fa09fb59e8e801b55a7527ca88814e7bc2..6f5848f0de84e8a50ef2c5090194618b61f185fc 100644 --- a/PuReMD-GPU/src/integrate.h +++ b/PuReMD-GPU/src/integrate.h @@ -24,24 +24,19 @@ #include "mytypes.h" void Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); -void Cuda_Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); + static_storage*, list**, output_controls* ); void Velocity_Verlet_Nose_Hoover_NVT( reax_system*, control_params*, - simulation_data*, static_storage*, - list**, output_controls* ); -void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*, simulation_data*, static_storage*, list**, output_controls* ); -void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*, +void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*, simulation_data*, static_storage*, list**, output_controls* ); void Velocity_Verlet_Flexible_NPT( reax_system*, control_params*, - simulation_data*, static_storage*, - list**, output_controls* ); + simulation_data*, static_storage*, + list**, output_controls* ); void Velocity_Verlet_Isotropic_NPT( reax_system*, control_params*, - simulation_data*, 
static_storage*, - list**, output_controls* ); + simulation_data*, static_storage*, + list**, output_controls* ); void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system*, control_params*, simulation_data*, static_storage*, list**, output_controls* ); @@ -50,9 +45,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system*, control_params*, static_storage*, list**, output_controls* ); void Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* , - simulation_data *, static_storage *, - list **, output_controls * ); -void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* , simulation_data *, static_storage *, list **, output_controls * ); + #endif diff --git a/PuReMD-GPU/src/lin_alg.c b/PuReMD-GPU/src/lin_alg.c new file mode 100644 index 0000000000000000000000000000000000000000..cb141d475b0e2cf702901ed551287e0e238cdcd6 --- /dev/null +++ b/PuReMD-GPU/src/lin_alg.c @@ -0,0 +1,676 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "lin_alg.h" + +#include "list.h" +#include "vector.h" +#include "index_utils.h" + + +void Sparse_MatVec( sparse_matrix *A, real *x, real *b ) +{ + int i, j, k, n, si, ei; + real H; + + n = A->n; + for( i = 0; i < n; ++i ) + b[i] = 0; + + for( i = 0; i < n; ++i ) { + si = A->start[i]; + ei = A->start[i+1]-1; + + for( k = si; k < ei; ++k ) { + j = A->entries[k].j; + H = A->entries[k].val; + b[j] += H * x[i]; + b[i] += H * x[j]; + } + + // the diagonal entry is the last one in each row + b[i] += A->entries[k].val * x[i]; + } +} + + +void Forward_Subs( sparse_matrix *L, real *b, real *y ) +{ + int i, pj, j, si, ei; + real val; + + for( i = 0; i < L->n; ++i ) { + y[i] = b[i]; + si = L->start[i]; + ei = L->start[i+1]; + for( pj = si; pj < ei-1; ++pj ){ + j = L->entries[pj].j; + val = L->entries[pj].val; + y[i] -= val * y[j]; + } + y[i] /= L->entries[pj].val; + } +} + + +void Backward_Subs( sparse_matrix *U, real *y, real *x ) +{ + int i, pj, j, si, ei; + real val; + + for( i = U->n-1; i >= 0; --i ) { + x[i] = y[i]; + si = U->start[i]; + ei = U->start[i+1]; + for( pj = si+1; pj < ei; ++pj ){ + j = U->entries[pj].j; + val = U->entries[pj].val; + x[i] -= val * x[j]; + } + x[i] /= U->entries[si].val; + } +} + + +int GMRES( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout, reax_system* system) +{ + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + + N = H->n; + bnorm = Norm( b, N ); + + /* apply the diagonal pre-conditioner to rhs */ + for( i = 0; i < N; ++i ) + workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* calculate r0 */ + Sparse_MatVec( H, x, workspace->b_prm ); + + for( i = 0; i < N; ++i ) +
workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ + + + Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N); + workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N ); + Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system->N)], N ); + + /* GMRES inner-loop */ + for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) { + /* matvec */ + Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] ); + + for( k = 0; k < N; ++k ) + workspace->v[ index_wkspace_sys (j+1,k,system->N)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i <= j; i++ ) { + workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)], + -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N ); + } + + + workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], + 1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); + + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for( i = 0; i <= j; i++ ) { + if( i == j ) { + cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); + workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; + workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + + workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ]; + tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + + workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ]; + + workspace->h[ index_wkspace_res (i,j) ] = tmp1; + workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; + } + + /* apply Givens rotations to the rhs as well */ + tmp1 = workspace->hc[j] * workspace->g[j]; + tmp2 = -workspace->hs[j] * workspace->g[j]; + workspace->g[j] = tmp1; + workspace->g[j+1] = tmp2; + + // fprintf( stderr, "h: " ); + // for( i = 0; i <= j+1; ++i ) + // fprintf( stderr, "%.6f ", workspace->h[i][j] ); + // fprintf( stderr, "\n" ); + //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); + } + + + /* solve Hy = g. 
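(The Givens sweeps above have reduced the (j+1)-by-j upper-Hessenberg least-squares problem to a j-by-j triangular system, with the rotated residual norms accumulated in g.)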
+ H is now upper-triangular, do back-substitution */ + for( i = j-1; i >= 0; i-- ) { + temp = workspace->g[i]; + for( k = j-1; k > i; k-- ) + temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + + workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; + } + + + /* update x = x_0 + Vy */ + for( i = 0; i < j; i++ ) + Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N ); + + /* stopping condition */ + if( fabs(workspace->g[j]) / bnorm <= tol ) + break; + } + + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ + + // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + // data->timing.matvec += itr * RESTART + j; + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; + return itr * (RESTART+1) + j + 1; + } + + return itr * (RESTART+1) + j + 1; +} + + +int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout, reax_system *system) +{ + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + real v[10000], z[RESTART+2][10000], w[RESTART+2]; + real u[RESTART+2][10000]; + + N = H->n; + bnorm = Norm( b, N ); + + /* apply the diagonal pre-conditioner to rhs */ + for( i = 0; i < N; ++i ) + workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i]; + + // memset( x, 0, sizeof(real) * N ); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) { + /* compute z = r0 */ + Sparse_MatVec( H, x, workspace->b_prm ); + for( i = 0; i < N; ++i ) + workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */ + Vector_Sum( z[0], 1., workspace->b_prc, -1., workspace->b_prm, N ); + + Vector_MakeZero( w, RESTART+1 ); + w[0] = Norm( z[0], N ); + + Vector_Copy( u[0], z[0], N ); + u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; + Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); + + w[0] *= ( u[0][0] < 0.0 ? 1 :-1 ); + // fprintf( stderr, "\n\n%12.6f\n", w[0] ); + + /* GMRES inner-loop */ + for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) { + /* compute v_j */ + Vector_Scale( z[j], -2 * u[j][j], u[j], N ); + z[j][j] += 1.; /* due to e_j */ + + for( i = j-1; i >= 0; --i ) + Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i ); + + + /* matvec */ + Sparse_MatVec( H, z[j], v ); + + for( k = 0; k < N; ++k ) + v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */ + + for( i = 0; i <= j; ++i ) + Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i ); + + + if( !Vector_isZero( v + (j+1), N - (j+1) ) ) { + /* compute the HouseHolder unit vector u_j+1 */ + for( i = 0; i <= j; ++i ) + u[j+1][i] = 0; + + Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) ); + + u[j+1][j+1] += ( v[j+1]<0.0 ? 
-1:1 ) * Norm( v+(j+1), N-(j+1) ); + + Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N ); + + /* overwrite v with P_m+1 * v */ + v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1]; + Vector_MakeZero( v + (j+2), N - (j+2) ); + // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N ); + } + + + /* prev Givens rots on the upper-Hessenberg matrix to make it U */ + for( i = 0; i < j; i++ ) { + tmp1 = workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1]; + tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1]; + + v[i] = tmp1; + v[i+1] = tmp2; + } + + /* apply the new Givens rotation to H and right-hand side */ + if( fabs(v[j+1]) >= ALMOST_ZERO ) { + cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) ); + workspace->hc[j] = v[j] / cc; + workspace->hs[j] = v[j+1] / cc; + + tmp1 = workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1]; + tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1]; + + v[j] = tmp1; + v[j+1] = tmp2; + + /* Givens rotations to rhs */ + tmp1 = workspace->hc[j] * w[j]; + tmp2 = -workspace->hs[j] * w[j]; + w[j] = tmp1; + w[j+1] = tmp2; + } + + /* extend R */ + for( i = 0; i <= j; ++i ) + workspace->h[ index_wkspace_res (i,j) ] = v[i]; + + + // fprintf( stderr, "h:" ); + // for( i = 0; i <= j+1 ; ++i ) + // fprintf( stderr, "%.6f ", h[i][j] ); + // fprintf( stderr, "\n" ); + // fprintf( stderr, "%12.6f\n", w[j+1] ); + } + + + /* solve Hy = w. + H is now upper-triangular, do back-substitution */ + for( i = j-1; i >= 0; i-- ) { + temp = w[i]; + for( k = j-1; k > i; k-- ) + temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + + workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; + } + + // fprintf( stderr, "y: " ); + // for( i = 0; i < RESTART+1; ++i ) + // fprintf( stderr, "%8.3f ", workspace->y[i] ); + + + /* update x = x_0 + Vy */ + // memset( z, 0, sizeof(real) * N ); + // for( i = j-1; i >= 0; i-- ) + // { + // Vector_Copy( v, z, N ); + // v[i] += workspace->y[i]; + // + // Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N ); + // } + // + // fprintf( stderr, "\nz: " ); + // for( k = 0; k < N; ++k ) + // fprintf( stderr, "%6.2f ", z[k] ); + + // fprintf( stderr, "\nx_bef: " ); + // for( i = 0; i < N; ++i ) + // fprintf( stderr, "%6.2f ", x[i] ); + + // Vector_Add( x, 1, z, N ); + for( i = j-1; i >= 0; i-- ) + Vector_Add( x, workspace->y[i], z[i], N ); + + // fprintf( stderr, "\nx_aft: " ); + // for( i = 0; i < N; ++i ) + // fprintf( stderr, "%6.2f ", x[i] ); + + /* stopping condition */ + if( fabs( w[j] ) / bnorm <= tol ) + break; + } + + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] ); + + //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; + return itr * (RESTART+1) + j + 1; + } + + return itr * (RESTART+1) + j + 1; +} + + +int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system ) +{ + int i, j, k, itr, N; + real cc, tmp1, tmp2, temp, bnorm; + + N = H->n; + bnorm = Norm( b, N ); + + /* GMRES outer-loop */ + for( itr = 0; itr < MAX_ITR; ++itr ) + { + /* calculate r0 */ + Sparse_MatVec( H, x, 
workspace->b_prm ); + Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system->N)], 1., b, -1., workspace->b_prm, N ); + Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] ); + Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] ); + workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system->N)], N ); + Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system->N)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system->N)], N ); + //fprintf( stderr, "res: %.15e\n", workspace->g[0] ); + + /* GMRES inner-loop */ + for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) + { + /* matvec */ + Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system->N)], &workspace->v[index_wkspace_sys (j+1,0,system->N)] ); + Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] ); + Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] ); + + /* apply modified Gram-Schmidt to orthogonalize the new residual */ + for( i = 0; i < j-1; i++ ) + { + workspace->h[ index_wkspace_res (i,j)] = 0; + } + + //for( i = 0; i <= j; i++ ) { + for( i = MAX(j-1,0); i <= j; i++ ) { + workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N ); + } + + workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system->N)], N ); + Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], + 1. 
/ workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N ); + // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j ); + + /* Givens rotations on the upper-Hessenberg matrix to make it U */ + for( i = MAX(j-1,0); i <= j; i++ ) + { + if( i == j ) + { + cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) ); + workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc; + workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc; + } + + tmp1 = workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + + workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ]; + tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + + workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ]; + + workspace->h[ index_wkspace_res (i,j) ] = tmp1; + workspace->h[ index_wkspace_res (i+1,j) ] = tmp2; + } + + /* apply Givens rotations to the rhs as well */ + tmp1 = workspace->hc[j] * workspace->g[j]; + tmp2 = -workspace->hs[j] * workspace->g[j]; + workspace->g[j] = tmp1; + workspace->g[j+1] = tmp2; + + //fprintf( stderr, "h: " ); + //for( i = 0; i <= j+1; ++i ) + //fprintf( stderr, "%.6f ", workspace->h[i][j] ); + //fprintf( stderr, "\n" ); + //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] ); + } + + + /* solve Hy = g: H is now upper-triangular, do back-substitution */ + for( i = j-1; i >= 0; i-- ) + { + temp = workspace->g[i]; + for( k = j-1; k > i; k-- ) + { + temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k]; + } + + workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)]; + } + + /* update x = x_0 + Vy */ + Vector_MakeZero( workspace->p, N ); + for( i = 0; i < j; i++ ) + Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N ); + //Backward_Subs( U, workspace->p, workspace->p ); + //Forward_Subs( L, workspace->p, workspace->p ); + Vector_Add( x, 1., workspace->p, N ); + + /* stopping condition */ + if( fabs(workspace->g[j]) / bnorm <= tol ) + { + break; + } + } + + // Sparse_MatVec( H, x, workspace->b_prm ); + // for( i = 0; i < N; ++i ) + // workspace->b_prm[i] *= workspace->Hdia_inv[i]; + // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" ); + // for( i = 0; i < N; ++i ) + // fprintf( fout, "%10.5f%15.12f%15.12f\n", + // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/ + + // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", + // itr, j, fabs( workspace->g[j] ) / bnorm ); + // data->timing.matvec += itr * RESTART + j; + + if( itr >= MAX_ITR ) { + fprintf( stderr, "GMRES convergence failed\n" ); + // return -1; + return itr * (RESTART+1) + j + 1; + } + + return itr * (RESTART+1) + j + 1; +} + + +int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, + sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system ) +{ + int i, N; + real tmp, alpha, beta, b_norm, r_norm; + real sig0, sig_old, sig_new; + + N = A->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %.15e\n", b_norm ); + + Sparse_MatVec( A, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + r_norm = Norm(workspace->r, N); + //Print_Soln( workspace, x, q, b, N ); + //fprintf( stderr, "res: %.15e\n", r_norm ); + + Forward_Subs( L, workspace->r, workspace->d ); + Backward_Subs( U, workspace->d, workspace->p ); + sig_new = Dot( workspace->r, workspace->p, N ); + sig0 = sig_new; + + for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) + { + //for( i = 0; 
i < 200 && sig_new > SQR(tol) * sig0; ++i ) { + Sparse_MatVec( A, workspace->p, workspace->q ); + tmp = Dot( workspace->q, workspace->p, N ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->p, N ); + //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n", + // i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp ); + + Vector_Add( workspace->r, -alpha, workspace->q, N ); + r_norm = Norm(workspace->r, N); + //fprintf( stderr, "res: %.15e\n", r_norm ); + + Forward_Subs( L, workspace->r, workspace->d ); + Backward_Subs( U, workspace->d, workspace->d ); + sig_old = sig_new; + sig_new = Dot( workspace->r, workspace->d, N ); + beta = sig_new / sig_old; + Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N ); + } + + //fprintf( fout, "CG took %d iterations\n", i ); + if( i >= 200 ) { + fprintf( stderr, "CG convergence failed!\n" ); + return i; + } + + return i; +} + + +int CG( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout, reax_system *system) +{ + int i, j, N; + real tmp, alpha, beta, b_norm; + real sig_old, sig_new, sig0; + + N = H->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); + + Sparse_MatVec( H, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig_new = Dot( workspace->r, workspace->d, N ); + sig0 = sig_new; + //Print_Soln( workspace, x, q, b, N ); + //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", + // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) ); + //fprintf( stderr, "sig_new: %f\n", sig_new ); + + for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) { + //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) { + Sparse_MatVec( H, workspace->d, workspace->q ); + tmp = Dot( workspace->d, workspace->q, N ); + //fprintf( stderr, "tmp: %f\n", tmp ); + alpha = sig_new / tmp; + Vector_Add( x, alpha, workspace->d, N ); + //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", + // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); + + Vector_Add( workspace->r, -alpha, workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig_old = sig_new; + sig_new = Dot( workspace->r, workspace->p, N ); + beta = sig_new / sig_old; + Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N ); + //fprintf( stderr, "sig_new: %f\n", sig_new ); + } + + fprintf( stderr, "CG took %d iterations\n", i ); + + if( i >= 300 ) { + fprintf( stderr, "CG convergence failed!\n" ); + return i; + } + + return i; +} + + +/* Steepest Descent */ +int SDM( static_storage *workspace, sparse_matrix *H, + real *b, real tol, real *x, FILE *fout ) +{ + int i, j, N; + real tmp, alpha, beta, b_norm; + real sig0, sig; + + N = H->n; + b_norm = Norm( b, N ); + //fprintf( stderr, "b_norm: %10.6f\n", b_norm ); + + Sparse_MatVec( H, x, workspace->q ); + Vector_Sum( workspace->r , 1., b, -1., workspace->q, N ); + for( j = 0; j < N; ++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + sig = Dot( workspace->r, workspace->d, N ); + sig0 = sig; + + for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) { + Sparse_MatVec( H, workspace->d, workspace->q ); + + sig = Dot( workspace->r, workspace->d, N ); + tmp = Dot( workspace->d, workspace->q, N ); + alpha = sig / tmp; + + Vector_Add( x, alpha, workspace->d, N ); + Vector_Add( workspace->r, -alpha, workspace->q, N ); + for( j = 0; j < N; 
++j ) + workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; + + //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n", + // Norm(workspace->d,N), Norm(workspace->q,N), tmp ); + } + + fprintf( stderr, "SDM took %d iterations\n", i ); + + if( i >= 300 ) { + fprintf( stderr, "SDM convergence failed!\n" ); + return i; + } + + return i; +} diff --git a/PuReMD-GPU/src/GMRES.h b/PuReMD-GPU/src/lin_alg.h similarity index 89% rename from PuReMD-GPU/src/GMRES.h rename to PuReMD-GPU/src/lin_alg.h index 5f9dc46bcd853ec1b2036c995b9bfc0675fd635c..a515a959494a6eca40fe9f338d2a08118ff3e39a 100644 --- a/PuReMD-GPU/src/GMRES.h +++ b/PuReMD-GPU/src/lin_alg.h @@ -18,19 +18,17 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ -#ifndef __GMRES_H_ -#define __GMRES_H_ +#ifndef __LIN_ALG_H_ +#define __LIN_ALG_H_ #define SIGN(x) (x < 0.0 ? -1 : 1); #include "mytypes.h" + int GMRES( static_storage*, sparse_matrix*, real*, real, real*, FILE* , reax_system* ); -int Cuda_GMRES( static_storage *, real *b, real tol, real *x ); -int Cublas_GMRES( reax_system *, static_storage *, real *b, real tol, real *x ); - int GMRES_HouseHolder( static_storage*, sparse_matrix*, real*, real, real*, FILE* , reax_system* ); @@ -46,4 +44,5 @@ int CG( static_storage*, sparse_matrix*, int uyduruk_GMRES( static_storage*, sparse_matrix*, real*, real, real*, int, FILE*, reax_system* ); + #endif diff --git a/PuReMD-GPU/src/list.c b/PuReMD-GPU/src/list.c new file mode 100644 index 0000000000000000000000000000000000000000..c6f0e55ebad4fc59c07f253a1d216d3242115aff --- /dev/null +++ b/PuReMD-GPU/src/list.c @@ -0,0 +1,146 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "list.h" + + +char Make_List(int n, int num_intrs, int type, list* l) +{ + char success=1; + + l->n = n; + l->num_intrs = num_intrs; + + l->index = (int*) malloc( n * sizeof(int) ); + l->end_index = (int*) malloc( n * sizeof(int) ); + + if (l->index == NULL) success = 0; + if (l->end_index == NULL) success = 0; + + l->type = type; + + switch(type) + { + case TYP_VOID: + l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); + if (l->select.v == NULL) success = 0; + break; + + case TYP_THREE_BODY: + l->select.three_body_list = (three_body_interaction_data*) + malloc(l->num_intrs*sizeof(three_body_interaction_data)); + if (l->select.three_body_list == NULL) success = 0; + break; + + case TYP_BOND: + l->select.bond_list = (bond_data*) + malloc(l->num_intrs * sizeof(bond_data)); + if (l->select.bond_list == NULL) success = 0; + break; + + case TYP_DBO: + l->select.dbo_list = (dbond_data*) + malloc(l->num_intrs * sizeof(dbond_data)); + if (l->select.dbo_list == NULL) success = 0; + break; + + case TYP_DDELTA: + l->select.dDelta_list = (dDelta_data*) + malloc(l->num_intrs*sizeof(dDelta_data)); + if (l->select.dDelta_list == NULL) success = 0; + break; + + case TYP_FAR_NEIGHBOR: + l->select.far_nbr_list = (far_neighbor_data*) + malloc(l->num_intrs*sizeof(far_neighbor_data)); + if (l->select.far_nbr_list == NULL) success = 0; + break; + + case TYP_NEAR_NEIGHBOR: + l->select.near_nbr_list = (near_neighbor_data*) + malloc(l->num_intrs*sizeof(near_neighbor_data)); + if (l->select.near_nbr_list == NULL) success = 0; + break; + + case TYP_HBOND: + l->select.hbond_list = (hbond_data*) + malloc( l->num_intrs * sizeof(hbond_data) ); + if (l->select.hbond_list == NULL) success = 0; + break; + + default: + l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); + if (l->select.v == NULL) success = 0; + l->type = TYP_VOID; + break; + } + + return success; +} + + +void Delete_List(list* l) +{ + if( l->index != NULL ) + free(l->index); + if( l->end_index != NULL ) + free(l->end_index); + + switch(l->type) + { + case TYP_VOID: + if( l->select.v != NULL ) + free(l->select.v); + break; + case TYP_THREE_BODY: + if( l->select.three_body_list != NULL ) + free(l->select.three_body_list); + break; + case TYP_BOND: + if( l->select.bond_list != NULL ) + free(l->select.bond_list); + break; + case TYP_DBO: + if( l->select.dbo_list != NULL ) + free(l->select.dbo_list); + break; + case TYP_DDELTA: + if( l->select.dDelta_list != NULL ) + free(l->select.dDelta_list); + break; + case TYP_FAR_NEIGHBOR: + if( l->select.far_nbr_list != NULL ) + free(l->select.far_nbr_list); + break; + case TYP_NEAR_NEIGHBOR: + if( l->select.near_nbr_list != NULL ) + free(l->select.near_nbr_list); + break; + case TYP_HBOND: + if( l->select.hbond_list != NULL ) + free(l->select.hbond_list); + break; + + default: + // Report fatal error + break; + } +} + diff --git a/PuReMD-GPU/src/list.cu b/PuReMD-GPU/src/list.cu deleted file mode 100644 index 095409aa4fdc96102a5e321d1511737e37371eef..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/list.cu +++ /dev/null @@ -1,235 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU 
General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. - ----------------------------------------------------------------------*/ - -#include "list.h" -#include "cuda_utils.h" - -HOST char Make_List(int n, int num_intrs, int type, list* l, int proc) -{ - char success=1; - - if (proc == TYP_HOST) { - - l->n = n; - l->num_intrs = num_intrs; - - l->index = (int*) malloc( n * sizeof(int) ); - l->end_index = (int*) malloc( n * sizeof(int) ); - - if (l->index == NULL) success = 0; - if (l->end_index == NULL) success = 0; - - l->type = type; - - switch(type) - { - case TYP_VOID: - l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); - if (l->select.v == NULL) success = 0; - break; - - case TYP_THREE_BODY: - l->select.three_body_list = (three_body_interaction_data*) - malloc(l->num_intrs*sizeof(three_body_interaction_data)); - if (l->select.three_body_list == NULL) success = 0; - break; - - case TYP_BOND: - l->select.bond_list = (bond_data*) - malloc(l->num_intrs * sizeof(bond_data)); - if (l->select.bond_list == NULL) success = 0; - break; - - case TYP_DBO: - l->select.dbo_list = (dbond_data*) - malloc(l->num_intrs * sizeof(dbond_data)); - if (l->select.dbo_list == NULL) success = 0; - break; - - case TYP_DDELTA: - l->select.dDelta_list = (dDelta_data*) - malloc(l->num_intrs*sizeof(dDelta_data)); - if (l->select.dDelta_list == NULL) success = 0; - break; - - case TYP_FAR_NEIGHBOR: - l->select.far_nbr_list = (far_neighbor_data*) - malloc(l->num_intrs*sizeof(far_neighbor_data)); - if (l->select.far_nbr_list == NULL) success = 0; - break; - - case TYP_NEAR_NEIGHBOR: - l->select.near_nbr_list = (near_neighbor_data*) - malloc(l->num_intrs*sizeof(near_neighbor_data)); - if (l->select.near_nbr_list == NULL) success = 0; - break; - - case TYP_HBOND: - l->select.hbond_list = (hbond_data*) - malloc( l->num_intrs * sizeof(hbond_data) ); - if (l->select.hbond_list == NULL) success = 0; - break; - - default: - l->select.v = (void *) malloc(l->num_intrs*sizeof(void)); - if (l->select.v == NULL) success = 0; - l->type = TYP_VOID; - break; - } - - } - else - { - l->n = n; - l->num_intrs = num_intrs; - - cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX ); - cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX ); - - switch(type) - { - case TYP_FAR_NEIGHBOR: - cuda_malloc ((void **) &l->select.far_nbr_list, - l->num_intrs*sizeof(far_neighbor_data), - 1, LIST_FAR_NEIGHBOR_DATA); - /* - cudaHostAlloc ((void **) &l->select.far_nbr_list, - l->num_intrs*sizeof(far_neighbor_data), - cudaHostAllocMapped); - - cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, - (void *)l->select.far_nbr_list, 0); - */ - break; - - case TYP_HBOND: - cuda_malloc ((void **) &l->select.hbond_list, - l->num_intrs * sizeof(hbond_data), - 1, LIST_HBOND_DATA ); - break; - - case TYP_BOND: - cuda_malloc ((void **) &l->select.bond_list, - l->num_intrs * sizeof(bond_data), - 1, LIST_BOND_DATA ); - break; - - case TYP_THREE_BODY: - cuda_malloc ( (void **) &l->select.three_body_list, - l->num_intrs * sizeof(three_body_interaction_data), - 1, LIST_THREE_BODY_DATA ); - break; - - default: - fprintf (stderr, "Unknown list creation \n" ); - exit 
(1); - } - } - - return success; -} - - -HOST void Delete_List(list* l, int type) -{ - - if (type == TYP_HOST ) - { - if( l->index != NULL ) - free(l->index); - if( l->end_index != NULL ) - free(l->end_index); - - switch(l->type) - { - case TYP_VOID: - if( l->select.v != NULL ) - free(l->select.v); - break; - case TYP_THREE_BODY: - if( l->select.three_body_list != NULL ) - free(l->select.three_body_list); - break; - case TYP_BOND: - if( l->select.bond_list != NULL ) - free(l->select.bond_list); - break; - case TYP_DBO: - if( l->select.dbo_list != NULL ) - free(l->select.dbo_list); - break; - case TYP_DDELTA: - if( l->select.dDelta_list != NULL ) - free(l->select.dDelta_list); - break; - case TYP_FAR_NEIGHBOR: - if( l->select.far_nbr_list != NULL ) - free(l->select.far_nbr_list); - break; - case TYP_NEAR_NEIGHBOR: - if( l->select.near_nbr_list != NULL ) - free(l->select.near_nbr_list); - break; - case TYP_HBOND: - if( l->select.hbond_list != NULL ) - free(l->select.hbond_list); - break; - - default: - // Report fatal error - break; - } - } - else - { - if (l->index != NULL) - cuda_free (l->index, LIST_INDEX ); - if (l->end_index != NULL) - cuda_free (l->end_index, LIST_END_INDEX ); - - switch(type) - { - case TYP_FAR_NEIGHBOR: - if (l->select.far_nbr_list != NULL) - cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA); - break; - - case TYP_HBOND: - if (l->select.hbond_list != NULL) - cuda_free (l->select.hbond_list, LIST_HBOND_DATA ); - break; - - case TYP_BOND: - if (l->select.bond_list != NULL) - cuda_free (l->select.bond_list, LIST_BOND_DATA ); - break; - - case TYP_THREE_BODY: - if (l->select.three_body_list != NULL) - cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA ); - break; - - default: - fprintf (stderr, "Unknown list deletion \n" ); - exit (1); - } - } -} - diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h index f341c2e270912e57beee4867bca9927a5c905633..b90c41419271ca6b859be08ea4005fbe9107c029 100644 --- a/PuReMD-GPU/src/list.h +++ b/PuReMD-GPU/src/list.h @@ -23,31 +23,36 @@ #include "mytypes.h" -HOST char Make_List( int, int, int, list* , int proc = TYP_HOST); -HOST void Delete_List( list* , int proc = TYP_HOST); +char Make_List( int, int, int, list* ); +void Delete_List( list* ); -inline HOST_DEVICE int Num_Entries(int i, list* l) + +static inline HOST_DEVICE int Num_Entries(int i, list* l) { return l->end_index[i] - l->index[i]; } -inline HOST_DEVICE int Start_Index(int i, list *l ) + +static inline HOST_DEVICE int Start_Index(int i, list *l ) { return l->index[i]; } -inline HOST_DEVICE int End_Index( int i, list *l ) + +static inline HOST_DEVICE int End_Index( int i, list *l ) { return l->end_index[i]; } -inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l) + +static inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l) { l->index[i] = val; } -inline HOST_DEVICE void Set_End_Index(int i, int val, list *l) + +static inline HOST_DEVICE void Set_End_Index(int i, int val, list *l) { l->end_index[i] = val; } diff --git a/PuReMD-GPU/src/lookup.c b/PuReMD-GPU/src/lookup.c new file mode 100644 index 0000000000000000000000000000000000000000..c439709dc09c77775ed716a39db797fa8c831585 --- /dev/null +++ b/PuReMD-GPU/src/lookup.c @@ -0,0 +1,406 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free 
software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. + ----------------------------------------------------------------------*/ + +#include "lookup.h" + +#include "two_body_interactions.h" + +#include "index_utils.h" + + +void Make_Lookup_Table(real xmin, real xmax, int n, + lookup_function f, lookup_table* t) +{ + int i; + + t->xmin = xmin; + t->xmax = xmax; + t->n = n; + t->dx = (xmax - xmin)/(n-1); + t->inv_dx = 1.0 / t->dx; + t->a = (n-1)/(xmax-xmin); + t->y = (real*) malloc(n*sizeof(real)); + + for(i=0; i < n; i++) + t->y[i] = f(i*t->dx + t->xmin); + + // //fprintf(stdout,"dx = %lf\n",t->dx); + // for(i=0; i < n; i++) + // //fprintf( stdout,"%d %lf %lf %lf\n", + // i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) ); +} + + +/* Fills solution into x. Warning: will modify c and d! */ +void Tridiagonal_Solve( const real *a, const real *b, + real *c, real *d, real *x, unsigned int n){ + int i; + real id; + + /* Modify the coefficients. */ + c[0] /= b[0]; /* Division by zero risk. */ + d[0] /= b[0]; /* Division by zero would imply a singular matrix. */ + for(i = 1; i < n; i++){ + id = (b[i] - c[i-1] * a[i]); /* Division by zero risk. */ + c[i] /= id; /* Last value calculated is redundant. */ + d[i] = (d[i] - d[i-1] * a[i])/id; + } + + /* Now back substitute. */ + x[n - 1] = d[n - 1]; + for(i = n - 2; i >= 0; i--) + x[i] = d[i] - c[i] * x[i + 1]; +} + + +void Natural_Cubic_Spline( const real *h, const real *f, + cubic_spline_coef *coef, unsigned int n ) +{ + int i; + real *a, *b, *c, *d, *v; + + /* allocate space for the linear system */ + a = (real*) malloc( n * sizeof(real) ); + b = (real*) malloc( n * sizeof(real) ); + c = (real*) malloc( n * sizeof(real) ); + d = (real*) malloc( n * sizeof(real) ); + v = (real*) malloc( n * sizeof(real) ); + + /* build the linear system */ + a[0] = a[1] = a[n-1] = 0; + for( i = 2; i < n-1; ++i ) + a[i] = h[i-1]; + + b[0] = b[n-1] = 0; + for( i = 1; i < n-1; ++i ) + b[i] = 2 * (h[i-1] + h[i]); + + c[0] = c[n-2] = c[n-1] = 0; + for( i = 1; i < n-2; ++i ) + c[i] = h[i]; + + d[0] = d[n-1] = 0; + for( i = 1; i < n-1; ++i ) + d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]); + + /*//fprintf( stderr, "i a b c d\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f\n", i, a[i], b[i], c[i], d[i] );*/ + v[0] = 0; + v[n-1] = 0; + Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 ); + + for( i = 1; i < n; ++i ){ + coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]); + coef[i-1].c = v[i]/2; + coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6; + coef[i-1].a = f[i]; + } + + /*//fprintf( stderr, "i v coef\n" ); + for( i = 0; i < n; ++i ) + //fprintf( stderr, "%d %f %f %f %f %f\n", + i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */ +} + + +void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast, + cubic_spline_coef *coef, unsigned int n ) +{ + int i; + real *a, *b, *c, *d, *v; + + /* allocate space for the linear system */ + a = (real*) malloc( n * sizeof(real) ); + b = (real*) malloc( n * sizeof(real) ); + c = (real*) malloc( n * sizeof(real) ); + d = 
(real*) malloc( n * sizeof(real) );
+    v = (real*) malloc( n * sizeof(real) );
+
+    /* build the linear system */
+    a[0] = 0;
+    for( i = 1; i < n; ++i )
+        a[i] = h[i-1];
+
+    /* clamped ends: the first and last diagonal entries involve only the
+       boundary intervals, so the last row must not read past h[n-2] */
+    b[0] = 2*h[0];
+    for( i = 1; i < n-1; ++i )
+        b[i] = 2 * (h[i-1] + h[i]);
+    b[n-1] = 2*h[n-2];
+
+    c[n-1] = 0;
+    for( i = 0; i < n-1; ++i )
+        c[i] = h[i];
+
+    d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;
+    d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2])/h[n-2];
+    for( i = 1; i < n-1; ++i )
+        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+
+    /*//fprintf( stderr, "i  a  b  c  d\n" );
+    for( i = 0; i < n; ++i )
+      //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+    Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
+    // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
+
+    for( i = 1; i < n; ++i ){
+        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
+        coef[i-1].c = v[i]/2;
+        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
+        coef[i-1].a = f[i];
+    }
+
+    /*//fprintf( stderr, "i  v  coef\n" );
+    for( i = 0; i < n; ++i )
+      //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n",
+      i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+}
+
+
+void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
+{
+    int i;
+    real base, dif;
+
+    i = (int)(r * t->inv_dx);
+    if( i == 0 )
+        ++i;
+    base = (real)(i+1) * t->dx;
+    dif = r - base;
+    ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
+
+    /* each table holds one cubic spline per segment, evaluated via Horner's rule */
+    y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif +
+        t->vdW[i].a;
+    y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif +
+        t->CEvd[i].b)*dif + t->CEvd[i].a;
+    //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b;
+
+    y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif +
+        t->ele[i].a;
+    y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif +
+        t->CEclmb[i].a;
+
+    y->H = y->e_ele * EV_to_KCALpMOL / C_ele;
+    //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a;
+}
+
+
+void Make_LR_Lookup_Table( reax_system *system, control_params *control )
+{
+    int i, j, r;
+    int num_atom_types;
+    int existing_types[MAX_ATOM_TYPES];
+    real dr;
+    real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
+    real v0_vdw, v0_ele, vlast_vdw, vlast_ele;
+    /* real rand_dist;
+    real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr;
+    real eele_abserr, eele_relerr, fele_abserr, fele_relerr;
+    real evdw_maxerr, eele_maxerr;
+    LR_data y, y_spline; */
+
+    /* initializations */
+    vlast_ele = 0;
+    vlast_vdw = 0;
+    v0_ele = 0;
+    v0_vdw = 0;
+
+    num_atom_types = system->reaxprm.num_atom_types;
+    dr = control->r_cut / control->tabulate;
+    h = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fh = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fele = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+
+    /* allocate Long-Range LookUp Table space based on
+       number of atom types in the ffield file */
+    //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
+    //for( i = 0; i < num_atom_types; ++i )
+    //    LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
+
+    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
+
+    /* most atom types in ffield file will not exist in the current
+       simulation. 
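(a quick note on the addressing used for this flattened table, since index_lr()
       lives in index_utils.h and is not shown in this diff; assuming it is the usual
       row-major map,

           // assumed layout, illustrative only: row-major offset of pair (i,j)
           static inline int index_lr( int i, int j, int num_atom_types )
           {
               return i * num_atom_types + j;
           }

       LR[ index_lr(i,j,num_atom_types) ] below selects the spline set for the type
       pair (i,j), and only the j >= i triangle of the table gets populated.)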
to avoid unnecessary lookup table space, determine + the atom types that exist in the current simulation */ + for( i = 0; i < MAX_ATOM_TYPES; ++i ) + existing_types[i] = 0; + for( i = 0; i < system->N; ++i ) + existing_types[ system->atoms[i].type ] = 1; + + /* fill in the lookup table entries for existing atom types. + only lower half should be enough. */ + for( i = 0; i < num_atom_types; ++i ) + if( existing_types[i] ) + for( j = i; j < num_atom_types; ++j ) + if( existing_types[j] ) { + LR[ index_lr (i,j,num_atom_types) ].xmin = 0; + LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut; + LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1; + LR[ index_lr (i,j,num_atom_types) ].dx = dr; + LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut; + LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data)); + LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) + malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef)); + + for( r = 1; r <= control->tabulate; ++r ) { + LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) ); + h[r] = LR[ index_lr (i,j,num_atom_types) ].dx; + fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H; + fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW; + fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; + fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele; + fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; + + if( r == 1 ){ + v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; + v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; + } + else if( r == control->tabulate ){ + vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd; + vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb; + } + } + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fh" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fh[r] ); */ + Natural_Cubic_Spline( &h[1], &fh[1], + &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 ); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fvdw" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fvdw[r] ); + //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw ); + */ + Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, + &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 ); + Natural_Cubic_Spline( &h[1], &fCEvd[1], + &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 ); + + /*//fprintf( stderr, "%-6s %-6s %-6s\n", "r", "h", "fele" ); + for( r = 1; r <= control->tabulate; ++r ) + //fprintf( stderr, "%f %f %f\n", r * dr, h[r], fele[r] ); + //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele ); + */ + Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, + &(LR[ index_lr (i,j,num_atom_types) 
].ele[1]), control->tabulate+1 ); + Natural_Cubic_Spline( &h[1], &fCEclmb[1], + &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 ); + } + + /***** //test LR-Lookup table + evdw_maxerr = 0; + eele_maxerr = 0; + for( i = 0; i < num_atom_types; ++i ) + if( existing_types[i] ) + for( j = i; j < num_atom_types; ++j ) + if( existing_types[j] ) { + for( r = 1; r <= 100; ++r ) { + rand_dist = (real)rand()/RAND_MAX * control->r_cut; + LR_vdW_Coulomb( system, control, i, j, rand_dist, &y ); + LR_Lookup( &(LR[i][j]), rand_dist, &y_spline ); + + evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW); + evdw_relerr = fabs(evdw_abserr / y.e_vdW); + fvdw_abserr = fabs(y.CEvd - y_spline.CEvd); + fvdw_relerr = fabs(fvdw_abserr / y.CEvd); + eele_abserr = fabs(y.e_ele - y_spline.e_ele); + eele_relerr = fabs(eele_abserr / y.e_ele); + fele_abserr = fabs(y.CEclmb - y_spline.CEclmb); + fele_relerr = fabs(fele_abserr / y.CEclmb); + + if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){ + //fprintf( stderr, "rand_dist = %24.15e\n", rand_dist ); + //fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", + y.H, y_spline.H, + fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) ); + + //fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", + y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr ); + //fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", + y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr ); + + //fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", + y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr ); + //fprintf( stderr, "%24.15e %24.15e %24.15e %24.15e\n", + y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr ); + } + + if( evdw_relerr > evdw_maxerr ) + evdw_maxerr = evdw_relerr; + if( eele_relerr > eele_maxerr ) + eele_maxerr = eele_relerr; + } + } + //fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr ); + //fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr ); + *******/ + + free(h); + free(fh); + free(fvdw); + free(fCEvd); + free(fele); + free(fCEclmb); +} + + +int Lookup_Index_Of( real x, lookup_table* t ) +{ + return (int)( t->a * ( x - t->xmin ) ); +} + + +real Lookup( real x, lookup_table* t ) +{ + real x1, x2; + real b; + int i; + + /* if ( x < t->xmin) + { + //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x); + exit(0); + } + if ( x > t->xmax) + { + //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x); + exit(0); + } */ + + i = Lookup_Index_Of( x, t ); + x1 = i * t->dx + t->xmin; + x2 = (i+1) * t->dx + t->xmin; + + b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx; + // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n", + // i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x)); + + return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b; +} diff --git a/PuReMD-GPU/src/lookup.h b/PuReMD-GPU/src/lookup.h index 9ea972f8eeaa07e307516149658c07bfc68f37f4..7dac3e4f41764caa53cf0e2313f8c566f4e0e634 100644 --- a/PuReMD-GPU/src/lookup.h +++ b/PuReMD-GPU/src/lookup.h @@ -23,14 +23,20 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Make_Lookup_Table( real, real, int, lookup_function, lookup_table* ); int Lookup_Index_Of( real, lookup_table* ); real Lookup( real, lookup_table* ); void Make_LR_Lookup_Table( reax_system*, control_params* ); -//CUDA Functions -void Cuda_Make_LR_Lookup_Table( reax_system*, control_params* ); -void copy_LR_table_to_device ( reax_system*, control_params* ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/matvec.cu b/PuReMD-GPU/src/matvec.cu deleted file mode 100644 index 
bf08cdf83dc8e14e31b48a6eaac41b5bfa8cf97e..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/matvec.cu +++ /dev/null @@ -1,89 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. - ----------------------------------------------------------------------*/ - - -#include "matvec.h" - -//one thread per row -GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows) -{ - real results_row = 0; - int col; - real val; - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= rows) return; - - for (int c = H.start[i]; c < H.end[i]; c++) - { - col = H.entries [c].j; - val = H.entries[c].val; - - results_row += val * vec [col]; - } - - results [i] = results_row; -} - -//32 thread warp per matrix row. -//invoked as follows -// <<< system->N, 32 >>> -GLOBAL void Cuda_Matvec_csr (sparse_matrix H, real *vec, real *results, int num_rows) -{ - extern __shared__ real vals []; - int thread_id = blockDim.x * blockIdx.x + threadIdx.x; - int warp_id = thread_id / 32; - int lane = thread_id & (32 - 1); - - int row_start; - int row_end; - - // one warp per row - //int row = warp_id; - int row = warp_id; - //if (row < num_rows) - { - vals[threadIdx.x] = 0; - - if (row < num_rows) { - row_start = H.start[row]; - row_end = H.end[row]; - - // compute running sum per thread - for(int jj = row_start + lane; jj < row_end; jj += 32) - vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ]; - //vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ]; - } - - __syncthreads (); - - // parallel reduction in shared memory - //SIMD instructions with a WARP are synchronous -- so we do not need to synch here - if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads(); - if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads (); - if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads (); - if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads (); - if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads (); - - // first thread writes the result - if (lane == 0 && row < num_rows) - results[row] = vals[threadIdx.x]; - } -} diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h index c7d42ee97e386aae84f4f9d9bc51d2767d866f29..273f95976159c55461a84668c4a5f2494f1d9522 100644 --- a/PuReMD-GPU/src/mytypes.h +++ b/PuReMD-GPU/src/mytypes.h @@ -18,8 +18,42 @@ <http://www.gnu.org/licenses/>. 
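   (a note on the guard rework below: mytypes.h is included from both .c and .cu
   translation units, so it now keeps one include-guard pair per front end and maps
   the CUDA qualifiers to nothing for the host compiler. A minimal model of the
   pattern, illustrative only and not part of the original header:

       // the same header compiles under nvcc and under a plain C compiler
       #ifdef __CUDACC__
         #define HOST_DEVICE __host__ __device__
       #else
         #define HOST_DEVICE
       #endif

       HOST_DEVICE static int square( int x )
       {
           return x * x;   // callable from host code, and from device code under nvcc
       }

   )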
----------------------------------------------------------------------*/ -#ifndef __MYTYPES_H_ -#define __MYTYPES_H_ +#if !(defined(__MYTYPES_H_) || defined(__CUDA_MYTYPES_H_)) + +#ifdef __CUDACC__ + #ifndef __CUDA_MYTYPES_H_ + #define __CUDA_MYTYPES_H_ + #define HOST __host__ + #define DEVICE __device__ + #define GLOBAL __global__ + #define HOST_DEVICE __host__ __device__ + + #include <cuda_runtime.h> + #include <cuda.h> + #include <cuda_runtime_api.h> + + #include <cublas_v2.h> + #include <cusparse_v2.h> + #if __CUDA_ARCH__ < 600 + #define MYATOMICADD myAtomicAdd + #else + #define MYATOMICADD atomicAdd + #endif + #endif +#else + #ifndef __MYTYPES_H_ + #define __MYTYPES_H_ + #define HOST + #define DEVICE + #define GLOBAL + #define HOST_DEVICE + #endif +#endif + +#if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_)) + #define __CONFIG_H_ + #include "config.h" +#endif #include "math.h" //#include "random.h" @@ -30,28 +64,16 @@ #include "time.h" #include "zlib.h" - //#define DEBUG_FOCUS //#define TEST_FORCES //#define TEST_ENERGY //#define REORDER_ATOMS // turns on nbrgen opt by re-ordering atoms //#define LGJ -#ifdef __USE_GPU__ - -#include "cublas_v2.h" -#include "cusparse_v2.h" - -#define HOST __host__ -#define DEVICE __device__ -#define GLOBAL __global__ -#define HOST_DEVICE __host__ __device__ -#else -#define HOST -#define DEVICE -#define GLOBAL -#define HOST_DEVICE -#endif +#define SUCCESS 1 +#define FAILURE 0 +#define TRUE 1 +#define FALSE 0 #define EXP exp #define SQRT sqrt @@ -140,10 +162,10 @@ #define RES_GRID_MARK 0x03 #define RES_GRID_START 0x04 #define RES_GRID_END 0x05 -#define RES_GRID_NBRS 0x06 -#define RES_GRID_NBRS_CP 0x07 +#define RES_GRID_NBRS 0x06 +#define RES_GRID_NBRS_CP 0x07 -#define RES_SYSTEM_ATOMS 0x10 +#define RES_SYSTEM_ATOMS 0x10 #define RES_SYSTEM_SIMULATION_BOX 0x11 #define RES_REAX_INT_SBP 0x20 @@ -154,58 +176,58 @@ #define RES_SIMULATION_DATA 0x30 -#define RES_STORAGE 0x401 -#define RES_STORAGE_HBOND_INDEX 0x402 -#define RES_STORAGE_TOTAL_BOND_ORDER 0x403 -#define RES_STORAGE_DELTAP 0x404 -#define RES_STORAGE_DELTAP_BOC 0x404 -#define RES_STORAGE_DDELTAP_SELF 0x405 -#define RES_STORAGE_DELTA 0x406 -#define RES_STORAGE_DELTA_LP 0x407 -#define RES_STORAGE_DELTA_LP_TEMP 0x408 -#define RES_STORAGE_DDELTA_LP 0x409 -#define RES_STORAGE_DDELTA_LP_TEMP 0x40A -#define RES_STORAGE_DELTA_E 0x40B -#define RES_STORAGE_DELTA_BOC 0x40C -#define RES_STORAGE_NL 0x40D -#define RES_STORAGE_NLP_TEMP 0x40E -#define RES_STORAGE_CLP 0x40F -#define RES_STORAGE_CDDELTA 0x410 -#define RES_STORAGE_VLPEX 0x411 -#define RES_STORAGE_DROPTOL 0x412 -#define RES_STORAGE_W 0x413 -#define RES_STORAGE_HDIA_INV 0x414 -#define RES_STORAGE_B 0x415 -#define RES_STORAGE_B_S 0x416 -#define RES_STORAGE_B_T 0x417 -#define RES_STORAGE_B_PRC 0x418 -#define RES_STORAGE_B_PRM 0x419 -#define RES_STORAGE_S_T 0x41A -#define RES_STORAGE_S 0x41B -#define RES_STORAGE_T 0x41C -#define RES_STORAGE_Y 0x41D -#define RES_STORAGE_Z 0x41E -#define RES_STORAGE_G 0x41F -#define RES_STORAGE_HS 0x420 -#define RES_STORAGE_HC 0x421 -#define RES_STORAGE_RN 0x422 -#define RES_STORAGE_V 0x423 -#define RES_STORAGE_H 0x424 -#define RES_STORAGE_R 0x425 -#define RES_STORAGE_D 0x426 -#define RES_STORAGE_Q 0x427 -#define RES_STORAGE_P 0x428 -#define RES_STORAGE_A 0x429 -#define RES_STORAGE_F_OLD 0x42A -#define RES_STORAGE_V_CONST 0x42B -#define RES_STORAGE_MARK 0x42C -#define RES_STORAGE_OLD_MARK 0x42D -#define RES_STORAGE_X_OLD 0x42E -#define RES_STORAGE_NLP 0x42F -#define RES_STORAGE_MAP_SERIALS 0x430 -#define 
RES_STORAGE_RESTRICTED 0x431 -#define RES_STORAGE_RESTRICTED_LIST 0x432 -#define RES_STORAGE_ORIG_ID 0x433 +#define RES_STORAGE 0x401 +#define RES_STORAGE_HBOND_INDEX 0x402 +#define RES_STORAGE_TOTAL_BOND_ORDER 0x403 +#define RES_STORAGE_DELTAP 0x404 +#define RES_STORAGE_DELTAP_BOC 0x404 +#define RES_STORAGE_DDELTAP_SELF 0x405 +#define RES_STORAGE_DELTA 0x406 +#define RES_STORAGE_DELTA_LP 0x407 +#define RES_STORAGE_DELTA_LP_TEMP 0x408 +#define RES_STORAGE_DDELTA_LP 0x409 +#define RES_STORAGE_DDELTA_LP_TEMP 0x40A +#define RES_STORAGE_DELTA_E 0x40B +#define RES_STORAGE_DELTA_BOC 0x40C +#define RES_STORAGE_NL 0x40D +#define RES_STORAGE_NLP_TEMP 0x40E +#define RES_STORAGE_CLP 0x40F +#define RES_STORAGE_CDDELTA 0x410 +#define RES_STORAGE_VLPEX 0x411 +#define RES_STORAGE_DROPTOL 0x412 +#define RES_STORAGE_W 0x413 +#define RES_STORAGE_HDIA_INV 0x414 +#define RES_STORAGE_B 0x415 +#define RES_STORAGE_B_S 0x416 +#define RES_STORAGE_B_T 0x417 +#define RES_STORAGE_B_PRC 0x418 +#define RES_STORAGE_B_PRM 0x419 +#define RES_STORAGE_S_T 0x41A +#define RES_STORAGE_S 0x41B +#define RES_STORAGE_T 0x41C +#define RES_STORAGE_Y 0x41D +#define RES_STORAGE_Z 0x41E +#define RES_STORAGE_G 0x41F +#define RES_STORAGE_HS 0x420 +#define RES_STORAGE_HC 0x421 +#define RES_STORAGE_RN 0x422 +#define RES_STORAGE_V 0x423 +#define RES_STORAGE_H 0x424 +#define RES_STORAGE_R 0x425 +#define RES_STORAGE_D 0x426 +#define RES_STORAGE_Q 0x427 +#define RES_STORAGE_P 0x428 +#define RES_STORAGE_A 0x429 +#define RES_STORAGE_F_OLD 0x42A +#define RES_STORAGE_V_CONST 0x42B +#define RES_STORAGE_MARK 0x42C +#define RES_STORAGE_OLD_MARK 0x42D +#define RES_STORAGE_X_OLD 0x42E +#define RES_STORAGE_NLP 0x42F +#define RES_STORAGE_MAP_SERIALS 0x430 +#define RES_STORAGE_RESTRICTED 0x431 +#define RES_STORAGE_RESTRICTED_LIST 0x432 +#define RES_STORAGE_ORIG_ID 0x433 #define RES_CONTROL_PARAMS 0x50 @@ -224,7 +246,6 @@ #define RES_SCRATCH 0x90 - #define LIST_INDEX 0x00 #define LIST_END_INDEX 0x01 #define LIST_FAR_NEIGHBOR_DATA 0x10 @@ -288,9 +309,6 @@ #define MATVEC_THREADS_PER_ROW 32 - -enum {TYP_HOST, TYP_DEVICE}; - typedef double real; typedef real rvec[3]; typedef int ivec[3]; @@ -309,7 +327,6 @@ enum {WRITE_ASCII, WRITE_BINARY, RF_N}; enum {XYZ, PDB, BGF, ASCII_RESTART, BINARY_RESTART, GF_N}; - /* Global params mapping */ /* l[0] = p_boc1 @@ -352,7 +369,6 @@ l[36] = N/A l[37] = version number l[38] = p_coa3 */ - typedef struct { int n_global; @@ -361,7 +377,6 @@ typedef struct } global_parameters; - typedef struct { /* Line one in field file */ @@ -405,7 +420,6 @@ typedef struct } single_body_parameters; - /* Two Body Parameters */ typedef struct { @@ -435,7 +449,6 @@ typedef struct } two_body_parameters; - /* 3-body parameters */ typedef struct { @@ -458,7 +471,6 @@ typedef struct } three_body_header; - /* hydrogen-bond parameters */ typedef struct { @@ -466,7 +478,6 @@ typedef struct } hbond_parameters; - /* 4-body parameters */ typedef struct { @@ -560,7 +571,6 @@ typedef struct int *end; ivec *nbrs; rvec *nbrs_cp; - } grid; @@ -768,8 +778,6 @@ typedef struct reax_timing timing; //CUDA reax_timing d_timing; - - void *d_simulation_data; } simulation_data; @@ -837,6 +845,7 @@ typedef struct rvec dBO, dBOpi, dBOpi2; } dbond_data; + typedef struct { real BO, BO_s, BO_pi, BO_pi2; @@ -847,6 +856,7 @@ typedef struct rvec dBOp, dln_BOp_s, dln_BOp_pi, dln_BOp_pi2; } bond_order_data; + typedef struct { int nbr; @@ -886,6 +896,7 @@ typedef struct real val; } sparse_matrix_entry; + typedef struct { int n, m; @@ -914,6 +925,7 @@ typedef struct int 
gcell_atoms; } reallocate_data; + typedef struct { /* bond order related storage */ @@ -999,7 +1011,6 @@ typedef struct } list; - typedef struct { FILE *trj; @@ -1070,12 +1081,12 @@ typedef struct } LR_data; - typedef struct { real a, b, c, d; } cubic_spline_coef; + typedef struct { real xmin, xmax; @@ -1126,8 +1137,7 @@ typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*, int*); -// CUDA structures -// +/* CUDA structures */ extern list *dev_lists; extern static_storage *dev_workspace; extern LR_lookup_table *d_LR; @@ -1138,15 +1148,5 @@ extern void *scratch; extern int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE; extern int MATVEC_BLOCKS; -#ifdef __USE_GPU__ -extern cublasStatus_t cublasStatus; -extern cublasHandle_t cublasHandle; - -extern cusparseHandle_t cusparseHandle; -extern cusparseStatus_t cusparseStatus; -extern cusparseMatDescr_t matdescriptor; -#endif - - #endif diff --git a/PuReMD-GPU/src/neighbors.c b/PuReMD-GPU/src/neighbors.c new file mode 100644 index 0000000000000000000000000000000000000000..5f425e672080d2d4a272f7aca1859c45d8dde17d --- /dev/null +++ b/PuReMD-GPU/src/neighbors.c @@ -0,0 +1,698 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
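   (a hand-worked check of the minimum-image logic in Are_Far_Neighbors below,
   assuming an orthogonal box: with box_norms[0] = 10.0, x1[0] = 1.0 and
   x2[0] = 9.5, the raw separation d = 8.5 gives SQR(d) = 72.25 >= SQR(10.0/2.0),
   so the code wraps it to d - 10.0 = -1.5 and records rel_box[0] = -1; the pair
   is then treated as 1.5 apart through the periodic boundary rather than 8.5
   apart inside the box.)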
+ ----------------------------------------------------------------------*/ + +#include "neighbors.h" + +#include "box.h" +#include "grid.h" +#include "index_utils.h" +#include "list.h" +#include "reset_utils.h" +#include "system_props.h" +#include "vector.h" + + +int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, + real cutoff, far_neighbor_data *data ) +{ + real norm_sqr, d, tmp; + int i; + + norm_sqr = 0; + + for( i = 0; i < 3; i++ ) { + d = x2[i] - x1[i]; + tmp = SQR(d); + + if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) { + if( x2[i] > x1[i] ) { + d -= box->box_norms[i]; + data->rel_box[i] = -1; + } + else { + d += box->box_norms[i]; + data->rel_box[i] = +1; + } + + data->dvec[i] = d; + norm_sqr += SQR(d); + } + else { + data->dvec[i] = d; + norm_sqr += tmp; + data->rel_box[i] = 0; + } + } + + if( norm_sqr <= SQR(cutoff) ){ + data->d = sqrt(norm_sqr); + return 1; + } + + return 0; +} + + +void Generate_Neighbor_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, k, l, m, itr; + int x, y, z; + int atom1, atom2, max; + int num_far; + int *nbr_atoms; + ivec *nbrs; + rvec *nbrs_cp; + grid *g; + list *far_nbrs; + far_neighbor_data *nbr_data; + real t_start, t_elapsed; + + // fprintf( stderr, "\n\tentered nbrs - " ); + g = &( system->g ); + far_nbrs = (*lists) + FAR_NBRS; + Bin_Atoms( system, workspace ); + + t_start = Get_Time( ); + + // fprintf( stderr, "atoms sorted - " ); + num_far = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ]; + nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ]; + //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + /* pick up an atom from the current cell */ + for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){ + atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ]; + Set_Start_Index( atom1, num_far, far_nbrs ); + //fprintf( stderr, "\tatom %d\n", atom1 ); + + itr = 0; + while( nbrs[itr][0] >= 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); + + if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + SQR(control->vlist_cut) ) { + nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ]; + max = g->top[ index_grid_3d (x,y,z,g) ]; + //fprintf( stderr, "\t\tmax: %d\n", max ); + + /* pick up another atom from the neighbor cell */ + for( m = 0; m < max; ++m ) { + atom2 = nbr_atoms[m]; + if( atom1 > atom2 ) { + nbr_data = &(far_nbrs->select.far_nbr_list[num_far]); + if(Are_Far_Neighbors(system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control->vlist_cut, + nbr_data)) { + nbr_data->nbr = atom2; + + ++num_far; + } + } + } + } + + ++itr; + } + + Set_End_Index( atom1, num_far, far_nbrs ); + //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", + // atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs), + // itr); + } + } + + fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far); + + if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) { + workspace->realloc.num_far = num_far; + if( num_far > far_nbrs->num_intrs ){ + fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d", + data->step, num_far, far_nbrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + } + + t_elapsed = Get_Timing_Info( t_start ); + data->timing.nbrs += t_elapsed; + +#if defined(DEBUG) + for( i = 0; i 
< system->N; ++i ) { + qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), + Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), + compare_far_nbrs ); + } +#endif + +#if defined(DEBUG_FOCUS) + //fprintf( stderr, "nbrs - "); + //fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); +#endif + +#if defined(TEST_ENERGY) + //Print_Far_Neighbors( system, control, workspace, lists ); +#endif +} + + +int Estimate_NumNeighbors( reax_system *system, control_params *control, + static_storage *workspace, list **lists ) +{ + int i, j, k, l, m, itr; + int x, y, z; + int atom1, atom2, max; + int num_far; + int *nbr_atoms; + ivec *nbrs; + rvec *nbrs_cp; + grid *g; + far_neighbor_data nbr_data; + + int start = 0, finish = 0; + + // fprintf( stderr, "\n\tentered nbrs - " ); + g = &( system->g ); + Bin_Atoms( system, workspace ); + // fprintf( stderr, "atoms sorted - " ); + num_far = 0; + g->max_cuda_nbrs = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ]; + nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ]; + //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + /* pick up an atom from the current cell */ + for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){ + atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ]; + start = num_far; + + itr = 0; + while( nbrs[itr][0] >= 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); + + if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + SQR(control->vlist_cut) ) { + nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ]; + max = g->top[index_grid_3d (x,y,z,g) ]; + //fprintf( stderr, "\t\tmax: %d\n", max ); + + /* pick up another atom from the neighbor cell - + we have to compare atom1 with its own periodic images as well, + that's why there is also equality in the if stmt below */ + for( m = 0; m < max; ++m ) { + atom2 = nbr_atoms[m]; + //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) { + if( atom1 > atom2 ) { + if(Are_Far_Neighbors(system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control->vlist_cut, + &nbr_data)) + ++num_far; + } + } + } + + ++itr; + } + + // finish note + finish = num_far; + if (g->max_cuda_nbrs <= (finish - start)){ + g->max_cuda_nbrs = finish - start; + } + } + } + +#if defined(DEBUG_FOCUS) + fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far ); +#endif + + return num_far * SAFE_ZONE; +} + + +//Code not used anymore +#if defined DONE + +void Choose_Neighbor_Finder( reax_system *system, control_params *control, + get_far_neighbors_function *Get_Far_Neighbors ) +{ + if( control->periodic_boundaries ) + { + if( system->box.box_norms[0] > 2.0 * control->vlist_cut && + system->box.box_norms[1] > 2.0 * control->vlist_cut && + system->box.box_norms[2] > 2.0 * control->vlist_cut ) + (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box; + else (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box; + } + else + (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors; +} + + +int compare_near_nbrs(const void *v1, const void *v2) +{ + return ((*(near_neighbor_data *)v1).nbr - (*(near_neighbor_data *)v2).nbr); +} + + +int compare_far_nbrs(const void *v1, const void *v2) +{ + return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr); +} + + +inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C, + rvec dvec, ivec rel_box/*, rvec 
ext_factor*/ ) +{ + dest->nbr = nbr; + dest->d = d; + rvec_Scale( dest->dvec, C, dvec ); + ivec_Copy( dest->rel_box, rel_box ); + // rvec_Scale( dest->ext_factor, C, ext_factor ); +} + + +inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C, + rvec dvec, ivec rel_box/*, rvec ext_factor*/) +{ + dest->nbr = nbr; + dest->d = d; + rvec_Scale( dest->dvec, C, dvec ); + ivec_Scale( dest->rel_box, C, rel_box ); + // rvec_Scale( dest->ext_factor, C, ext_factor ); +} + + +/* In case bond restrictions are applied, this method checks if + atom1 and atom2 are allowed to bond with each other */ +inline int can_Bond( static_storage *workspace, int atom1, int atom2 ) +{ + int i; + + // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 ); + + if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] ) + return 1; + + for( i = 0; i < workspace->restricted[ atom1 ]; ++i ) + if( workspace->restricted_list[ atom1 ][i] == atom2 ) + return 1; + + for( i = 0; i < workspace->restricted[ atom2 ]; ++i ) + if( workspace->restricted_list[ atom2 ][i] == atom1 ) + return 1; + + return 0; +} + + +/* check if atom2 is on atom1's near neighbor list */ +inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 ) +{ + int i; + + for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i ) + if( near_nbrs->select.near_nbr_list[i].nbr == atom2 ) + { + // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 ); + return 1; + } + + return 0; +} + +void Generate_Neighbor_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, k; + int x, y, z; + int *nbr_atoms; + int atom1, atom2, max; + int num_far; + int c, count; + int grid_top; + grid *g = &( system->g ); + list *far_nbrs = (*lists) + FAR_NBRS; + //int hb_type1, hb_type2; + //list *hbonds = (*lists) + HBOND; + //int top_hbond1, top_hbond2; + get_far_neighbors_function Get_Far_Neighbors; + far_neighbor_data new_nbrs[125]; +#ifndef REORDER_ATOMS + int l, m; +#endif + + // fprintf( stderr, "\n\tentered nbrs - " ); + if( control->ensemble == iNPT || control->ensemble == sNPT || + control->ensemble == NPT ) + Update_Grid( system ); + // fprintf( stderr, "grid updated - " ); + + Bin_Atoms( system, out_control ); + // fprintf( stderr, "atoms sorted - " ); + +#ifdef REORDER_ATOMS + Cluster_Atoms( system, workspace ); + // fprintf( stderr, "atoms clustered - " ); +#endif + + Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); + // fprintf( stderr, "function chosen - " ); + + Reset_Neighbor_Lists( system, workspace, lists ); + // fprintf( stderr, "lists cleared - " ); + + num_far = 0; + num_near = 0; + c = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = g->nbrs[i][j][k]; + nbrs_cp = g->nbrs_cp[i][j][k]; + + /* pick up an atom from the current cell */ + //#ifdef REORDER_ATOMS + // for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++) + //#else + for(l = 0; l < g->top[i][j][k]; ++l ){ + atom1 = g->atoms[i][j][k][l]; + Set_End_Index( atom1, num_far, far_nbrs ); + // fprintf( stderr, "atom %d:\n", atom1 ); + + itr = 0; + while( nbrs[itr][0] > 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + + // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + // SQR(control->r_cut)) + nbr_atoms = g->atoms[x][y][z]; + max_atoms = g->top[x][y][z]; + + /* pick up 
another atom from the neighbor cell - + we have to compare atom1 with its own periodic images as well, + that's why there is also equality in the if stmt below */ + //#ifdef REORDER_ATOMS + //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++) + //#else + for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) + if( atom1 >= atom2 ) { + //fprintf( stderr, "\tatom2 %d", atom2 ); + //top_near1 = End_Index( atom1, near_nbrs ); + //Set_Start_Index( atom1, num_far, far_nbrs ); + //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond; + Get_Far_Neighbors( system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control, new_nbrs, &count ); + fprintf( stderr, "\t%d count:%d\n", atom2, count ); + + for( c = 0; c < count; ++c ) + if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ + Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), + atom2, new_nbrs[c].d, 1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + ++num_far; + + /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", + atom1, atom2, new_nbrs[c].d, + new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], + new_nbrs[c].dvec[2] ); */ + + + /* hydrogen bond lists */ + /*if( control->hb_cut > 0.1 && + new_nbrs[c].d <= control->hb_cut ) { + // fprintf( stderr, "%d %d\n", atom1, atom2 ); + hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond; + if( hb_type1 == 1 && hb_type2 == 2 ) { + top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds); + Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]), + atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec, + new_nbrs[c].rel_box ); + Set_End_Index( workspace->hbond_index[atom1], + top_hbond1 + 1, hbonds ); + } + else if( hb_type1 == 2 && hb_type2 == 1 ) { + top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds ); + Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]), + atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, + new_nbrs[c].rel_box ); + Set_End_Index( workspace->hbond_index[atom2], + top_hbond2 + 1, hbonds ); + }*/ + } + } + } + + Set_End_Index( atom1, top_far1, far_nbrs ); + } + } + + fprintf( stderr, "nbrs done-" ); + + /* apply restrictions on near neighbors only */ + if( (data->step - data->prev_steps) < control->restrict_bonds ) { + for( atom1 = 0; atom1 < system->N; ++atom1 ) + if( workspace->restricted[ atom1 ] ) { + // fprintf( stderr, "atom1: %d\n", atom1 ); + + top_near1 = End_Index( atom1, near_nbrs ); + + for( j = 0; j < workspace->restricted[ atom1 ]; ++j ) + if(!is_Near_Neighbor(near_nbrs, atom1, + atom2 = workspace->restricted_list[atom1][j])) { + fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n", + atom1, atom2 ); + + top_near2 = End_Index( atom2, near_nbrs ); + + /* we just would like to get the nearest image, so a call to + Get_Periodic_Far_Neighbors_Big_Box is good enough. 
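       (The step left implicit here: when every box edge exceeds twice the cutoff,
       at most one periodic image of a neighbor can lie within range, so the
       nearest image is unique and the big-box finder can neither miss nor
       duplicate a pair; this is exactly the condition Choose_Neighbor_Finder
       tests above before selecting Get_Periodic_Far_Neighbors_Big_Box.)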
*/ + Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, + system->atoms[ atom2 ].x, + &(system->box), control, + new_nbrs, &count ); + + Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]), + atom2, new_nbrs[c].d, 1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + ++top_near1; + + Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]), + atom1, new_nbrs[c].d, -1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + Set_End_Index( atom2, top_near2+1, near_nbrs ); + } + + Set_End_Index( atom1, top_near1, near_nbrs ); + } + } + // fprintf( stderr, "restrictions applied-" ); + + + /* verify nbrlists, count num_intrs, sort nearnbrs */ + near_nbrs->num_intrs = 0; + far_nbrs->num_intrs = 0; + for( i = 0; i < system->N-1; ++i ) { + if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) { + fprintf( stderr, + "step%3d: nearnbr list of atom%d is overwritten by atom%d\n", + data->step, i+1, i ); + exit( 1 ); + } + + near_nbrs->num_intrs += Num_Entries(i, near_nbrs); + + if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) { + fprintf( stderr, + "step%3d: farnbr list of atom%d is overwritten by atom%d\n", + data->step, i+1, i ); + exit( 1 ); + } + + far_nbrs->num_intrs += Num_Entries(i, far_nbrs); + } + + for( i = 0; i < system->N; ++i ) { + qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]), + Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), + compare_near_nbrs ); + } + // fprintf( stderr, "near nbrs sorted\n" ); + +#ifdef TEST_ENERGY + /* for( i = 0; i < system->N; ++i ) { + qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), + Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), + compare_far_nbrs ); + } */ + + fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", + num_near / system->N ); + fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", + num_far / system->N, control->max_far_nbrs ); +#endif + + //fprintf( stderr, "step%d: num of nearnbrs = %6d num of farnbrs: %6d\n", + // data->step, num_near, num_far ); + + //fprintf( stderr, "\talloc nearnbrs = %6d alloc farnbrs: %6d\n", + // system->N * near_nbrs->intrs_per_unit, + // system->N * far_nbrs->intrs_per_unit ); +} + + +void Generate_Neighbor_Lists( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, k, l, m, itr; + int x, y, z; + int atom1, atom2, max; + int num_far, c, count; + int *nbr_atoms; + ivec *nbrs; + rvec *nbrs_cp; + grid *g; + list *far_nbrs; + get_far_neighbors_function Get_Far_Neighbors; + far_neighbor_data new_nbrs[125]; + + g = &( system->g ); + far_nbrs = (*lists) + FAR_NBRS; + + // fprintf( stderr, "\n\tentered nbrs - " ); + if( control->ensemble == iNPT || + control->ensemble == sNPT || + control->ensemble == NPT ) + Update_Grid( system ); + // fprintf( stderr, "grid updated - " ); + + Bin_Atoms( system, out_control ); + // fprintf( stderr, "atoms sorted - " ); + Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); + // fprintf( stderr, "function chosen - " ); + Reset_Neighbor_Lists( system, workspace, lists ); + // fprintf( stderr, "lists cleared - " ); + + num_far = 0; + c = 0; + + /* first pick up a cell in the grid */ + for( i = 0; i < g->ncell[0]; i++ ) + for( j = 0; j < g->ncell[1]; j++ ) + for( k = 0; k < g->ncell[2]; k++ ) { + nbrs = g->nbrs[i][j][k]; + nbrs_cp = g->nbrs_cp[i][j][k]; + fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); + + /* pick up an atom from the current cell */ + for(l = 0; l 
< g->top[i][j][k]; ++l ){ + atom1 = g->atoms[i][j][k][l]; + Set_Start_Index( atom1, num_far, far_nbrs ); + fprintf( stderr, "\tatom %d\n", atom1 ); + + itr = 0; + while( nbrs[itr][0] > 0 ){ + x = nbrs[itr][0]; + y = nbrs[itr][1]; + z = nbrs[itr][2]; + fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); + + // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= + // SQR(control->r_cut)) + nbr_atoms = g->atoms[x][y][z]; + max = g->top[x][y][z]; + fprintf( stderr, "\t\tmax: %d\n", max ); + + + /* pick up another atom from the neighbor cell - + we have to compare atom1 with its own periodic images as well, + that's why there is also equality in the if stmt below */ + for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) + if( atom1 >= atom2 ) { + Get_Far_Neighbors( system->atoms[atom1].x, + system->atoms[atom2].x, + &(system->box), control, new_nbrs, &count ); + fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count ); + + for( c = 0; c < count; ++c ) + if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ + Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), + atom2, new_nbrs[c].d, 1.0, + new_nbrs[c].dvec, new_nbrs[c].rel_box ); + ++num_far; + + /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", + atom1, atom2, new_nbrs[c].d, + new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], + new_nbrs[c].dvec[2] ); */ + } + } + + ++itr; + } + + Set_End_Index( atom1, num_far, far_nbrs ); + } + } + + far_nbrs->num_intrs = num_far; + fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); + +#if defined(DEBUG) + for( i = 0; i < system->N; ++i ) { + qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), + Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), + compare_far_nbrs ); + } + + fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far ); + fprintf( stderr, "\tallocated farnbrs: %6d\n", + system->N * far_nbrs->intrs_per_unit ); +#endif +} + + +#endif diff --git a/PuReMD-GPU/src/neighbors.cu b/PuReMD-GPU/src/neighbors.cu deleted file mode 100644 index 90779538353a05eca7bb04215ebd33b3ffd81e35..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/neighbors.cu +++ /dev/null @@ -1,1413 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. 
- ----------------------------------------------------------------------*/ - -#include "neighbors.h" -#include "box.h" -#include "grid.h" -#include "list.h" -#include "reset_utils.h" -#include "system_props.h" -#include "vector.h" -#include "index_utils.h" -#include "cuda_utils.h" - -extern inline DEVICE int index_grid (int blocksize) -{ - return blockIdx.x * gridDim.y * gridDim.z * blocksize + - blockIdx.y * gridDim.z * blocksize + - blockIdx.z * blocksize ; -} - -extern inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize) -{ - return x * 8 * 8 * blocksize + - y * 8 * blocksize + - z * blocksize ; -} - -inline HOST_DEVICE real DistSqr_to_CP( rvec cp, rvec x ) -{ - int i; - real d_sqr = 0; - - for( i = 0; i < 3; ++i ) - if( cp[i] > NEG_INF ) - d_sqr += SQR( cp[i] - x[i] ); - - return d_sqr; -} - -HOST_DEVICE int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, - real cutoff, far_neighbor_data *data ) -{ - real norm_sqr, d, tmp; - int i; - - norm_sqr = 0; - - for( i = 0; i < 3; i++ ) { - d = x2[i] - x1[i]; - tmp = SQR(d); - - if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) { - if( x2[i] > x1[i] ) { - d -= box->box_norms[i]; - data->rel_box[i] = -1; - } - else { - d += box->box_norms[i]; - data->rel_box[i] = +1; - } - - data->dvec[i] = d; - norm_sqr += SQR(d); - } - else { - data->dvec[i] = d; - norm_sqr += tmp; - data->rel_box[i] = 0; - } - } - - if( norm_sqr <= SQR(cutoff) ){ - data->d = sqrt(norm_sqr); - return 1; - } - - return 0; -} - -void Generate_Neighbor_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, j, k, l, m, itr; - int x, y, z; - int atom1, atom2, max; - int num_far; - int *nbr_atoms; - ivec *nbrs; - rvec *nbrs_cp; - grid *g; - list *far_nbrs; - far_neighbor_data *nbr_data; - real t_start, t_elapsed; - - // fprintf( stderr, "\n\tentered nbrs - " ); - g = &( system->g ); - far_nbrs = (*lists) + FAR_NBRS; - Bin_Atoms( system, workspace ); - - t_start = Get_Time( ); - - // fprintf( stderr, "atoms sorted - " ); - num_far = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ]; - nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ]; - //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - /* pick up an atom from the current cell */ - for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){ - atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ]; - Set_Start_Index( atom1, num_far, far_nbrs ); - //fprintf( stderr, "\tatom %d\n", atom1 ); - - itr = 0; - while( nbrs[itr][0] >= 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); - - if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - SQR(control->vlist_cut) ) { - nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ]; - max = g->top[ index_grid_3d (x,y,z,g) ]; - //fprintf( stderr, "\t\tmax: %d\n", max ); - - /* pick up another atom from the neighbor cell */ - for( m = 0; m < max; ++m ) { - atom2 = nbr_atoms[m]; - if( atom1 > atom2 ) { - nbr_data = &(far_nbrs->select.far_nbr_list[num_far]); - if(Are_Far_Neighbors(system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control->vlist_cut, - nbr_data)) { - nbr_data->nbr = atom2; - - ++num_far; - } - } - } - } - - ++itr; - } - - Set_End_Index( atom1, num_far, far_nbrs ); - //fprintf(stderr, "i:%d, start: %d, 
end: %d - itr: %d\n", - // atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs), - // itr); - } - } - - fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far); - - if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) { - workspace->realloc.num_far = num_far; - if( num_far > far_nbrs->num_intrs ){ - fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d", - data->step, num_far, far_nbrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - } - - t_elapsed = Get_Timing_Info( t_start ); - data->timing.nbrs += t_elapsed; - -#if defined(DEBUG) - for( i = 0; i < system->N; ++i ) { - qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), - Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), - compare_far_nbrs ); - } -#endif -#if defined(DEBUG_FOCUS) - //fprintf( stderr, "nbrs - "); - //fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); -#endif -#if defined(TEST_ENERGY) - //Print_Far_Neighbors( system, control, workspace, lists ); -#endif -} - - -int Estimate_NumNeighbors( reax_system *system, control_params *control, - static_storage *workspace, list **lists ) -{ - int i, j, k, l, m, itr; - int x, y, z; - int atom1, atom2, max; - int num_far; - int *nbr_atoms; - ivec *nbrs; - rvec *nbrs_cp; - grid *g; - far_neighbor_data nbr_data; - - int start = 0, finish = 0; - - // fprintf( stderr, "\n\tentered nbrs - " ); - g = &( system->g ); - Bin_Atoms( system, workspace ); - // fprintf( stderr, "atoms sorted - " ); - num_far = 0; - g->max_cuda_nbrs = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ]; - nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ]; - //fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - /* pick up an atom from the current cell */ - for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){ - atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ]; - start = num_far; - - itr = 0; - while( nbrs[itr][0] >= 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); - - if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - SQR(control->vlist_cut) ) { - nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ]; - max = g->top[index_grid_3d (x,y,z,g) ]; - //fprintf( stderr, "\t\tmax: %d\n", max ); - - /* pick up another atom from the neighbor cell - - we have to compare atom1 with its own periodic images as well, - that's why there is also equality in the if stmt below */ - for( m = 0; m < max; ++m ) { - atom2 = nbr_atoms[m]; - //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) { - if( atom1 > atom2 ) { - if(Are_Far_Neighbors(system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control->vlist_cut, - &nbr_data)) - ++num_far; - } - } - } - - ++itr; - } - - // finish note - finish = num_far; - if (g->max_cuda_nbrs <= (finish - start)){ - g->max_cuda_nbrs = finish - start; - } - } - } - -#if defined(DEBUG_FOCUS) - fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far ); -#endif - return num_far * SAFE_ZONE; - } - - GLOBAL void Estimate_NumNeighbors ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params *control, - int *indices) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - far_neighbor_data nbr_data; - int x, y, z, i; - - if (threadIdx.x >= *(top + index_grid(1))){ - return; - } - - nbrs 
= nbrs + index_grid (g.max_nbrs); - nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); - atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; - - num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - //condition check for cutoff here - if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - for (m = 0; m < max; m++) { - atom2 = nbr_atoms[m]; - - //CHANGE ORIGINAL - /* - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &nbr_data)){ - ++num_far; - } - } - */ - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &nbr_data)){ - ++num_far; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, - control->vlist_cut, &nbr_data)){ - ++num_far; - } - } - //CHANGE ORIGINAL - } - } - ++iter; - } - - //indices[ atom1 ] = num_far;// * SAFE_ZONE; - indices[ atom1 ] = num_far * SAFE_ZONE; - } - - /*One thread per atom Implementation */ - GLOBAL void New_Estimate_NumNeighbors ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - int N, int *indices) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, iter, max, m, num_far; - int x, y, z, i; - int atom_x, atom_y, atom_z; - far_neighbor_data temp; - rvec atom1_x; - - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index > N) return; - - atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]); - atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]); - atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]); - -#ifdef __BNVT_FIX__ - if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; - if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; - if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; -#endif - - nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - atom1 = index; - - rvec_Copy (atom1_x, sys_atoms [atom1].x ); - - num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - for (m = 0; m < max; m++) - { - atom2 = nbr_atoms[m]; - if (atom1 > atom2) { - if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - num_far++; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, - control->vlist_cut, &temp)){ - num_far ++; - } - } - } - } - ++iter; - } - indices [atom1] = num_far * SAFE_ZONE; - } - - - - /*One thread per entry in the gcell implementation */ - GLOBAL void Generate_Neighbor_Lists ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - list far_nbrs) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - int x, y, z, i; - far_neighbor_data *nbr_data; - far_neighbor_data temp; - - if (threadIdx.x >= *(top + index_grid(1))){ - return; - } - - nbrs = nbrs + index_grid (g.max_nbrs); - nbrs_cp = nbrs_cp + index_grid (g.max_nbrs); 
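    /* note on the two adjusted pointers above, a sketch of this (since-deleted)
       kernel's launch scheme of one thread block per grid cell: index_grid(b)
       linearizes (blockIdx.x, blockIdx.y, blockIdx.z) into an offset of b-sized
       records, so nbrs/nbrs_cp now point at this cell's neighbor-cell list and
       closest-point list, and the next line fetches the threadIdx.x-th atom
       binned in the cell */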
- atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x]; - - num_far = Start_Index (atom1, &far_nbrs); - //Set_Start_Index (atom1, 0, &far_nbrs); - //num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - //condition check for cutoff here - if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - for (m = 0; m < max; m++) { - atom2 = nbr_atoms[m]; - - //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] ); - - //CHANGE ORIGINAL - /* - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - - nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - ++num_far; - } - } - */ - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - ++num_far; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, - control->vlist_cut, &temp)){ - nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - ++num_far; - } - } - //CHANGE ORIGINAL - } - } - ++iter; - } - - //end the far_neighbor list here - Set_End_Index (atom1, num_far, &far_nbrs); - } - - - /*One thread per atom Implementation */ - GLOBAL void New_Generate_Neighbor_Lists ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - list far_nbrs, int N) - { - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - int x, y, z, i; - far_neighbor_data *nbr_data, *my_start; - far_neighbor_data temp; - int atom_x, atom_y, atom_z; - rvec atom1_x; - - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index > N) return; - - atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]); - atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]); - atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]); - -#ifdef __BNVT_FIX__ - if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; - if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; - if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; -#endif - - nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - atom1 = index; - - rvec_Copy (atom1_x, sys_atoms [atom1].x ); - - num_far = Start_Index (atom1, &far_nbrs); - my_start = & (far_nbrs.select.far_nbr_list [num_far] ); - - //Set_Start_Index (atom1, 0, 
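/* For orientation: the neighbor lists here are flat arrays carved into
 * per-atom [start, end) windows, and Start_Index / Set_End_Index manage
 * those windows. A hedged sketch of their likely shape; the real
 * definitions live in list.h, and the field names are inferred from the
 * far_nbrs->index and far_nbrs->end_index copies later in this patch. */
    static inline int Start_Index( int i, list *l )
    {
        return l->index[i];        /* first slot of atom i's window */
    }

    static inline void Set_End_Index( int i, int end, list *l )
    {
        l->end_index[i] = end;     /* one past atom i's last entry */
    }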
&far_nbrs); - //num_far = 0; - iter = 0; - - while (nbrs[iter][0] >= 0) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - //condition check for cutoff here - //if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= - if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - for (m = 0; m < max; m++) - { - atom2 = nbr_atoms[m]; - if (atom1 > atom2) { - if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)){ - //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data = my_start; - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - num_far++; - my_start ++; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, - control->vlist_cut, &temp)){ - //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] ); - nbr_data = my_start; - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - num_far ++; - my_start ++; - } - } - //CHANGE ORIGINAL - } - } - ++iter; - } - - //end the far_neighbor list here - Set_End_Index (atom1, num_far, &far_nbrs); - } - - /*Multiple threads per atom Implementation */ - GLOBAL void Test_Generate_Neighbor_Lists ( reax_atom *sys_atoms, - grid g, - simulation_box *box, - control_params* control, - list far_nbrs, int N ) - { - - extern __shared__ int __nbr[]; - extern __shared__ int __sofar []; - bool nbrgen; - - int __THREADS_PER_ATOM__ = NBRS_THREADS_PER_ATOM; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warp_id = thread_id / __THREADS_PER_ATOM__; - int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); - int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; - - if (warp_id >= N ) return; - - int *tnbr = __nbr; - //int *nbrssofar = __nbr + __THREADS_PER_ATOM__; - int *nbrssofar = __nbr + blockDim.x; - - int *atoms = g.atoms; - int *top = g.top; - ivec *nbrs = g.nbrs; - rvec *nbrs_cp = g.nbrs_cp; - - int *nbr_atoms; - int atom1, atom2, l, iter, max, m, num_far; - int leader = -10; - int x, y, z, i; - far_neighbor_data *nbr_data, *my_start; - far_neighbor_data temp; - int atom_x, atom_y, atom_z; - - - atom1 = warp_id; - atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]); - atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]); - atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]); - -#ifdef __BNVT_FIX__ - if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1; - if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1; - if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1; -#endif - - nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); - - num_far = Start_Index (atom1, &far_nbrs); - my_start = & (far_nbrs.select.far_nbr_list [num_far] ); - - iter = 0; - tnbr[threadIdx.x] = 0; - - if (lane_id == 0) { - //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0; - nbrssofar [my_bucket] = 0; - } - - __syncthreads (); - - while ((nbrs[iter][0] >= 0)) { - x = nbrs[iter][0]; - y = nbrs[iter][1]; - z = nbrs[iter][2]; - - 
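/* The multiple-threads-per-atom kernel here assigns T = NBRS_THREADS_PER_ATOM
 * consecutive threads to one atom and compacts their hits with a group-local
 * inclusive prefix sum over 0/1 flags. A hedged fragment sketch of both
 * pieces: the `&` mask assumes T is a power of two, and the scan-then-read
 * sequence relies on warp-synchronous execution, so on Volta-and-later GPUs
 * an explicit __syncwarp() would be required between the two. */
    /* group decomposition: one atom per group of T threads */
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int atom      = thread_id / T;         /* == warp_id in the kernel */
    int lane      = thread_id & (T - 1);   /* == lane_id; needs T = 2^k */

    /* compaction: the leader turns the group's 0/1 flags into an inclusive
     * scan, so tnbr[tid] becomes each hit's 1-based offset this round;
     * base = my_bucket * T */
    for ( l = 1; l < T; ++l )
        tnbr[base + l] += tnbr[base + l - 1];
    /* a thread with a hit writes at my_start + nbrssofar[my_bucket]
     * + tnbr[threadIdx.x] - 1, and the leader then adds
     * tnbr[base + T - 1] to nbrssofar[my_bucket] */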
tnbr[threadIdx.x] = 0; - nbrgen = false; - - if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= - SQR (control->vlist_cut)) - { - nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); - max = top [index_grid_3d(x, y, z, &g)]; - - tnbr[threadIdx.x] = 0; - nbrgen = false; - m = lane_id ; //0-31 - int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1); - int iterations = 0; - //while (m < max) - while (iterations < loopcount) - { - tnbr [threadIdx.x] = 0; - nbrgen = false; - - if (m < max) { - atom2 = nbr_atoms[m]; - if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, - control->vlist_cut, &temp)) - { - tnbr [threadIdx.x] = 1; - nbrgen = true; - } - } - else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, - control->vlist_cut, &temp)){ - tnbr [threadIdx.x] = 1; - nbrgen = true; - } - } - } - - if (nbrgen) - { - //do leader selection here - leader = -1; - //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) - for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) - if (tnbr[l]){ - leader = l; - break; - } - - //do the reduction; - if (threadIdx.x == leader) - for (l = 1; l < __THREADS_PER_ATOM__; l++) - //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)]; - tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)]; - } - - //__syncthreads (); - //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1); - //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ; - - if (nbrgen) - { - //got the indices - //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1; - nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; - nbr_data->nbr = atom2; - nbr_data->rel_box[0] = temp.rel_box[0]; - nbr_data->rel_box[1] = temp.rel_box[1]; - nbr_data->rel_box[2] = temp.rel_box[2]; - - nbr_data->d = temp.d; - nbr_data->dvec[0] = temp.dvec[0]; - nbr_data->dvec[1] = temp.dvec[1]; - nbr_data->dvec[2] = temp.dvec[2]; - - if (threadIdx.x == leader) - //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; - nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; - } - - m += __THREADS_PER_ATOM__; - iterations ++; - - //cleanup - nbrgen = false; - tnbr [threadIdx.x] = 0; - } - } - ++iter; - } - - __syncthreads (); - - //end the far_neighbor list here - if (lane_id == 0) - Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs); - //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs); - } - - void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, bool estimate) - { - real t_start, t_elapsed; - real t_1, t_2; - - list *far_nbrs = dev_lists + FAR_NBRS; - - int *d_indices = (int *) scratch; - int *nbrs_start, *nbrs_end; - int i, max_nbrs = 0; - int nbs; - - t_start = Get_Time (); - - Cuda_Bin_Atoms (system, workspace); - Cuda_Bin_Atoms_Sync ( system ); - - if (dev_workspace->realloc.estimate_nbrs > -1) { - - /*reset the re-neighbor condition */ - dev_workspace->realloc.estimate_nbrs = -1; - - //#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Recomputing the neighbors estimate.... 
\n"); - //#endif - cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH ); - /* - dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]); - dim3 threadsperblock (system->g.max_atoms); - - Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, d_indices); - cudaThreadSynchronize (); - cudaCheckError (); - */ - nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); - New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> - ( system->d_atoms, system->d_g, - system->d_box, (control_params *)control->d_control, - system->N, d_indices); - cudaThreadSynchronize (); - cudaCheckError (); - - - int *nbrs_indices = NULL; - nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) ); - if (nbrs_indices == NULL) - { - fprintf (stderr, "Malloc failed for nbrs indices .... \n"); - exit (1); - } - memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); - - copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - for (int i = 1; i <= system->N; i++) - nbrs_indices [i] += nbrs_indices [i-1]; - - copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); - copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); - - free (nbrs_indices); - } - - /* - One thread per atom Implementation - Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, *far_nbrs); - */ - nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + - (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); - - /* Multiple threads per atom Implementation */ - Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, - INT_SIZE * (NBRS_BLOCK_SIZE+ NBRS_BLOCK_SIZE/NBRS_THREADS_PER_ATOM) >>> - (system->d_atoms, system->d_g, system->d_box, - (control_params *)control->d_control, *far_nbrs, system->N ); - cudaThreadSynchronize (); - cudaCheckError (); - - t_elapsed = Get_Timing_Info (t_start); - d_timing.nbrs += t_elapsed; - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed); -#endif - - /*validate neighbors list*/ - nbrs_start = (int *) calloc (system->N, INT_SIZE); - nbrs_end = (int *) calloc (system->N, INT_SIZE); - - copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); - copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); - - int device_nbrs = 0; - for(i = 0; i < system->N; i++) - { - if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs) - max_nbrs = nbrs_end[i] - nbrs_start[i]; - - device_nbrs += nbrs_end[i] - nbrs_start[i]; - } -#ifdef __CUDA_TEST__ - //fprintf (stderr, " New Device count is : %d \n", device_nbrs); - //dev_workspace->realloc.num_far = device_nbrs; -#endif - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs ); - fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs); -#endif - - //validate check here - //get the num_far from the list here - for (i = 0; i < system->N-1; i++) - { - if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE ) - { - dev_workspace->realloc.num_far = device_nbrs; - //#ifdef __CUDA_MEM__ - //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); - //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", - // i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]); - //#endif - } - - if (nbrs_end[i] > nbrs_start[i+1]) { - fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d", - nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]); - exit( INSUFFICIENT_SPACE ); - } - } - - if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) { - dev_workspace->realloc.num_far = device_nbrs; - //#ifdef __CUDA_MEM__ - //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far); - //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n" - // , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs); - //#endif - } - if (nbrs_end[i] > far_nbrs->num_intrs) { - fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d", - nbrs_end[i], far_nbrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - - free (nbrs_start); - free (nbrs_end); - } - - //Code not used anymore -#if defined DONE - - void Choose_Neighbor_Finder( reax_system *system, control_params *control, - get_far_neighbors_function *Get_Far_Neighbors ) - { - if( control->periodic_boundaries ) - { - if( system->box.box_norms[0] > 2.0 * control->vlist_cut && - system->box.box_norms[1] > 2.0 * control->vlist_cut && - system->box.box_norms[2] > 2.0 * control->vlist_cut ) - (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box; - else (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box; - } - else - (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors; - } - - - int compare_near_nbrs(const void *v1, const void *v2) - { - return ((*(near_neighbor_data 
*)v1).nbr - (*(near_neighbor_data *)v2).nbr); - } - - - int compare_far_nbrs(const void *v1, const void *v2) - { - return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr); - } - - - inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C, - rvec dvec, ivec rel_box/*, rvec ext_factor*/ ) - { - dest->nbr = nbr; - dest->d = d; - rvec_Scale( dest->dvec, C, dvec ); - ivec_Copy( dest->rel_box, rel_box ); - // rvec_Scale( dest->ext_factor, C, ext_factor ); - } - - - inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C, - rvec dvec, ivec rel_box/*, rvec ext_factor*/) - { - dest->nbr = nbr; - dest->d = d; - rvec_Scale( dest->dvec, C, dvec ); - ivec_Scale( dest->rel_box, C, rel_box ); - // rvec_Scale( dest->ext_factor, C, ext_factor ); - } - - - /* In case bond restrictions are applied, this method checks if - atom1 and atom2 are allowed to bond with each other */ - inline int can_Bond( static_storage *workspace, int atom1, int atom2 ) - { - int i; - - // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 ); - - if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] ) - return 1; - - for( i = 0; i < workspace->restricted[ atom1 ]; ++i ) - if( workspace->restricted_list[ atom1 ][i] == atom2 ) - return 1; - - for( i = 0; i < workspace->restricted[ atom2 ]; ++i ) - if( workspace->restricted_list[ atom2 ][i] == atom1 ) - return 1; - - return 0; - } - - - /* check if atom2 is on atom1's near neighbor list */ - inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 ) - { - int i; - - for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i ) - if( near_nbrs->select.near_nbr_list[i].nbr == atom2 ) - { - // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 ); - return 1; - } - - return 0; - } - - void Generate_Neighbor_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, j, k; - int x, y, z; - int *nbr_atoms; - int atom1, atom2, max; - int num_far; - int c, count; - int grid_top; - grid *g = &( system->g ); - list *far_nbrs = (*lists) + FAR_NBRS; - //int hb_type1, hb_type2; - //list *hbonds = (*lists) + HBOND; - //int top_hbond1, top_hbond2; - get_far_neighbors_function Get_Far_Neighbors; - far_neighbor_data new_nbrs[125]; -#ifndef REORDER_ATOMS - int l, m; -#endif - - // fprintf( stderr, "\n\tentered nbrs - " ); - if( control->ensemble == iNPT || control->ensemble == sNPT || - control->ensemble == NPT ) - Update_Grid( system ); - // fprintf( stderr, "grid updated - " ); - - Bin_Atoms( system, out_control ); - // fprintf( stderr, "atoms sorted - " ); - -#ifdef REORDER_ATOMS - Cluster_Atoms( system, workspace ); - // fprintf( stderr, "atoms clustered - " ); -#endif - - Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); - // fprintf( stderr, "function chosen - " ); - - Reset_Neighbor_Lists( system, workspace, lists ); - // fprintf( stderr, "lists cleared - " ); - - num_far = 0; - num_near = 0; - c = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = g->nbrs[i][j][k]; - nbrs_cp = g->nbrs_cp[i][j][k]; - - /* pick up an atom from the current cell */ - //#ifdef REORDER_ATOMS - // for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++) - //#else - for(l = 0; l < g->top[i][j][k]; ++l ){ - atom1 = g->atoms[i][j][k][l]; - Set_End_Index( atom1, num_far, 
far_nbrs ); - // fprintf( stderr, "atom %d:\n", atom1 ); - - itr = 0; - while( nbrs[itr][0] > 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - - // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - // SQR(control->r_cut)) - nbr_atoms = g->atoms[x][y][z]; - max_atoms = g->top[x][y][z]; - - /* pick up another atom from the neighbor cell - - we have to compare atom1 with its own periodic images as well, - that's why there is also equality in the if stmt below */ - //#ifdef REORDER_ATOMS - //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++) - //#else - for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) - if( atom1 >= atom2 ) { - //fprintf( stderr, "\tatom2 %d", atom2 ); - //top_near1 = End_Index( atom1, near_nbrs ); - //Set_Start_Index( atom1, num_far, far_nbrs ); - //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond; - Get_Far_Neighbors( system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control, new_nbrs, &count ); - fprintf( stderr, "\t%d count:%d\n", atom2, count ); - - for( c = 0; c < count; ++c ) - if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ - Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), - atom2, new_nbrs[c].d, 1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - ++num_far; - - /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", - atom1, atom2, new_nbrs[c].d, - new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], - new_nbrs[c].dvec[2] ); */ - - - /* hydrogen bond lists */ - /*if( control->hb_cut > 0.1 && - new_nbrs[c].d <= control->hb_cut ) { - // fprintf( stderr, "%d %d\n", atom1, atom2 ); - hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond; - if( hb_type1 == 1 && hb_type2 == 2 ) { - top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds); - Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]), - atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec, - new_nbrs[c].rel_box ); - Set_End_Index( workspace->hbond_index[atom1], - top_hbond1 + 1, hbonds ); - } - else if( hb_type1 == 2 && hb_type2 == 1 ) { - top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds ); - Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]), - atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, - new_nbrs[c].rel_box ); - Set_End_Index( workspace->hbond_index[atom2], - top_hbond2 + 1, hbonds ); - }*/ - } - } - } - - Set_End_Index( atom1, top_far1, far_nbrs ); - } - } - - - fprintf( stderr, "nbrs done-" ); - - - /* apply restrictions on near neighbors only */ - if( (data->step - data->prev_steps) < control->restrict_bonds ) { - for( atom1 = 0; atom1 < system->N; ++atom1 ) - if( workspace->restricted[ atom1 ] ) { - // fprintf( stderr, "atom1: %d\n", atom1 ); - - top_near1 = End_Index( atom1, near_nbrs ); - - for( j = 0; j < workspace->restricted[ atom1 ]; ++j ) - if(!is_Near_Neighbor(near_nbrs, atom1, - atom2 = workspace->restricted_list[atom1][j])) { - fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n", - atom1, atom2 ); - - top_near2 = End_Index( atom2, near_nbrs ); - - /* we just would like to get the nearest image, so a call to - Get_Periodic_Far_Neighbors_Big_Box is good enough. 
*/ - Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, - system->atoms[ atom2 ].x, - &(system->box), control, - new_nbrs, &count ); - - Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]), - atom2, new_nbrs[c].d, 1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - ++top_near1; - - Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]), - atom1, new_nbrs[c].d, -1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - Set_End_Index( atom2, top_near2+1, near_nbrs ); - } - - Set_End_Index( atom1, top_near1, near_nbrs ); - } - } - // fprintf( stderr, "restrictions applied-" ); - - - /* verify nbrlists, count num_intrs, sort nearnbrs */ - near_nbrs->num_intrs = 0; - far_nbrs->num_intrs = 0; - for( i = 0; i < system->N-1; ++i ) { - if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) { - fprintf( stderr, - "step%3d: nearnbr list of atom%d is overwritten by atom%d\n", - data->step, i+1, i ); - exit( 1 ); - } - - near_nbrs->num_intrs += Num_Entries(i, near_nbrs); - - if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) { - fprintf( stderr, - "step%3d: farnbr list of atom%d is overwritten by atom%d\n", - data->step, i+1, i ); - exit( 1 ); - } - - far_nbrs->num_intrs += Num_Entries(i, far_nbrs); - } - - for( i = 0; i < system->N; ++i ) { - qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]), - Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), - compare_near_nbrs ); - } - // fprintf( stderr, "near nbrs sorted\n" ); - - -#ifdef TEST_ENERGY - /* for( i = 0; i < system->N; ++i ) { - qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), - Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), - compare_far_nbrs ); - } */ - - fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", - num_near / system->N ); - fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", - num_far / system->N, control->max_far_nbrs ); -#endif - - //fprintf( stderr, "step%d: num of nearnbrs = %6d num of farnbrs: %6d\n", - // data->step, num_near, num_far ); - - //fprintf( stderr, "\talloc nearnbrs = %6d alloc farnbrs: %6d\n", - // system->N * near_nbrs->intrs_per_unit, - // system->N * far_nbrs->intrs_per_unit ); - } - - - - void Generate_Neighbor_Lists( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) - { - int i, j, k, l, m, itr; - int x, y, z; - int atom1, atom2, max; - int num_far, c, count; - int *nbr_atoms; - ivec *nbrs; - rvec *nbrs_cp; - grid *g; - list *far_nbrs; - get_far_neighbors_function Get_Far_Neighbors; - far_neighbor_data new_nbrs[125]; - - g = &( system->g ); - far_nbrs = (*lists) + FAR_NBRS; - - // fprintf( stderr, "\n\tentered nbrs - " ); - if( control->ensemble == iNPT || - control->ensemble == sNPT || - control->ensemble == NPT ) - Update_Grid( system ); - // fprintf( stderr, "grid updated - " ); - - Bin_Atoms( system, out_control ); - // fprintf( stderr, "atoms sorted - " ); - Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors ); - // fprintf( stderr, "function chosen - " ); - Reset_Neighbor_Lists( system, workspace, lists ); - // fprintf( stderr, "lists cleared - " ); - - num_far = 0; - c = 0; - - /* first pick up a cell in the grid */ - for( i = 0; i < g->ncell[0]; i++ ) - for( j = 0; j < g->ncell[1]; j++ ) - for( k = 0; k < g->ncell[2]; k++ ) { - nbrs = g->nbrs[i][j][k]; - nbrs_cp = g->nbrs_cp[i][j][k]; - fprintf( stderr, "gridcell %d %d %d\n", i, j, k ); - - /* pick up an atom from the current cell */ - for(l 
= 0; l < g->top[i][j][k]; ++l ){ - atom1 = g->atoms[i][j][k][l]; - Set_Start_Index( atom1, num_far, far_nbrs ); - fprintf( stderr, "\tatom %d\n", atom1 ); - - itr = 0; - while( nbrs[itr][0] > 0 ){ - x = nbrs[itr][0]; - y = nbrs[itr][1]; - z = nbrs[itr][2]; - fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z ); - - // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= - // SQR(control->r_cut)) - nbr_atoms = g->atoms[x][y][z]; - max = g->top[x][y][z]; - fprintf( stderr, "\t\tmax: %d\n", max ); - - - /* pick up another atom from the neighbor cell - - we have to compare atom1 with its own periodic images as well, - that's why there is also equality in the if stmt below */ - for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) - if( atom1 >= atom2 ) { - Get_Far_Neighbors( system->atoms[atom1].x, - system->atoms[atom2].x, - &(system->box), control, new_nbrs, &count ); - fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count ); - - for( c = 0; c < count; ++c ) - if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ - Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]), - atom2, new_nbrs[c].d, 1.0, - new_nbrs[c].dvec, new_nbrs[c].rel_box ); - ++num_far; - - /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n", - atom1, atom2, new_nbrs[c].d, - new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], - new_nbrs[c].dvec[2] ); */ - } - } - - ++itr; - } - - Set_End_Index( atom1, num_far, far_nbrs ); - } - } - - far_nbrs->num_intrs = num_far; - fprintf( stderr, "nbrs done, num_far: %d\n", num_far ); - -#if defined(DEBUG) - for( i = 0; i < system->N; ++i ) { - qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), - Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), - compare_far_nbrs ); - } - - fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far ); - fprintf( stderr, "\tallocated farnbrs: %6d\n", - system->N * far_nbrs->intrs_per_unit ); -#endif - } - - - -#endif diff --git a/PuReMD-GPU/src/neighbors.h b/PuReMD-GPU/src/neighbors.h index 465d61de9775bdc20130d5f69f537031aaa98ff8..64c14ad29d5194006aacb057a7d80ef54aeee8e4 100644 --- a/PuReMD-GPU/src/neighbors.h +++ b/PuReMD-GPU/src/neighbors.h @@ -23,25 +23,35 @@ #include "mytypes.h" + void Generate_Neighbor_Lists( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); -void Cuda_Generate_Neighbor_Lists (reax_system *system, - static_storage *workspace, control_params *control, bool); + static_storage*, list**, output_controls* ); int Estimate_NumNeighbors( reax_system*, control_params*, - static_storage*, list** ); + static_storage*, list** ); + +int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* ); + + +static inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize) +{ + return x * 8 * 8 * blocksize + + y * 8 * blocksize + + z * blocksize ; +} + -HOST_DEVICE int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* ); +static inline HOST_DEVICE real DistSqr_to_CP( rvec cp, rvec x ) +{ + int i; + real d_sqr = 0; -GLOBAL void Estimate_NumNeighbors ( reax_atom *, grid , simulation_box *, control_params *, int *); -GLOBAL void Generate_Neighbor_Lists( reax_atom *, grid , simulation_box *, control_params *, list ); + for( i = 0; i < 3; ++i ) + if( cp[i] > NEG_INF ) + d_sqr += SQR( cp[i] - x[i] ); -GLOBAL void Estimate_NumNeighbors ( reax_atom *, - grid , - simulation_box *, - control_params *, - int *, int *, int, int , int, int); -GLOBAL void fix_sym_indices_far_nbrs (list , int ); + return 
d_sqr;
+}

 #endif
diff --git a/PuReMD-GPU/src/param.h b/PuReMD-GPU/src/param.h
index f8101896932a4f06a53d3d99c6b18a5be1710078..2b24b056983233840966a8de29ce902ca6beb981 100644
--- a/PuReMD-GPU/src/param.h
+++ b/PuReMD-GPU/src/param.h
@@ -27,13 +27,15 @@
 #define MAX_TOKENS 20
 #define MAX_TOKEN_LEN 1024

-int Get_Atom_Type( reax_interaction*, char* );
-int Tokenize( char*, char*** );
+int Get_Atom_Type( reax_interaction*, char* );
+
+int Tokenize( char*, char*** );

 char Read_Force_Field( FILE*, reax_interaction* );

 char Read_Control_File( FILE*, reax_system*, control_params*,
-        output_controls* );
+        output_controls* );
+

 #endif
diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h
index f7edb397293676c6e5a7a7d34ba5d12b8f3dab4a..b19bc58e3dcef04a324b108be718bfbff3e5c06c 100644
--- a/PuReMD-GPU/src/random.h
+++ b/PuReMD-GPU/src/random.h
@@ -23,31 +23,29 @@
 #include "mytypes.h"

-HOST_DEVICE inline double Random(double);
-HOST_DEVICE inline void Randomize();
-HOST_DEVICE inline double GRandom(double , double );
-
 /* The system random number generator uses a linear congruential method
    with large periodicity to generate pseudo-random numbers. Function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-HOST_DEVICE inline double Random(double range)
+static inline HOST_DEVICE double Random(double range)
 {
     return (random() * range) / 2147483647L;
 }

+
 /* This function seeds the system pseudo-random number generator with the
    current time. Use this function once at the beginning to initialize
    the system */
-HOST_DEVICE inline void Randomize()
+static inline HOST_DEVICE void Randomize( )
 {
-    srandom(time(NULL));
+    srandom( time(NULL) );
 }

+
 /* GRandom returns a random number with a Gaussian distribution with the
    given mean and standard deviation "sigma" */
-HOST_DEVICE inline double GRandom(double mean, double sigma)
+static inline HOST_DEVICE double GRandom(double mean, double sigma)
 {
     double v1 = Random(2.0) - 1.0;
     double v2 = Random(2.0) - 1.0;
@@ -63,4 +61,5 @@ HOST_DEVICE inline double GRandom(double mean, double sigma)
     return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
 }

+
 #endif
diff --git a/PuReMD-GPU/src/reset_utils.c b/PuReMD-GPU/src/reset_utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..f79596aa9d29a65f673448d18a28c73c00444e43
--- /dev/null
+++ b/PuReMD-GPU/src/reset_utils.c
@@ -0,0 +1,162 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
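/* The visible pieces of GRandom above match the Marsaglia polar variant of
 * the Box-Muller transform: v1 and v2 are uniform on [-1, 1), pairs outside
 * the unit disk are rejected (that loop sits in the elided middle of the
 * function), and a surviving pair yields a Gaussian deviate. A usage
 * sketch, with illustrative values only: */
    Randomize( );                         /* seed once, at startup */
    double dev = GRandom( 0.0, 1.0 );     /* standard normal draw */
    double val = GRandom( 300.0, 10.0 );  /* mean 300, sigma 10; both
                                             numbers are hypothetical */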
+ ----------------------------------------------------------------------*/ + +#include "reset_utils.h" + +#include "list.h" +#include "vector.h" + + +void Reset_Atoms( reax_system* system ) +{ + int i; + + for( i = 0; i < system->N; ++i ) + memset( system->atoms[i].f, 0.0, RVEC_SIZE ); +} + + +void Reset_Pressures( simulation_data *data ) +{ + rtensor_MakeZero( data->flex_bar.P ); + data->iso_bar.P = 0; + rvec_MakeZero( data->int_press ); + rvec_MakeZero( data->ext_press ); + /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ +} + + +void Reset_Simulation_Data( simulation_data* data ) +{ + data->E_BE = 0; + data->E_Ov = 0; + data->E_Un = 0; + data->E_Lp = 0; + data->E_Ang = 0; + data->E_Pen = 0; + data->E_Coa = 0; + data->E_HB = 0; + data->E_Tor = 0; + data->E_Con = 0; + data->E_vdW = 0; + data->E_Ele = 0; + data->E_Kin = 0; +} + + +#ifdef TEST_FORCES +void Reset_Test_Forces( reax_system *system, static_storage *workspace ) +{ + memset( workspace->f_ele, 0, system->N * sizeof(rvec) ); + memset( workspace->f_vdw, 0, system->N * sizeof(rvec) ); + memset( workspace->f_bo, 0, system->N * sizeof(rvec) ); + memset( workspace->f_be, 0, system->N * sizeof(rvec) ); + memset( workspace->f_lp, 0, system->N * sizeof(rvec) ); + memset( workspace->f_ov, 0, system->N * sizeof(rvec) ); + memset( workspace->f_un, 0, system->N * sizeof(rvec) ); + memset( workspace->f_ang, 0, system->N * sizeof(rvec) ); + memset( workspace->f_coa, 0, system->N * sizeof(rvec) ); + memset( workspace->f_pen, 0, system->N * sizeof(rvec) ); + memset( workspace->f_hb, 0, system->N * sizeof(rvec) ); + memset( workspace->f_tor, 0, system->N * sizeof(rvec) ); + memset( workspace->f_con, 0, system->N * sizeof(rvec) ); +} +#endif + + +void Reset_Workspace( reax_system *system, static_storage *workspace ) +{ + memset( workspace->total_bond_order, 0, system->N * sizeof( real ) ); + memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) ); + + memset( workspace->CdDelta, 0, system->N * sizeof( real ) ); + //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) ); + +#ifdef TEST_FORCES + memset( workspace->dDelta, 0, sizeof(rvec) * system->N ); + Reset_Test_Forces( system, workspace ); +#endif +} + + +void Reset_Neighbor_Lists( reax_system *system, control_params *control, + static_storage *workspace, list **lists ) +{ + int i, tmp; + list *bonds = (*lists) + BONDS; + list *hbonds = (*lists) + HBONDS; + + for( i = 0; i < system->N; ++i ) { + tmp = Start_Index( i, bonds ); + Set_End_Index( i, tmp, bonds ); + } + + //TODO check if this is needed + memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs ); + + if( control->hb_cut > 0 ) + for( i = 0; i < system->N; ++i ) + if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) { + tmp = Start_Index( workspace->hbond_index[i], hbonds ); + Set_End_Index( workspace->hbond_index[i], tmp, hbonds ); + /* fprintf( stderr, "i:%d, hbond: %d-%d\n", + i, Start_Index( workspace->hbond_index[i], hbonds ), + End_Index( workspace->hbond_index[i], hbonds ) );*/ + } +} + + +void Reset( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, list **lists ) +{ + Reset_Atoms( system ); + + Reset_Simulation_Data( data ); + + if( control->ensemble == NPT || control->ensemble == sNPT || + control->ensemble == iNPT ) + Reset_Pressures( data ); + + Reset_Workspace( system, workspace ); + + Reset_Neighbor_Lists( system, control, workspace, lists ); + +#if 
defined(DEBUG_FOCUS) + fprintf( stderr, "reset - "); +#endif +} + + +void Reset_Grid( grid *g ) +{ + memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]); +} + + +void Reset_Marks( grid *g, ivec *grid_stack, int grid_top ) +{ + int i; + + for( i = 0; i < grid_top; ++i ) + g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + + grid_stack[i][1] * g->ncell[2] + + grid_stack[i][2]] = 0; +} diff --git a/PuReMD-GPU/src/reset_utils.h b/PuReMD-GPU/src/reset_utils.h index 7fb318e8e6b9fec7941ea01f1102886aa7425e3e..190bd7f5632f3b0a2b3291edf82e03aa4922793e 100644 --- a/PuReMD-GPU/src/reset_utils.h +++ b/PuReMD-GPU/src/reset_utils.h @@ -23,6 +23,11 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Reset_Atoms( reax_system* ); void Reset_Pressures( simulation_data* ); @@ -36,10 +41,10 @@ void Reset_Test_Forces( reax_system*, static_storage* ); void Reset_Workspace( reax_system*, static_storage* ); void Reset_Neighbor_Lists( reax_system*, control_params*, - static_storage*, list** ); + static_storage*, list** ); void Reset( reax_system*, control_params*, simulation_data*, - static_storage*, list** ); + static_storage*, list** ); //void Reset_Neighbor_Lists( reax_system*, static_storage*, list** ); @@ -47,12 +52,9 @@ void Reset_Grid( grid* ); void Reset_Marks( grid*, ivec*, int ); -void Cuda_Reset_Grid( grid* ); +#ifdef __cplusplus +} +#endif -//CUDA functions -void Cuda_Reset_Workspace (reax_system *, static_storage *); -void Cuda_Reset( reax_system*, control_params*, simulation_data*, - static_storage*, list** ); -void Cuda_Reset_Atoms (reax_system *); #endif diff --git a/PuReMD-GPU/src/single_body_interactions.c b/PuReMD-GPU/src/single_body_interactions.c new file mode 100644 index 0000000000000000000000000000000000000000..b26f493e703819f066389991a4845acab113b326 --- /dev/null +++ b/PuReMD-GPU/src/single_body_interactions.c @@ -0,0 +1,314 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
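/* Reset_Marks above walks the mark array through a flattened row-major
 * index over (ncell[0], ncell[1], ncell[2]). The same computation factored
 * as a helper, shown only as a sketch of the layout; the codebase indexes
 * inline as above: */
    static inline int grid_flat_index( const grid *g, int x, int y, int z )
    {
        /* row-major: x varies slowest, z fastest */
        return ( x * g->ncell[1] + y ) * g->ncell[2] + z;
    }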
+ ----------------------------------------------------------------------*/ + +#include "single_body_interactions.h" + +#include "bond_orders.h" +#include "index_utils.h" +#include "list.h" +#include "lookup.h" +#include "vector.h" + + +void LonePair_OverUnder_Coordination_Energy( reax_system *system, + control_params *control, + simulation_data *data, + static_storage *workspace, + list **lists, + output_controls *out_control ) +{ + int i, j, pj, type_i, type_j; + real Delta_lpcorr, dfvl; + real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi; + real e_lph, Di, vov3, deahu2dbo, deahu2dsbo; + real e_ov, CEover1, CEover2, CEover3, CEover4; + real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2; + real exp_ovun2n, exp_ovun6, exp_ovun8; + real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; + real e_un, CEunder1, CEunder2, CEunder3, CEunder4; + real p_lp1, p_lp2, p_lp3; + real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8; + + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_data *pbond; + bond_order_data *bo_ij; + list *bonds = (*lists) + BONDS; + + /* Initialize parameters */ + p_lp1 = system->reaxprm.gp.l[15]; + p_lp3 = system->reaxprm.gp.l[5]; + p_ovun3 = system->reaxprm.gp.l[32]; + p_ovun4 = system->reaxprm.gp.l[31]; + p_ovun6 = system->reaxprm.gp.l[6]; + p_ovun7 = system->reaxprm.gp.l[8]; + p_ovun8 = system->reaxprm.gp.l[9]; + + for( i = 0; i < system->N; ++i ) { + /* set the parameter pointer */ + type_i = system->atoms[i].type; + sbp_i = &(system->reaxprm.sbp[ type_i ]); + + /* lone-pair Energy */ + p_lp2 = sbp_i->p_lp2; + expvd2 = EXP( -75 * workspace->Delta_lp[i] ); + inv_expvd2 = 1. / (1. + expvd2 ); + + /* calculate the energy */ + data->E_Lp += e_lp = + p_lp2 * workspace->Delta_lp[i] * inv_expvd2; + + dElp = p_lp2 * inv_expvd2 + + 75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); + CElp = dElp * workspace->dDelta_lp[i]; + + workspace->CdDelta[i] += CElp; // lp - 1st term + +#ifdef TEST_ENERGY + fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", + p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); + fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n", + workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp ); +#endif +#ifdef TEST_FORCES + Add_dDelta( system, lists, i, CElp, workspace->f_lp ); // lp - 1st term +#endif + + /* correction for C2 */ + if( system->reaxprm.gp.l[5] > 0.001 && + !strcmp( system->reaxprm.sbp[type_i].name, "C" ) ) + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + j = bonds->select.bond_list[pj].nbr; + type_j = system->atoms[j].type; + + if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) { + twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + Di = workspace->Delta[i]; + vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.); + + if( vov3 > 3. ) { + data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0); + //estrain(i) += e_lph; + + deahu2dbo = 2.*p_lp3*(vov3 - 3.); + deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. 
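/* The lone-pair term above is e_lp(D) = p_lp2 * D / (1 + exp(-75 D)) with
 * D = Delta_lp[i], and dElp is its exact derivative,
 * p_lp2/(1+e) + 75 p_lp2 D e/(1+e)^2 with e = exp(-75 D). A small sketch
 * for spot-checking such derivative expressions by central differences;
 * f_lp is a hypothetical helper mirroring the term above and requires
 * <math.h>: */
    static double f_lp( double p_lp2, double D )
    {
        return p_lp2 * D / ( 1.0 + exp( -75.0 * D ) );
    }

    /* ( f_lp(p, D + h) - f_lp(p, D - h) ) / (2.0 * h), with h ~ 1e-6,
     * should agree with dElp evaluated at D */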
- 0.16*POW(Di, 3.)); + + bo_ij->Cdbo += deahu2dbo; + workspace->CdDelta[i] += deahu2dsbo; +#ifdef TEST_ENERGY + fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n", + // workspace->orig_id[i], workspace->orig_id[j], + i+1, j+1, e_lph, deahu2dbo, deahu2dsbo ); +#endif +#ifdef TEST_FORCES + Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp); + Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp); +#endif + } + } + + } + } + + + for( i = 0; i < system->N; ++i ) { + type_i = system->atoms[i].type; + sbp_i = &(system->reaxprm.sbp[ type_i ]); + + /* over-coordination energy */ + if( sbp_i->mass > 21.0 ) + dfvl = 0.0; + else dfvl = 1.0; // only for 1st-row elements + + p_ovun2 = sbp_i->p_ovun2; + sum_ovun1 = 0; + sum_ovun2 = 0; + + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { + j = bonds->select.bond_list[pj].nbr; + type_j = system->atoms[j].type; + bo_ij = &(bonds->select.bond_list[pj].bo_data); + sbp_j = &(system->reaxprm.sbp[ type_j ]); + twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); + + sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO; + sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])* + ( bo_ij->BO_pi + bo_ij->BO_pi2 ); + + /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", + i+1, j+1, + dfvl * workspace->Delta_lp_temp[j], + sbp_j->nlp_opt, + workspace->nlp_temp[j] );*/ + } + + exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 ); + inv_exp_ovun1 = 1.0 / (1 + exp_ovun1); + Delta_lpcorr = workspace->Delta[i] - + (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; + + exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr ); + inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2); + + DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); + CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2; + + data->E_Ov += e_ov = sum_ovun1 * CEover1; + + CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 * + ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) ); + + CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 ); + + CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1); + + + /* under-coordination potential */ + p_ovun2 = sbp_i->p_ovun2; + p_ovun5 = sbp_i->p_ovun5; + + exp_ovun2n = 1.0 / exp_ovun2; + exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr ); + exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2); + inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n); + inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8); + + data->E_Un += e_un = + -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8; + + CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 + + p_ovun2 * e_un * exp_ovun2n); + CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8; + CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1); + CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * + p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2; + + //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n", + // i+1, sum_ovun2, e_ov, e_un ); + + /* forces */ + workspace->CdDelta[i] += CEover3; // OvCoor - 2nd term + workspace->CdDelta[i] += CEunder3; // UnCoor - 1st term + +#ifdef TEST_FORCES + Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor - 2nd + Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st +#endif + + + for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ + pbond = &(bonds->select.bond_list[pj]); + j = pbond->nbr; + type_j = system->atoms[j].type; + bo_ij = &(pbond->bo_data); + twbp = &(system->reaxprm.tbp[ 
index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); + + + bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st + workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* + (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a + bo_ij->Cdbopi += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + bo_ij->Cdbopi2 += CEover4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b + + + workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2); // UnCoor - 2a + bo_ij->Cdbopi += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + bo_ij->Cdbopi2 += CEunder4 * + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b + + +#ifdef TEST_ENERGY + /* fprintf( out_control->eov, "%6d%23.15e%23.15e" + workspace->orig_id[j]+1, + //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2, + CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */ + + /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[j]+1, + CEover4, + CEover4* + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), + (1.0 - dfvl*workspace->dDelta_lp[j]), + CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ + + /* fprintf( out_control->eun, "%6d%23.15e\n", + workspace->orig_id[j]+1, CEunder3 ); */ + + /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[j]+1, + CEunder4, + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + CEunder4* + (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]), + CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])* + (bo_ij->BO_pi + bo_ij->BO_pi2) );*/ +#endif + +#ifdef TEST_FORCES + Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, + workspace->f_ov ); // OvCoor - 1st term + + Add_dDelta( system, lists, j, + CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a + + Add_dBOpinpi2( system, lists, i, pj, + CEover4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + CEover4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + workspace->f_ov, workspace->f_ov ); // OvCoor - 3b + + Add_dDelta( system, lists, j, + CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * + (bo_ij->BO_pi + bo_ij->BO_pi2), + workspace->f_un ); // UnCoor - 2a + + Add_dBOpinpi2( system, lists, i, pj, + CEunder4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + CEunder4 * (workspace->Delta[j] - + dfvl * workspace->Delta_lp_temp[j]), + workspace->f_un, workspace->f_un ); // UnCoor - 2b +#endif + } + +#ifdef TEST_ENERGY + + fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", + i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); + + fprintf( out_control->eov, "%6d%15.8f%15.8f\n", + i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un ); + + fprintf( out_control->eov, "%6d%15.8f%15.8f\n", + i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un ); +#endif + } +} diff --git a/PuReMD-GPU/src/single_body_interactions.h b/PuReMD-GPU/src/single_body_interactions.h index dd26679755915e5faeecb1c3da90e81ff686132f..5ebe03b85785e956074423a17f37494f1c6ae36c 100644 --- a/PuReMD-GPU/src/single_body_interactions.h +++ b/PuReMD-GPU/src/single_body_interactions.h @@ -21,33 +21,13 @@ #ifndef __SINGLE_BODY_INTERACTIONS_H_ #define __SINGLE_BODY_INTERACTIONS_H_ -#include <mytypes.h> +#include "mytypes.h" + void 
LonePair_OverUnder_Coordination_Energy( reax_system*, control_params*,
        simulation_data*, static_storage*, list**, output_controls* );

-//CUDA Functions...
-GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int );
-
-GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int,
-        real *, real *, real *);
-
-GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int,
-        real *, real *, real *);
-
-GLOBAL void test_LonePair_Postprocess ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int );
+

 #endif
diff --git a/PuReMD-GPU/src/sort.h b/PuReMD-GPU/src/sort.h
index 1ccd7116ea529a817496d1265e83b6d81d186430..11bb61288384191a057e58788b03883a7703e5a1 100644
--- a/PuReMD-GPU/src/sort.h
+++ b/PuReMD-GPU/src/sort.h
@@ -23,14 +23,16 @@
 #include "mytypes.h"

-HOST_DEVICE inline void h_swap(sparse_matrix_entry *array, int index1, int index2)
+
+static inline HOST_DEVICE void h_swap(sparse_matrix_entry *array, int index1, int index2)
 {
     sparse_matrix_entry temp = array[index1];
     array[index1] = array[index2];
     array[index2] = temp;
 }

-HOST_DEVICE inline void h_quick_sort(sparse_matrix_entry *array, int start, int end)
+
+static inline HOST_DEVICE void h_quick_sort(sparse_matrix_entry *array, int start, int end)
 {
     int i = start;
     int k = end;
@@ -51,14 +53,16 @@
     }
 }

-inline void d_swap(sparse_matrix_entry *array, int index1, int index2)
+
+static inline void d_swap(sparse_matrix_entry *array, int index1, int index2)
 {
     sparse_matrix_entry temp = array[index1];
     array[index1] = array[index2];
     array[index2] = temp;
 }

-inline void d_quick_sort(sparse_matrix_entry *array, int start, int end)
+
+static inline void d_quick_sort(sparse_matrix_entry *array, int start, int end)
 {
     int i = start;
     int k = end;
@@ -82,5 +86,4 @@
 }

-
 #endif
diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c
new file mode 100644
index 0000000000000000000000000000000000000000..0126b86b776dce8fd30aea0c228731b95104b216
--- /dev/null
+++ b/PuReMD-GPU/src/system_props.c
@@ -0,0 +1,348 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+ ----------------------------------------------------------------------*/ + +#include "system_props.h" + +#include "box.h" +#include "vector.h" + + +HOST real Get_Time( ) +{ + struct timeval tim; + + gettimeofday(&tim, NULL ); + return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); +} + + +HOST real Get_Timing_Info( real t_start ) +{ + struct timeval tim; + real t_end; + + gettimeofday(&tim, NULL ); + t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); + return (t_end - t_start); +} + + +void Temperature_Control( control_params *control, simulation_data *data, + output_controls *out_control ) +{ + real tmp; + + if( control->T_mode == 1 ) { // step-wise temperature control + if( (data->step - data->prev_steps) % + ((int)(control->T_freq / control->dt)) == 0 ) { + if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) ) + control->T += control->T_rate; + else control->T = control->T_final; + } + } + else if( control->T_mode == 2 ) { // constant slope control + tmp = control->T_rate * control->dt / control->T_freq; + + if( fabs( control->T - control->T_final ) >= fabs( tmp ) ) + control->T += tmp; + } +} + + +void Compute_Total_Mass( reax_system *system, simulation_data *data ) +{ + int i; + int blocks; + int block_size; + real *partial_sums = 0; + + data->M = 0; + + for( i = 0; i < system->N; i++ ) + data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass; + + data->inv_M = 1. / data->M; +} + + +void Compute_Center_of_Mass( reax_system *system, simulation_data *data, + FILE *fout ) +{ + int i; + real m, xx, xy, xz, yy, yz, zz, det; + rvec tvec, diff; + rtensor mat, inv; + + int blocks; + int block_size; + rvec *l_xcm, *l_vcm, *l_amcm; + real t_start, t_end; + + rvec_MakeZero( data->xcm ); // position of CoM + rvec_MakeZero( data->vcm ); // velocity of CoM + rvec_MakeZero( data->amcm ); // angular momentum of CoM + rvec_MakeZero( data->avcm ); // angular velocity of CoM + + /* Compute the position, velocity and angular momentum about the CoM */ + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + + rvec_ScaledAdd( data->xcm, m, system->atoms[i].x ); + rvec_ScaledAdd( data->vcm, m, system->atoms[i].v ); + + rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v ); + rvec_ScaledAdd( data->amcm, m, tvec ); + + /*fprintf( fout,"%3d %g %g %g\n", + i+1, + system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] ); + fprintf( fout, "vcm: %g %g %g\n", + data->vcm[0], data->vcm[1], data->vcm[2] ); + */ + } + + rvec_Scale( data->xcm, data->inv_M, data->xcm ); + rvec_Scale( data->vcm, data->inv_M, data->vcm ); + + rvec_Cross( tvec, data->xcm, data->vcm ); + rvec_ScaledAdd( data->amcm, -data->M, tvec ); + + data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm ); + + /* Calculate and then invert the inertial tensor */ + xx = xy = xz = yy = yz = zz = 0; + + for( i = 0; i < system->N; ++i ) { + m = system->reaxprm.sbp[ system->atoms[i].type ].mass; + + rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm ); + xx += diff[0] * diff[0] * m; + xy += diff[0] * diff[1] * m; + xz += diff[0] * diff[2] * m; + yy += diff[1] * diff[1] * m; + yz += diff[1] * diff[2] * m; + zz += diff[2] * diff[2] * m; + } + +#ifdef __DEBUG_CUDA__ + fprintf (stderr, " xx: %f \n", xx); + fprintf (stderr, " xy: %f \n", xy); + fprintf (stderr, " xz: %f \n", xz); + fprintf (stderr, " yy: %f \n", yy); + fprintf (stderr, " yz: %f \n", yz); + fprintf (stderr, " zz: %f \n", zz); +#endif + + mat[0][0] = yy + zz; + mat[0][1] = mat[1][0] = -xy; + mat[0][2] = mat[2][0] 
= -xz;
+    mat[1][1] = xx + zz;
+    mat[2][1] = mat[1][2] = -yz;
+    mat[2][2] = xx + yy;
+
+    /* invert the inertial tensor */
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] +
+            mat[0][1] * mat[1][2] * mat[2][0] +
+            mat[0][2] * mat[1][0] * mat[2][1] ) -
+          ( mat[0][0] * mat[1][2] * mat[2][1] +
+            mat[0][1] * mat[1][0] * mat[2][2] +
+            mat[0][2] * mat[1][1] * mat[2][0] );
+
+    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+    if( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1./det, inv );
+    else
+        rtensor_MakeZero( inv );
+
+    /* Compute the angular velocity about the center of mass */
+    rtensor_MatVec( data->avcm, inv, data->amcm );
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+
+#if defined(DEBUG)
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
+             data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
+             data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
+             data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( fout, "mat:  %f %f %f\n      %f %f %f\n      %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2],
+       mat[1][0], mat[1][1], mat[1][2],
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( fout, "inv:  %g %g %g\n      %g %g %g\n      %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2],
+       inv[1][0], inv[1][1], inv[1][2],
+       inv[2][0], inv[2][1], inv[2][2] );
+       fflush( fout ); */
+    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
+             data->avcm[0], data->avcm[1], data->avcm[2] );
+#endif
+}
+
+
+void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
+{
+    int i;
+    rvec p;
+    real m;
+
+    data->E_Kin = 0.0;
+
+    for( i = 0; i < system->N; i++ ) {
+        m = system->reaxprm.sbp[system->atoms[i].type].mass;
+
+        rvec_Scale( p, m, system->atoms[i].v );
+        data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v );
+
+        /* fprintf( stderr, "%d, %lf, %lf, %lf %lf\n",
+           i, system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
+           system->reaxprm.sbp[system->atoms[i].type].mass ); */
+    }
+
+    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+
+    if( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being exactly zero */
+        data->therm.T = ALMOST_ZERO;
+}
+
+
+/* IMPORTANT: This function assumes that the current kinetic energy and
+ * the center of mass of the system have already been computed.
+ *
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+ * to be added when there are long-range interactions or long-range
+ * corrections to short-range interactions present.
+ * We may want to add that for more accuracy.
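/* Compute_Kinetic_Energy above converts kinetic energy to an instantaneous
 * temperature through equipartition, T = 2 E_kin / (N_f k_B), then clamps it
 * away from zero. The relation in isolation, as a minimal sketch: */
    /* N_f = degrees of freedom, k_B = Boltzmann constant in the code's
     * units (K_B) */
    static double instantaneous_T( double e_kin, double n_f, double k_B )
    {
        return ( 2.0 * e_kin ) / ( n_f * k_B );
    }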
+ */
+void Compute_Pressure_Isotropic( reax_system* system, control_params *control,
+        simulation_data* data,
+        output_controls *out_control )
+{
+    int i;
+    reax_atom *p_atom;
+    rvec tx;
+    rvec tmp;
+    simulation_box *box = &(system->box);
+
+    /* Calculate internal pressure */
+    rvec_MakeZero( data->int_press );
+
+    // 0: both int and ext, 1: ext only, 2: int only
+    if( control->press_mode == 0 || control->press_mode == 2 ) {
+        for( i = 0; i < system->N; ++i ) {
+            p_atom = &( system->atoms[i] );
+
+            /* transform x into unit box coordinates */
+            Transform_to_UnitBox( p_atom->x, box, 1, tx );
+
+            /* this atom's contribution to internal pressure */
+            rvec_Multiply( tmp, p_atom->f, tx );
+            rvec_Add( data->int_press, tmp );
+
+            if( out_control->debug_level > 0 ) {
+                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f",
+                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f",
+                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n",
+                        data->int_press[0], data->int_press[1], data->int_press[2] );
+            }
+        }
+    }
+
+    /* kinetic contribution */
+    data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
+
+    /* Calculate total pressure in each direction */
+    data->tot_press[0] = data->kin_press -
+        ((data->int_press[0] + data->ext_press[0]) /
+         (box->box_norms[1] * box->box_norms[2] * P_CONV));
+
+    data->tot_press[1] = data->kin_press -
+        ((data->int_press[1] + data->ext_press[1]) /
+         (box->box_norms[0] * box->box_norms[2] * P_CONV));
+
+    data->tot_press[2] = data->kin_press -
+        ((data->int_press[2] + data->ext_press[2]) /
+         (box->box_norms[0] * box->box_norms[1] * P_CONV));
+
+    /* Average pressure for the whole box */
+    data->iso_bar.P = (data->tot_press[0] + data->tot_press[1] +
+            data->tot_press[2]) / 3;
+}
+
+
+void Compute_Pressure_Isotropic_Klein( reax_system* system,
+        simulation_data* data )
+{
+    int i;
+    reax_atom *p_atom;
+    rvec dx;
+
+    // IMPORTANT: This function assumes that the current kinetic energy and
+    // the center of mass of the system have already been computed.
+    data->iso_bar.P = 2.0 * data->E_Kin;
+
+    for( i = 0; i < system->N; ++i )
+    {
+        p_atom = &( system->atoms[i] );
+        rvec_ScaledSum( dx, 1.0, p_atom->x, -1.0, data->xcm );
+        data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
+    }
+
+    data->iso_bar.P /= (3.0 * system->box.volume);
+
+    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+    // to be added when there are long-range interactions or long-range
+    // corrections to short-range interactions present.
+    // We may want to add that for more accuracy.
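+    // In closed form, the estimate above is
+    //   P = ( 2 * E_Kin - F_CONV * sum_i f_i . (x_i - x_cm) ) / ( 3 * V ),
+    // i.e. an atomic virial expression taken about the center of mass.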
+} + + +void Compute_Pressure( reax_system* system, simulation_data* data, + static_storage *workspace ) +{ + int i; + reax_atom *p_atom; + rtensor temp; + + rtensor_MakeZero( data->flex_bar.P ); + + for( i = 0; i < system->N; ++i ) { + p_atom = &( system->atoms[i] ); + // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx ); + rvec_OuterProduct( temp, p_atom->v, p_atom->v ); + rtensor_ScaledAdd( data->flex_bar.P, + system->reaxprm.sbp[ p_atom->type ].mass, temp ); + // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); + rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); + } + + rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P ); + data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0; +} diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h index d287992f7b178f589808acfbaae1ed474a7a65e2..874132451d02b2d62d87c82065874f04a35b2d37 100644 --- a/PuReMD-GPU/src/system_props.h +++ b/PuReMD-GPU/src/system_props.h @@ -21,7 +21,12 @@ #ifndef __SYSTEM_PROP_H_ #define __SYSTEM_PROP_H_ -#include <mytypes.h> +#include "mytypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif real Get_Time( ); @@ -30,21 +35,18 @@ real Get_Timing_Info( real ); void Temperature_Control( control_params*, simulation_data*, output_controls* ); void Compute_Total_Mass( reax_system*, simulation_data* ); -void Cuda_Compute_Total_Mass( reax_system*, simulation_data* ); void Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); -void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); void Compute_Kinetic_Energy( reax_system*, simulation_data* ); -void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* ); void Compute_Pressure( reax_system*, simulation_data*, static_storage* ); void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* ); -void prep_dev_system (reax_system *system); -GLOBAL void Compute_Total_Mass (single_body_parameters *, reax_atom *, real *, size_t ); -//GLOBAL void Compute_Kinetic_Energy (single_body_parameters *, reax_atom *, unsigned int , simulation_data *, real *); +#ifdef __cplusplus +} +#endif #endif diff --git a/PuReMD-GPU/src/testmd.cu b/PuReMD-GPU/src/testmd.c similarity index 51% rename from PuReMD-GPU/src/testmd.cu rename to PuReMD-GPU/src/testmd.c index 93f286cc2c0f99cc4aa9788195881dd7ffa15f9f..afe0cd4a5cf7e7282d7e2409f7f0edc66c34296a 100644 --- a/PuReMD-GPU/src/testmd.cu +++ b/PuReMD-GPU/src/testmd.c @@ -19,9 +19,11 @@ ----------------------------------------------------------------------*/ #include "mytypes.h" + #include "analyze.h" #include "box.h" #include "forces.h" +#include "grid.h" #include "init_md.h" #include "integrate.h" #include "neighbors.h" @@ -34,11 +36,17 @@ #include "traj.h" #include "vector.h" -#include "grid.h" -#include "cuda_utils.h" -#include "cuda_copy.h" -#include "validation.h" +#include "cuda_environment.h" +#include "cuda_forces.h" +#include "cuda_init_md.h" +#include "cuda_neighbors.h" +#include "cuda_post_evolve.h" +#include "cuda_reset_utils.h" +#include "cuda_system_props.h" +#ifdef __BUILD_DEBUG__ + #include "validation.h" +#endif interaction_function Interaction_Functions[NO_OF_INTERACTIONS]; @@ -48,11 +56,10 @@ print_interaction Print_Interactions[NO_OF_INTERACTIONS]; LR_lookup_table *LR; LR_lookup_table *d_LR; -list *dev_lists; +list *dev_lists; static_storage *dev_workspace; reax_timing d_timing; - //TODO real *testdata; @@ -61,13 +68,6 @@ void *scratch; int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE; int 
MATVEC_BLOCKS; -cublasStatus_t cublasStatus; -cublasHandle_t cublasHandle; - -cusparseHandle_t cusparseHandle; -cusparseStatus_t cusparseStatus; -cusparseMatDescr_t matdescriptor; - void Post_Evolve( reax_system* system, control_params* control, simulation_data* data, static_storage* workspace, @@ -90,12 +90,14 @@ void Post_Evolve( reax_system* system, control_params* control, /* remove rotational and translational velocity of the center of mass */ if( control->ensemble != NVE && control->remove_CoM_vel && - data->step && data->step % control->remove_CoM_vel == 0 ) { + data->step && data->step % control->remove_CoM_vel == 0 ) + { /* compute velocity of the center of mass */ Compute_Center_of_Mass( system, data, out_control->prs ); - for( i = 0; i < system->N; i++ ) { + for( i = 0; i < system->N; i++ ) + { // remove translational rvec_ScaledAdd( system->atoms[i].v, -1., data->vcm ); @@ -107,98 +109,6 @@ void Post_Evolve( reax_system* system, control_params* control, } } -GLOBAL void Update_Atoms_Post_Evolve (reax_atom *atoms, simulation_data *data, int N) -{ - rvec diff, cross; - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= N) return; - - //for( i = 0; i < system->N; i++ ) { - // remove translational - rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); - - // remove rotational - rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm ); - rvec_Cross( cross, data->avcm, diff ); - rvec_ScaledAdd( atoms[i].v, -1., cross ); - //} -} - -void Cuda_Post_Evolve( reax_system* system, control_params* control, - simulation_data* data, static_storage* workspace, - list** lists, output_controls *out_control ) -{ - int i; - rvec diff, cross; - - /* compute kinetic energy of the system */ - /* - real *results = (real *) scratch; - cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); - Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> - (system->reaxprm.d_sbp, system->d_atoms, system->N, - (simulation_data *)data->d_simulation_data, (real *) results); - cudaThreadSynchronize (); - cudaCheckError (); - */ - - //fprintf (stderr, "Cuda_Post_Evolve: Begin\n"); - Cuda_Compute_Kinetic_Energy (system, data); - //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n"); - - /* remove rotational and translational velocity of the center of mass */ - if( control->ensemble != NVE && - control->remove_CoM_vel && - data->step && data->step % control->remove_CoM_vel == 0 ) { - - /* - rvec t_xcm, t_vcm, t_avcm; - rvec_MakeZero (t_xcm); - rvec_MakeZero (t_vcm); - rvec_MakeZero (t_avcm); - - rvec_Copy (t_xcm, data->xcm); - rvec_Copy (t_vcm, data->vcm); - rvec_Copy (t_avcm, data->avcm); - */ - - /* compute velocity of the center of mass */ - Cuda_Compute_Center_of_Mass( system, data, out_control->prs ); - //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... 
\n"); - /* - fprintf (stderr, "center of mass done on the device \n"); - - fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm ); - fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm ); - fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm ); - - if (check_zero (t_xcm, data->xcm) || - check_zero (t_vcm, data->vcm) || - check_zero (t_avcm, data->avcm)){ - fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n"); - exit (0); - } - */ - - //xcm, avcm, - copy_host_device (data->vcm, ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - copy_host_device (data->xcm, ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - copy_host_device (data->avcm, ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); - - //fprintf (stderr, "data copied.... \n"); - - Update_Atoms_Post_Evolve <<< BLOCKS, BLOCK_SIZE >>> - (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N); - cudaThreadSynchronize (); - cudaCheckError (); - - //fprintf (stderr, " Cuda_Post_Evolve:End \n"); - - } -} - - - void Read_System( char *geof, char *ff, char *ctrlf, reax_system *system, control_params *control, @@ -217,15 +127,21 @@ void Read_System( char *geof, char *ff, char *ctrlf, Read_Control_File( ctrl, system, control, out_control ); /* geo file */ - if( control->geo_format == XYZ ) { + if( control->geo_format == XYZ ) + { fprintf( stderr, "xyz input is not implemented yet\n" ); - exit(1); + exit( 1 ); } else if( control->geo_format == PDB ) + { Read_PDB( geof, system, control, data, workspace ); + } else if( control->geo_format == BGF ) + { Read_BGF( geof, system, control, data, workspace ); - else if( control->geo_format == ASCII_RESTART ) { + } + else if( control->geo_format == ASCII_RESTART ) + { Read_ASCII_Restart( geof, system, control, data, workspace ); control->restart = 1; } @@ -233,9 +149,10 @@ void Read_System( char *geof, char *ff, char *ctrlf, Read_Binary_Restart( geof, system, control, data, workspace ); control->restart = 1; } - else { + else + { fprintf( stderr, "unknown geo file format. 
terminating!\n" ); - exit(1); + exit( 1 ); } #if defined(DEBUG_FOCUS) @@ -244,17 +161,18 @@ void Read_System( char *geof, char *ff, char *ctrlf, #endif } -void Init_Data_Structures (simulation_data *data) + +void Init_Data_Structures( simulation_data *data ) { //data->step = 0; //data->prev_steps = 0; //data->time = 0; - memset (data, 0, SIMULATION_DATA_SIZE ); + memset( data, 0, SIMULATION_DATA_SIZE ); } -int main(int argc, char* argv[]) +int main( int argc, char* argv[] ) { reax_system system; control_params control; @@ -271,16 +189,7 @@ int main(int argc, char* argv[]) lists = (list*) malloc( sizeof(list) * LIST_N ); - cudaDeviceSetLimit (cudaLimitStackSize, 8192); - cudaDeviceSetCacheConfig (cudaFuncCachePreferL1); - cudaCheckError (); - - cublasCheckError (cublasStatus = cublasCreate (&cublasHandle)); - - cusparseCheckError (cusparseStatus = cusparseCreate (&cusparseHandle)); - cusparseCheckError (cusparseCreateMatDescr (&matdescriptor)); - cusparseSetMatType (matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase (matdescriptor, CUSPARSE_INDEX_BASE_ZERO); + Setup_Cuda_Environment( 0, 1, 1 ); dev_lists = (list *) malloc (sizeof (list) * LIST_N ); dev_workspace = (static_storage *) malloc (STORAGE_SIZE); @@ -289,32 +198,14 @@ int main(int argc, char* argv[]) dev_workspace->realloc.estimate_nbrs = -1; //Cleanup before usage. - Init_Data_Structures (&data); - system.init_thblist = false; + Init_Data_Structures( &data ); + system.init_thblist = FALSE; Read_System( argv[1], argv[2], argv[3], &system, &control, &data, &workspace, &out_control ); - compute_blocks (&BLOCKS, &BLOCK_SIZE, system.N); - compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2); - - //MATVEC_BLOCKS = system.N; - //MATVEC_BLOCK_SIZE = 32; - - MATVEC_BLOCKS = (system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) + - ((system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 
0 : 1); - -#ifdef __DEBUG_CUDA__ - fprintf (stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE ); - fprintf (stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE ); - fprintf (stderr, " Size of far neighbor data %d \n", sizeof (far_neighbor_data)); - fprintf (stderr, " Size of reax_atom %d \n", sizeof (reax_atom)); - fprintf (stderr, " size of sparse matrix entry %d \n", sizeof (sparse_matrix_entry)); - fprintf (stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system.N); -#endif - #ifdef __CUDA_MEM__ - print_device_mem_usage (); + print_device_mem_usage( ); #endif #ifdef __BUILD_DEBUG__ @@ -322,71 +213,51 @@ int main(int argc, char* argv[]) &out_control, &Evolve ); #endif - t_start = Get_Time (); + t_start = Get_Time( ); Cuda_Initialize( &system, &control, &data, &workspace, &lists, - &out_control, &Cuda_Evolve); - t_elapsed = Get_Timing_Info (t_start); + &out_control, &Cuda_Evolve ); + t_elapsed = Get_Timing_Info( t_start ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed ); + fprintf( stderr, " Cuda Initialize timing ---> %f \n", t_elapsed ); #endif - #ifdef __CUDA_MEM__ - print_device_mem_usage (); + print_device_mem_usage( ); #endif #ifdef __BUILD_DEBUG__ Reset( &system, &control, &data, &workspace, &lists ); #endif - Cuda_Reset( &system, &control, &data, &workspace, &lists ); - + Cuda_Reset( &system, &control, &data, &workspace, &lists ); #ifdef __BUILD_DEBUG__ - Generate_Neighbor_Lists ( &system, &control, &data, &workspace, + Generate_Neighbor_Lists( &system, &control, &data, &workspace, &lists, &out_control ); #endif - /* - dim3 blockspergrid (system.g.ncell[0], system.g.ncell[1], system.g.ncell[2]); - dim3 threadsperblock (system.g.max_atoms); - - t_start = Get_Time (); - Cuda_Bin_Atoms (&system, &workspace); - Cuda_Bin_Atoms_Sync ( &system ); - - Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> - (system.d_atoms, system.d_g, system.d_box, - (control_params *)control.d_control, *(dev_lists + FAR_NBRS)); - cudaThreadSynchronize (); - cudaCheckError (); - t_elapsed = Get_Timing_Info (t_start); - d_timing.nbrs += t_elapsed; - */ - - Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, false); + Cuda_Generate_Neighbor_Lists( &system, &workspace, &control, FALSE ); #ifdef __BUILD_DEBUG__ Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); #endif - Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); + Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); #ifdef __BUILD_DEBUG__ Compute_Kinetic_Energy( &system, &data ); #endif - Cuda_Compute_Kinetic_Energy (&system, &data); + Cuda_Compute_Kinetic_Energy (&system, &data); #ifndef __BUILD_DEBUG__ - // Here sync the simulation data, because it has been changed. - Prep_Device_For_Output ( &system, &data ); + Cuda_Setup_Output( &system, &data ); Output_Results(&system, &control, &data, &workspace, &lists, &out_control); #endif #ifdef __BUILD_DEBUG__ - if (!validate_device (&system, &data, &workspace, &lists) ) + if( !validate_device (&system, &data, &workspace, &lists) ) { fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step); exit (1); @@ -397,50 +268,45 @@ int main(int argc, char* argv[]) fprintf (stderr, "step -> %d <- done. \n", data.step); #endif - ++data.step; - - for( ; data.step <= control.nsteps; data.step++ ) { - - //fprintf (stderr, "Begin ... 
\n"); - //to Sync step to the device. - //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice ); - copy_host_device (&data.step, &((simulation_data *)data.d_simulation_data)->step, - INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); + for( ; data.step <= control.nsteps; data.step++ ) + { + Cuda_Setup_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); //fprintf (stderr, "Synched data .... \n"); - if( control.T_mode ) { + if( control.T_mode ) + { Temperature_Control( &control, &data, &out_control ); - Sync_Host_Device (&control, (control_params *)control.d_control, cudaMemcpyHostToDevice ); + Cuda_Sync_Temp( &control ); } //fprintf (stderr, "Temp. Control done ... \n"); #ifdef __BUILD_DEBUG__ Evolve( &system, &control, &data, &workspace, &lists, &out_control ); #endif - Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); + Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); //fprintf (stderr, "Evolve done \n"); - #ifdef __BUILD_DEBUG__ Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); #endif + Cuda_Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control ); //fprintf (stderr, "Post Evolve done \n"); #ifndef __BUILD_DEBUG__ - Prep_Device_For_Output ( &system, &data ); - Output_Results(&system, &control, &data, &workspace, &lists, &out_control); + Cuda_Setup_Output( &system, &data ); + Output_Results( &system, &control, &data, &workspace, &lists, &out_control ); - /* - Analysis( &system, &control, &data, &workspace, &lists, &out_control ); - */ + //Analysis( &system, &control, &data, &workspace, &lists, &out_control ); steps = data.step - data.prev_steps; if( steps && out_control.restart_freq && steps % out_control.restart_freq == 0 ) + { Write_Restart( &system, &control, &data, &workspace, &out_control ); + } #endif #ifdef __BUILD_DEBUG__ @@ -463,5 +329,7 @@ int main(int argc, char* argv[]) data.timing.elapsed = Get_Timing_Info( data.timing.start ); fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed ); + Cleanup_Cuda_Environment( ); + return 0; } diff --git a/PuReMD-GPU/src/three_body_interactions.c b/PuReMD-GPU/src/three_body_interactions.c new file mode 100644 index 0000000000000000000000000000000000000000..7ac96e057c6c799ba88204f3f6339fe54b3c61da --- /dev/null +++ b/PuReMD-GPU/src/three_body_interactions.c @@ -0,0 +1,801 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "three_body_interactions.h" + +#include "bond_orders.h" +#include "list.h" +#include "lookup.h" +#include "vector.h" +#include "index_utils.h" + + +/* calculates the theta angle between i-j-k */ +void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, + real *theta, real *cos_theta ) +{ + (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk ); + if( *cos_theta > 1. ) *cos_theta = 1.0; + if( *cos_theta < -1. ) *cos_theta = -1.0; + + (*theta) = ACOS( *cos_theta ); +} + + +/* calculates the derivative of the cosine of the angle between i-j-k */ +void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, + rvec* dcos_theta_di, rvec* dcos_theta_dj, + rvec* dcos_theta_dk ) +{ + int t; + real sqr_d_ji = SQR(d_ji); + real sqr_d_jk = SQR(d_jk); + real inv_dists = 1.0 / (d_ji * d_jk); + real inv_dists3 = POW( inv_dists, 3 ); + real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 ); + real Cdot_inv3 = dot_dvecs * inv_dists3; + + for( t = 0; t < 3; ++t ) { + (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - + Cdot_inv3 * sqr_d_jk * dvec_ji[t]; + + (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists + + Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] ); + + (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - + Cdot_inv3 * sqr_d_ji * dvec_jk[t]; + } + + /*fprintf( stderr, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + dvec_jk[t] * inv_dists*/ +} + + +/* this is a 3-body interaction in which the main role is + played by j which sits in the middle of the other two. */ +void Three_Body_Interactions( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, pi, k, pk, t; + int type_i, type_j, type_k; + int start_j, end_j, start_pk, end_pk; + int flag, cnt, num_thb_intrs; + + real temp, temp_bo_jt, pBOjt7; + real p_val1, p_val2, p_val3, p_val4, p_val5; + real p_val6, p_val7, p_val8, p_val9, p_val10; + real p_pen1, p_pen2, p_pen3, p_pen4; + real p_coa1, p_coa2, p_coa3, p_coa4; + real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; + real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; + real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; + real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; + real CEpen1, CEpen2, CEpen3; + real e_ang, e_coa, e_pen; + real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; + real Cf7ij, Cf7jk, Cf8j, Cf9j; + real f7_ij, f7_jk, f8_Dj, f9_Dj; + real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; + real r_ij, r_jk; + real BOA_ij, BOA_jk; + real vlpadj; + rvec force, ext_press; + // rtensor temp_rtensor, total_rtensor; + real *total_bo; + three_body_header *thbh; + three_body_parameters *thbp; + three_body_interaction_data *p_ijk, *p_kji; + bond_data *pbond_ij, *pbond_jk, *pbond_jt; + bond_order_data *bo_ij, *bo_jk, *bo_jt; + list *bonds, *thb_intrs; + bond_data *bond_list; + three_body_interaction_data *thb_list; + + total_bo = workspace->total_bond_order; + bonds = (*lists) + BONDS; + bond_list = bonds->select.bond_list; + thb_intrs = (*lists) + THREE_BODIES; + thb_list = thb_intrs->select.three_body_list; + + /* global parameters used in these calculations */ + p_val6 = system->reaxprm.gp.l[14]; + p_val8 = system->reaxprm.gp.l[33]; + p_val9 = system->reaxprm.gp.l[16]; + p_val10 = system->reaxprm.gp.l[17]; + num_thb_intrs = 0; + + for( j = 0; j < system->N; ++j ) { + // 
fprintf( out_control->eval, "j: %d\n", j ); + type_j = system->atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + + p_val3 = system->reaxprm.sbp[ type_j ].p_val3; + p_val5 = system->reaxprm.sbp[ type_j ].p_val5; + + SBOp = 0, prod_SBO = 1; + for( t = start_j; t < end_j; ++t ) { + bo_jt = &(bond_list[t].bo_data); + SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); + temp = SQR( bo_jt->BO ); + temp *= temp; + temp *= temp; + prod_SBO *= EXP( -temp ); + } + + /* modifications to match Adri's code - 09/01/09 */ + if( workspace->vlpex[j] >= 0 ){ + vlpadj = 0; + dSBO2 = prod_SBO - 1; + } + else{ + vlpadj = workspace->nlp[j]; + dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); + } + + SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); + dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); + + if( SBO <= 0 ) + SBO2 = 0, CSBO2 = 0; + else if( SBO > 0 && SBO <= 1 ) { + SBO2 = POW( SBO, p_val9 ); + CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); + } + else if( SBO > 1 && SBO < 2 ) { + SBO2 = 2 - POW( 2-SBO, p_val9 ); + CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); + } + else + SBO2 = 2, CSBO2 = 0; + + expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); + + /* unlike 2-body intrs where we enforce i<j, we cannot put any such + restrictions here. such a restriction would prevent us from producing + all 4-body intrs correctly */ + for( pi = start_j; pi < end_j; ++pi ) { + Set_Start_Index( pi, num_thb_intrs, thb_intrs ); + + pbond_ij = &(bond_list[pi]); + bo_ij = &(pbond_ij->bo_data); + BOA_ij = bo_ij->BO - control->thb_cut; + + + if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) { + i = pbond_ij->nbr; + r_ij = pbond_ij->d; + type_i = system->atoms[i].type; + // fprintf( out_control->eval, "i: %d\n", i ); + + + /* first copy 3-body intrs from previously computed ones where i>k. + IMPORTANT: if it is less costly to compute theta and its + derivative, we should definitely re-compute them, + instead of copying! 
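+             (the copy is valid because theta is symmetric under i <-> k:
+             theta_kji == theta_ijk, and the endpoint derivatives swap, so
+             dcos_di of (i,j,k) equals dcos_dk of (k,j,i) while dcos_dj is
+             unchanged -- exactly the rvec_Copy pattern used below)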
+ in the second for-loop below, we compute only new 3-body intrs + where i < k */ + for( pk = start_j; pk < pi; ++pk ) { + // fprintf( out_control->eval, "pk: %d\n", pk ); + start_pk = Start_Index( pk, thb_intrs ); + end_pk = End_Index( pk, thb_intrs ); + + for( t = start_pk; t < end_pk; ++t ) + if( thb_list[t].thb == i ) { + p_ijk = &(thb_list[num_thb_intrs]); + p_kji = &(thb_list[t]); + + p_ijk->thb = bond_list[pk].nbr; + p_ijk->pthb = pk; + p_ijk->theta = p_kji->theta; + rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); + rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); + rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); + + //if (j == 12) + //fprintf (stderr, "Adding one for matched atom %d \n", i); + + ++num_thb_intrs; + break; + } + } + + + /* and this is the second for loop mentioned above */ + for( pk = pi+1; pk < end_j; ++pk ) { + pbond_jk = &(bond_list[pk]); + bo_jk = &(pbond_jk->bo_data); + BOA_jk = bo_jk->BO - control->thb_cut; + k = pbond_jk->nbr; + type_k = system->atoms[k].type; + p_ijk = &( thb_list[num_thb_intrs] ); + + //TODO - CHANGE ORIGINAL + if (BOA_jk <= 0) continue; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &theta, &cos_theta ); + + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, + pbond_jk->dvec, pbond_jk->d, + &(p_ijk->dcos_di), &(p_ijk->dcos_dj), + &(p_ijk->dcos_dk) ); + + p_ijk->thb = k; + p_ijk->pthb = pk; + p_ijk->theta = theta; + + //if (j == 12) + //fprintf (stderr, "Adding one for the rest %d \n", k); + + sin_theta = SIN( theta ); + if( sin_theta < 1.0e-5 ) + sin_theta = 1.0e-5; + + ++num_thb_intrs; + + + if( BOA_jk > 0.0 && + (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { + r_jk = pbond_jk->d; + thbh = &( system->reaxprm.thbp[ index_thbp(type_i,type_j,type_k,system->reaxprm.num_atom_types) ] ); + flag = 0; + + /* if( workspace->orig_id[i] < workspace->orig_id[k] ) + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); + else + fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", + workspace->orig_id[k], workspace->orig_id[j], + workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ + + + for( cnt = 0; cnt < thbh->cnt; ++cnt ) { + // fprintf( out_control->eval, + // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); + + if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { + thbp = &( thbh->prm[cnt] ); + + /* ANGLE ENERGY */ + p_val1 = thbp->p_val1; + p_val2 = thbp->p_val2; + p_val4 = thbp->p_val4; + p_val7 = thbp->p_val7; + theta_00 = thbp->theta_00; + + exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); + f7_ij = 1.0 - exp3ij; + Cf7ij = p_val3 * p_val4 * + POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; + + exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); + f7_jk = 1.0 - exp3jk; + Cf7jk = p_val3 * p_val4 * + POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; + + expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); + trm8 = 1.0 + expval6 + expval7; + f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); + Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * + (p_val6 * expval6 * trm8 - + (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); + + theta_0 = 180.0 - + theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); + theta_0 = DEG2RAD( theta_0 ); + + expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); + if( p_val1 >= 0 ) + expval12theta = p_val1 * (1.0 - expval2theta); + else // To avoid linear Me-H-Me angles (6/6/06) + expval12theta = p_val1 * -expval2theta; + + CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; + CEval2 = Cf7jk * f7_ij * f8_Dj 
* expval12theta; + CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; + CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * + expval2theta * (theta_0 - theta); + + Ctheta_0 = p_val10 * DEG2RAD(theta_00) * + exp( -p_val10 * (2.0 - SBO2) ); + + CEval5 = -CEval4 * Ctheta_0 * CSBO2; + CEval6 = CEval5 * dSBO1; + CEval7 = CEval5 * dSBO2; + CEval8 = -CEval4 / sin_theta; + + data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; + /* END ANGLE ENERGY*/ + + + /* PENALTY ENERGY */ + p_pen1 = thbp->p_pen1; + p_pen2 = system->reaxprm.gp.l[19]; + p_pen3 = system->reaxprm.gp.l[20]; + p_pen4 = system->reaxprm.gp.l[21]; + + exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); + exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); + exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); + exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); + trm_pen34 = 1.0 + exp_pen3 + exp_pen4; + f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; + Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - + (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + + p_pen4 * exp_pen4 )) / + SQR( trm_pen34 ); + + data->E_Pen += e_pen = + p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; + + CEpen1 = e_pen * Cf9j / f9_Dj; + temp = -2.0 * p_pen2 * e_pen; + CEpen2 = temp * (BOA_ij - 2.0); + CEpen3 = temp * (BOA_jk - 2.0); + /* END PENALTY ENERGY */ + + + /* COALITION ENERGY */ + p_coa1 = thbp->p_coa1; + p_coa2 = system->reaxprm.gp.l[2]; + p_coa3 = system->reaxprm.gp.l[38]; + p_coa4 = system->reaxprm.gp.l[30]; + + exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); + data->E_Coa += e_coa = + p_coa1 / (1. + exp_coa2) * + EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * + EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * + EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * + EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); + + CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; + CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; + CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); + CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; + CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; + /* END COALITION ENERGY */ + + /* FORCES */ + bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)); + bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)); + workspace->CdDelta[j] += ((CEval3 + CEval7) + + CEpen1 + CEcoa3); + workspace->CdDelta[i] += CEcoa4; + workspace->CdDelta[k] += CEcoa5; + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + // fprintf( out_control->eval, "%6d%12.8f\n", + // workspace->orig_id[ bond_list[t].nbr ], + // (CEval6 * pBOjt7) ); + + bo_jt->Cdbo += (CEval6 * pBOjt7); + bo_jt->Cdbopi += CEval5; + bo_jt->Cdbopi2 += CEval5; + } + + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + + rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk ); + + /* + if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j); + if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j); + */ + } + else { + /* terms not related to bond order derivatives + are added directly into + forces and pressure vector/tensor */ + rvec_Scale( force, CEval8, p_ijk->dcos_di ); + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); + + rvec_Scale( force, CEval8, p_ijk->dcos_dk 
); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); + rvec_Add( data->ext_press, ext_press ); + + + /* This part is for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_di, system->atoms[i].x ); + rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dj, system->atoms[j].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + rvec_OuterProduct( temp_rtensor, + p_ijk->dcos_dk, system->atoms[k].x ); + rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); + + if( pbond_ij->imaginary || pbond_jk->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, + -1.0, total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } + +#ifdef TEST_ENERGY + fprintf( out_control->eval, + //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", + "%6d%6d%6d%23.15e%23.15e%23.15e\n", + i+1, j+1, k+1, + //workspace->orig_id[i]+1, + //workspace->orig_id[j]+1, + //workspace->orig_id[k]+1, + //workspace->Delta_boc[j], + RAD2DEG(theta), /*BOA_ij, BOA_jk, */ + e_ang, data->E_Ang ); + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + p_val3, p_val4, BOA_ij, BOA_jk ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e", + f7_ij, f7_jk, f8_Dj, expval12theta ); + fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e\n", + CEval1, CEval2, CEval3, CEval4, CEval5 + //CEval6, CEval7, CEval8 );*/ + + /*fprintf( out_control->eval, + "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", + -p_ijk->dcos_di[0]/sin_theta, + -p_ijk->dcos_di[1]/sin_theta, + -p_ijk->dcos_di[2]/sin_theta, + -p_ijk->dcos_dj[0]/sin_theta, + -p_ijk->dcos_dj[1]/sin_theta, + -p_ijk->dcos_dj[2]/sin_theta, + -p_ijk->dcos_dk[0]/sin_theta, + -p_ijk->dcos_dk[1]/sin_theta, + -p_ijk->dcos_dk[2]/sin_theta );*/ + + /* fprintf( out_control->epen, + "%23.15e%23.15e%23.15e\n", + CEpen1, CEpen2, CEpen3 ); + fprintf( out_control->epen, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + workspace->orig_id[k], RAD2DEG(theta), + BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ + + fprintf( out_control->ecoa, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], + workspace->orig_id[j], + workspace->orig_id[k], + RAD2DEG(theta), BOA_ij, BOA_jk, + e_coa, data->E_Coa ); +#endif + +#ifdef TEST_FORCES /* angle forces */ + Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); + Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); + Add_dDelta( system, lists, + j, CEval3 + CEval7, workspace->f_ang ); + + for( t = start_j; t < end_j; ++t ) { + pbond_jt = &( bond_list[t] ); + bo_jt = &(pbond_jt->bo_data); + temp_bo_jt = bo_jt->BO; + temp = CUBE( temp_bo_jt ); + pBOjt7 = temp * temp * temp_bo_jt; + + Add_dBO( system, lists, j, t, pBOjt7 * CEval6, + workspace->f_ang ); + Add_dBOpinpi2( system, lists, j, t, + CEval5, CEval5, + workspace->f_ang, workspace->f_ang ); + } + + rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); + rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); + rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); + /* end angle forces */ + + /* penalty forces */ + Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); + Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); + Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); + /* end penalty forces */ + + /* coalition forces */ + Add_dBO( system, lists, + j, pi, CEcoa1-CEcoa4, 
workspace->f_coa ); + Add_dBO( system, lists, + j, pk, CEcoa2-CEcoa5, workspace->f_coa ); + Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); + Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); + Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); + /* end coalition forces */ +#endif + } + } + } + } + } + + Set_End_Index(pi, num_thb_intrs, thb_intrs ); + } + } + + if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { + workspace->realloc.num_3body = num_thb_intrs; + if( num_thb_intrs > thb_intrs->num_intrs ) { + fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", + data->step, num_thb_intrs, thb_intrs->num_intrs ); + exit( INSUFFICIENT_SPACE ); + } + } + + //fprintf( stderr,"%d: Number of angle interactions: %d\n", + // data->step, num_thb_intrs ); +#ifdef TEST_ENERGY + fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); + + fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", + data->E_Ang, data->E_Pen, data->E_Coa ); + + fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); +#endif +} + + +void Hydrogen_Bonds( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, k, pi, pk, itr, top; + int type_i, type_j, type_k; + int start_j, end_j, hb_start_j, hb_end_j; + int hblist[MAX_BONDS]; + int num_hb_intrs = 0; + real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; + real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; + rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; + rvec dvec_jk, force, ext_press; + ivec rel_jk; + // rtensor temp_rtensor, total_rtensor; + hbond_parameters *hbp; + bond_order_data *bo_ij; + bond_data *pbond_ij; + far_neighbor_data *nbr_jk; + list *bonds, *hbonds; + bond_data *bond_list; + hbond_data *hbond_list; + + bonds = (*lists) + BONDS; + bond_list = bonds->select.bond_list; + + hbonds = (*lists) + HBONDS; + hbond_list = hbonds->select.hbond_list; + + /* loops below discover the Hydrogen bonds between i-j-k triplets. + here j is H atom and there has to be some bond between i and j. + Hydrogen bond is between j and k. 
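+       (j must be flagged as a hydrogen, p_hbond == 1; covalently bonded
+       neighbors i with p_hbond == 2 and BO >= HB_THRESHOLD are gathered
+       into hblist first, and each far-neighbor k of j is then paired
+       against that list.)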
+ so in this function i->X, j->H, k->Z when we map + variables onto the ones in the handout.*/ + for( j = 0; j < system->N; ++j ) + if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H + /*set j's variables */ + type_j = system->atoms[j].type; + start_j = Start_Index(j, bonds); + end_j = End_Index(j, bonds); + hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); + hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); + + top = 0; + for( pi = start_j; pi < end_j; ++pi ) { + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + bo_ij = &(pbond_ij->bo_data); + type_i = system->atoms[i].type; + + if( system->reaxprm.sbp[type_i].p_hbond == 2 && + bo_ij->BO >= HB_THRESHOLD ) + hblist[top++] = pi; + } + + // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", + // j, top, hb_start_j, hb_end_j ); + + for( pk = hb_start_j; pk < hb_end_j; ++pk ) { + /* set k's varibles */ + k = hbond_list[pk].nbr; + type_k = system->atoms[k].type; + nbr_jk = hbond_list[pk].ptr; + r_jk = nbr_jk->d; + rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); + + for( itr=0; itr < top; ++itr ) { + pi = hblist[itr]; + pbond_ij = &( bond_list[pi] ); + i = pbond_ij->nbr; + + if( i != k ) { + bo_ij = &(pbond_ij->bo_data); + type_i = system->atoms[i].type; + r_ij = pbond_ij->d; + hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, system->reaxprm.num_atom_types) ]); + ++num_hb_intrs; + + Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &theta, &cos_theta ); + /* the derivative of cos(theta) */ + Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + &dcos_theta_di, &dcos_theta_dj, + &dcos_theta_dk ); + + /* hydrogen bond energy*/ + sin_theta2 = SIN( theta/2.0 ); + sin_xhz4 = SQR(sin_theta2); + sin_xhz4 *= sin_xhz4; + cos_xhz1 = ( 1.0 - cos_theta ); + exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); + exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + + r_jk / hbp->r0_hb - 2.0 ) ); + + data->E_HB += e_hb = + hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; + + CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; + CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; + CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + + 1.0 / hbp->r0_hb); + + /* hydrogen bond forces */ + bo_ij->Cdbo += CEhb1; // dbo term + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { + rvec_ScaledAdd( system->atoms[i].f, + +CEhb2, dcos_theta_di ); //dcos terms + rvec_ScaledAdd( system->atoms[j].f, + +CEhb2, dcos_theta_dj ); + + + + + //TODO + rvec_ScaledAdd( system->atoms[k].f, + +CEhb2, dcos_theta_dk ); + + //dr terms + rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk ); + + + //TODO + rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk ); + } + else + { + /* for pressure coupling, terms that are not related + to bond order derivatives are added directly into + pressure vector/tensor */ + rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms + rvec_Add( system->atoms[i].f, force ); + rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); + rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); + + rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj ); + + ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); + rvec_Scale( force, +CEhb2, dcos_theta_dk ); + + + + //TODO + rvec_Add( system->atoms[k].f, force ); + + + + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); + + //dr terms + rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk 
); + + rvec_Scale( force, CEhb3/r_jk, dvec_jk ); + rvec_Add( system->atoms[k].f, force ); + rvec_iMultiply( ext_press, rel_jk, force ); + rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); + + /* This part is intended for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, + dcos_theta_di, system->atoms[i].x ); + rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj, + -CEhb3/r_jk, pbond_jk->dvec ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[j].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk, + +CEhb3/r_jk, pbond_jk->dvec ); + rvec_OuterProduct( temp_rtensor, + temp_rvec, system->atoms[k].x ); + rtensor_Add( total_rtensor, temp_rtensor ); + + if( pbond_ij->imaginary || pbond_jk->imaginary ) + rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); + else + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } + +#ifdef TEST_ENERGY + /*fprintf( out_control->ehb, + "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n", + dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], + dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], + dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); + fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n", + CEhb1, CEhb2, CEhb3 ); */ + fprintf( stderr, //out_control->ehb, + "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", + workspace->orig_id[i], + workspace->orig_id[j], + workspace->orig_id[k], + r_jk, theta, bo_ij->BO, e_hb, data->E_HB ); + +#endif +#ifdef TEST_FORCES + // dbo term + Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); + // dcos terms + rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); + rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj ); + rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk ); + // dr terms + rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); + rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk ); +#endif + } + } + } + } + + /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", + data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ + +#ifdef TEST_FORCES + fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs ); + fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB ); +#endif +} diff --git a/PuReMD-GPU/src/three_body_interactions.cu b/PuReMD-GPU/src/three_body_interactions.cu deleted file mode 100644 index c2eed63bc52c8e5f3db040d74d9d8d083478ec82..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/three_body_interactions.cu +++ /dev/null @@ -1,2462 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. 
- ----------------------------------------------------------------------*/ - -#include "three_body_interactions.h" -#include "bond_orders.h" -#include "list.h" -#include "lookup.h" -#include "vector.h" -#include "index_utils.h" - -#include "cuda_helpers.h" - - -/* calculates the theta angle between i-j-k */ -HOST_DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, - real *theta, real *cos_theta ) -{ - (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk ); - if( *cos_theta > 1. ) *cos_theta = 1.0; - if( *cos_theta < -1. ) *cos_theta = -1.0; - - (*theta) = ACOS( *cos_theta ); -} - - -/* calculates the derivative of the cosine of the angle between i-j-k */ -HOST_DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, - rvec* dcos_theta_di, rvec* dcos_theta_dj, - rvec* dcos_theta_dk ) -{ - int t; - real sqr_d_ji = SQR(d_ji); - real sqr_d_jk = SQR(d_jk); - real inv_dists = 1.0 / (d_ji * d_jk); - real inv_dists3 = POW( inv_dists, 3 ); - real dot_dvecs = Dot( dvec_ji, dvec_jk, 3 ); - real Cdot_inv3 = dot_dvecs * inv_dists3; - - for( t = 0; t < 3; ++t ) { - (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - - Cdot_inv3 * sqr_d_jk * dvec_ji[t]; - - (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists + - Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] ); - - (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - - Cdot_inv3 * sqr_d_ji * dvec_jk[t]; - } - - /*fprintf( stderr, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - dvec_jk[t] * inv_dists*/ -} - - - - - - - - - - - - -/* this is a 3-body interaction in which the main role is - played by j which sits in the middle of the other two. */ -void Three_Body_Interactions( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j, start_pk, end_pk; - int flag, cnt, num_thb_intrs; - - real temp, temp_bo_jt, pBOjt7; - real p_val1, p_val2, p_val3, p_val4, p_val5; - real p_val6, p_val7, p_val8, p_val9, p_val10; - real p_pen1, p_pen2, p_pen3, p_pen4; - real p_coa1, p_coa2, p_coa3, p_coa4; - real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; - real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; - real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; - real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; - real CEpen1, CEpen2, CEpen3; - real e_ang, e_coa, e_pen; - real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; - real Cf7ij, Cf7jk, Cf8j, Cf9j; - real f7_ij, f7_jk, f8_Dj, f9_Dj; - real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; - real r_ij, r_jk; - real BOA_ij, BOA_jk; - real vlpadj; - rvec force, ext_press; - // rtensor temp_rtensor, total_rtensor; - real *total_bo; - three_body_header *thbh; - three_body_parameters *thbp; - three_body_interaction_data *p_ijk, *p_kji; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - bond_order_data *bo_ij, *bo_jk, *bo_jt; - list *bonds, *thb_intrs; - bond_data *bond_list; - three_body_interaction_data *thb_list; - - total_bo = workspace->total_bond_order; - bonds = (*lists) + BONDS; - bond_list = bonds->select.bond_list; - thb_intrs = (*lists) + THREE_BODIES; - thb_list = thb_intrs->select.three_body_list; - - /* global parameters used in these calculations */ - p_val6 = system->reaxprm.gp.l[14]; - p_val8 = system->reaxprm.gp.l[33]; - p_val9 = system->reaxprm.gp.l[16]; - p_val10 = 
system->reaxprm.gp.l[17]; - num_thb_intrs = 0; - - for( j = 0; j < system->N; ++j ) { - // fprintf( out_control->eval, "j: %d\n", j ); - type_j = system->atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - p_val3 = system->reaxprm.sbp[ type_j ].p_val3; - p_val5 = system->reaxprm.sbp[ type_j ].p_val5; - - SBOp = 0, prod_SBO = 1; - for( t = start_j; t < end_j; ++t ) { - bo_jt = &(bond_list[t].bo_data); - SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); - temp = SQR( bo_jt->BO ); - temp *= temp; - temp *= temp; - prod_SBO *= EXP( -temp ); - } - - /* modifications to match Adri's code - 09/01/09 */ - if( workspace->vlpex[j] >= 0 ){ - vlpadj = 0; - dSBO2 = prod_SBO - 1; - } - else{ - vlpadj = workspace->nlp[j]; - dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); - } - - SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); - dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); - - if( SBO <= 0 ) - SBO2 = 0, CSBO2 = 0; - else if( SBO > 0 && SBO <= 1 ) { - SBO2 = POW( SBO, p_val9 ); - CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); - } - else if( SBO > 1 && SBO < 2 ) { - SBO2 = 2 - POW( 2-SBO, p_val9 ); - CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); - } - else - SBO2 = 2, CSBO2 = 0; - - expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); - - /* unlike 2-body intrs where we enforce i<j, we cannot put any such - restrictions here. such a restriction would prevent us from producing - all 4-body intrs correctly */ - for( pi = start_j; pi < end_j; ++pi ) { - Set_Start_Index( pi, num_thb_intrs, thb_intrs ); - - pbond_ij = &(bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; - - - if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = system->atoms[i].type; - // fprintf( out_control->eval, "i: %d\n", i ); - - - /* first copy 3-body intrs from previously computed ones where i>k. -IMPORTANT: if it is less costly to compute theta and its -derivative, we should definitely re-compute them, -instead of copying! 
-in the second for-loop below, we compute only new 3-body intrs -where i < k */ - for( pk = start_j; pk < pi; ++pk ) { - // fprintf( out_control->eval, "pk: %d\n", pk ); - start_pk = Start_Index( pk, thb_intrs ); - end_pk = End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_list[t].thb == i ) { - p_ijk = &(thb_list[num_thb_intrs]); - p_kji = &(thb_list[t]); - - p_ijk->thb = bond_list[pk].nbr; - p_ijk->pthb = pk; - p_ijk->theta = p_kji->theta; - rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); - rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); - rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); - - //if (j == 12) - //fprintf (stderr, "Adding one for matched atom %d \n", i); - - ++num_thb_intrs; - break; - } - } - - - /* and this is the second for loop mentioned above */ - for( pk = pi+1; pk < end_j; ++pk ) { - pbond_jk = &(bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; - k = pbond_jk->nbr; - type_k = system->atoms[k].type; - p_ijk = &( thb_list[num_thb_intrs] ); - - //TODO - CHANGE ORIGINAL - if (BOA_jk <= 0) continue; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &theta, &cos_theta ); - - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &(p_ijk->dcos_di), &(p_ijk->dcos_dj), - &(p_ijk->dcos_dk) ); - - p_ijk->thb = k; - p_ijk->pthb = pk; - p_ijk->theta = theta; - - //if (j == 12) - //fprintf (stderr, "Adding one for the rest %d \n", k); - - sin_theta = SIN( theta ); - if( sin_theta < 1.0e-5 ) - sin_theta = 1.0e-5; - - ++num_thb_intrs; - - - if( BOA_jk > 0.0 && - (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { - r_jk = pbond_jk->d; - thbh = &( system->reaxprm.thbp[ index_thbp (type_i,type_j,type_k,&system->reaxprm) ] ); - flag = 0; - - /* if( workspace->orig_id[i] < workspace->orig_id[k] ) - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); - else - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[k], workspace->orig_id[j], - workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ - - - for( cnt = 0; cnt < thbh->cnt; ++cnt ) { - // fprintf( out_control->eval, - // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); - - if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { - thbp = &( thbh->prm[cnt] ); - - /* ANGLE ENERGY */ - p_val1 = thbp->p_val1; - p_val2 = thbp->p_val2; - p_val4 = thbp->p_val4; - p_val7 = thbp->p_val7; - theta_00 = thbp->theta_00; - - exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); - f7_ij = 1.0 - exp3ij; - Cf7ij = p_val3 * p_val4 * - POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; - - exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); - f7_jk = 1.0 - exp3jk; - Cf7jk = p_val3 * p_val4 * - POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; - - expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); - trm8 = 1.0 + expval6 + expval7; - f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); - Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * - (p_val6 * expval6 * trm8 - - (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); - - theta_0 = 180.0 - - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); - theta_0 = DEG2RAD( theta_0 ); - - expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); - if( p_val1 >= 0 ) - expval12theta = p_val1 * (1.0 - expval2theta); - else // To avoid linear Me-H-Me angles (6/6/06) - expval12theta = p_val1 * -expval2theta; - - CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; - CEval2 = Cf7jk * f7_ij * f8_Dj * 
expval12theta; - CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; - CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * - expval2theta * (theta_0 - theta); - - Ctheta_0 = p_val10 * DEG2RAD(theta_00) * - exp( -p_val10 * (2.0 - SBO2) ); - - CEval5 = -CEval4 * Ctheta_0 * CSBO2; - CEval6 = CEval5 * dSBO1; - CEval7 = CEval5 * dSBO2; - CEval8 = -CEval4 / sin_theta; - - data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; - /* END ANGLE ENERGY*/ - - - /* PENALTY ENERGY */ - p_pen1 = thbp->p_pen1; - p_pen2 = system->reaxprm.gp.l[19]; - p_pen3 = system->reaxprm.gp.l[20]; - p_pen4 = system->reaxprm.gp.l[21]; - - exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); - exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); - exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); - exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); - trm_pen34 = 1.0 + exp_pen3 + exp_pen4; - f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; - Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + - p_pen4 * exp_pen4 )) / - SQR( trm_pen34 ); - - data->E_Pen += e_pen = - p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; - - CEpen1 = e_pen * Cf9j / f9_Dj; - temp = -2.0 * p_pen2 * e_pen; - CEpen2 = temp * (BOA_ij - 2.0); - CEpen3 = temp * (BOA_jk - 2.0); - /* END PENALTY ENERGY */ - - - /* COALITION ENERGY */ - p_coa1 = thbp->p_coa1; - p_coa2 = system->reaxprm.gp.l[2]; - p_coa3 = system->reaxprm.gp.l[38]; - p_coa4 = system->reaxprm.gp.l[30]; - - exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); - data->E_Coa += e_coa = - p_coa1 / (1. + exp_coa2) * - EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * - EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * - EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * - EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); - - CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; - CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; - CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); - CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; - CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; - /* END COALITION ENERGY */ - - /* FORCES */ - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)); - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)); - workspace->CdDelta[j] += ((CEval3 + CEval7) + - CEpen1 + CEcoa3); - workspace->CdDelta[i] += CEcoa4; - workspace->CdDelta[k] += CEcoa5; - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - // fprintf( out_control->eval, "%6d%12.8f\n", - // workspace->orig_id[ bond_list[t].nbr ], - // (CEval6 * pBOjt7) ); - - bo_jt->Cdbo += (CEval6 * pBOjt7); - bo_jt->Cdbopi += CEval5; - bo_jt->Cdbopi2 += CEval5; - } - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - - rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk ); - - /* - if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j); - if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j); - */ - } - else { - /* terms not related to bond order derivatives - are added directly into - forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - rvec_Add( system->atoms[i].f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, CEval8, p_ijk->dcos_dk 
); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( data->ext_press, ext_press ); - - - /* This part is for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_di, system->atoms[i].x ); - rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dj, system->atoms[j].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dk, system->atoms[k].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - if( pbond_ij->imaginary || pbond_jk->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, - -1.0, total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } - -#ifdef TEST_ENERGY - fprintf( out_control->eval, - //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", - "%6d%6d%6d%23.15e%23.15e%23.15e\n", - i+1, j+1, k+1, - //workspace->orig_id[i]+1, - //workspace->orig_id[j]+1, - //workspace->orig_id[k]+1, - //workspace->Delta_boc[j], - RAD2DEG(theta), /*BOA_ij, BOA_jk, */ - e_ang, data->E_Ang ); - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - p_val3, p_val4, BOA_ij, BOA_jk ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - f7_ij, f7_jk, f8_Dj, expval12theta ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e\n", - CEval1, CEval2, CEval3, CEval4, CEval5 - //CEval6, CEval7, CEval8 );*/ - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - -p_ijk->dcos_di[0]/sin_theta, - -p_ijk->dcos_di[1]/sin_theta, - -p_ijk->dcos_di[2]/sin_theta, - -p_ijk->dcos_dj[0]/sin_theta, - -p_ijk->dcos_dj[1]/sin_theta, - -p_ijk->dcos_dj[2]/sin_theta, - -p_ijk->dcos_dk[0]/sin_theta, - -p_ijk->dcos_dk[1]/sin_theta, - -p_ijk->dcos_dk[2]/sin_theta );*/ - - /* fprintf( out_control->epen, - "%23.15e%23.15e%23.15e\n", - CEpen1, CEpen2, CEpen3 ); - fprintf( out_control->epen, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], RAD2DEG(theta), - BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ - - fprintf( out_control->ecoa, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], - workspace->orig_id[j], - workspace->orig_id[k], - RAD2DEG(theta), BOA_ij, BOA_jk, - e_coa, data->E_Coa ); -#endif - -#ifdef TEST_FORCES /* angle forces */ - Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); - Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); - Add_dDelta( system, lists, - j, CEval3 + CEval7, workspace->f_ang ); - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - Add_dBO( system, lists, j, t, pBOjt7 * CEval6, - workspace->f_ang ); - Add_dBOpinpi2( system, lists, j, t, - CEval5, CEval5, - workspace->f_ang, workspace->f_ang ); - } - - rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); - /* end angle forces */ - - /* penalty forces */ - Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); - Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); - Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); - /* end penalty forces */ - - /* coalition forces */ - Add_dBO( system, lists, - j, pi, CEcoa1-CEcoa4, 
workspace->f_coa ); - Add_dBO( system, lists, - j, pk, CEcoa2-CEcoa5, workspace->f_coa ); - Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); - Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); - Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); - /* end coalition forces */ -#endif - } - } - } - } - } - - Set_End_Index(pi, num_thb_intrs, thb_intrs ); - } - } - - if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { - workspace->realloc.num_3body = num_thb_intrs; - if( num_thb_intrs > thb_intrs->num_intrs ) { - fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", - data->step, num_thb_intrs, thb_intrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - } - - //fprintf( stderr,"%d: Number of angle interactions: %d\n", - // data->step, num_thb_intrs ); -#ifdef TEST_ENERGY - fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); - - fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", - data->E_Ang, data->E_Pen, data->E_Coa ); - - fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); -#endif -} - -////////////////////////////////////////////////////////////////////// -//Cuda Function for the Three body interactions. -////////////////////////////////////////////////////////////////////// - - - -/* this is a 3-body interaction in which the main role is - played by j which sits in the middle of the other two. */ -GLOBAL void Three_Body_Interactions( reax_atom *atoms, - single_body_parameters *sbp, - three_body_header *d_thbp, - global_parameters g_params, - control_params *control, - simulation_data *data, - static_storage p_workspace, - list p_bonds, list p_thb_intrs, - int N, int num_atom_types, - real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press ) -{ - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j, start_pk, end_pk; - int flag, cnt, num_thb_intrs; - - real temp, temp_bo_jt, pBOjt7; - real p_val1, p_val2, p_val3, p_val4, p_val5; - real p_val6, p_val7, p_val8, p_val9, p_val10; - real p_pen1, p_pen2, p_pen3, p_pen4; - real p_coa1, p_coa2, p_coa3, p_coa4; - real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk; - real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2; - real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO; - real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8; - real CEpen1, CEpen2, CEpen3; - real e_ang, e_coa, e_pen; - real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5; - real Cf7ij, Cf7jk, Cf8j, Cf9j; - real f7_ij, f7_jk, f8_Dj, f9_Dj; - real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta; - real r_ij, r_jk; - real BOA_ij, BOA_jk; - real vlpadj; - rvec force, ext_press; - // rtensor temp_rtensor, total_rtensor; - real *total_bo; - three_body_header *thbh; - three_body_parameters *thbp; - three_body_interaction_data *p_ijk, *p_kji; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - bond_order_data *bo_ij, *bo_jk, *bo_jt; - list *bonds, *thb_intrs; - bond_data *bond_list; - three_body_interaction_data *thb_list; - static_storage *workspace = &p_workspace; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - - total_bo = workspace->total_bond_order; - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - thb_intrs = &p_thb_intrs; - thb_list = thb_intrs->select.three_body_list; - - /* global parameters used in these calculations */ - p_val6 = g_params.l[14]; - p_val8 = g_params.l[33]; - p_val9 = g_params.l[16]; - 
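/* A note on the launch geometry: this kernel maps one CUDA thread to one
   central atom j (j = blockIdx.x * blockDim.x + threadIdx.x above) and guards
   with "if (j >= N) return;", so any grid that covers all N atoms is valid.
   A minimal host-side launch sketch, with an assumed block size that is not
   taken from this patch, would be:

       int block_size = 256;                           // illustrative choice
       int blocks = (N + block_size - 1) / block_size;
       Three_Body_Interactions<<< blocks, block_size >>>( ...args above... );
       cudaDeviceSynchronize();
*/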
p_val10 = g_params.l[17]; - - //TODO check this, initially this was zero, - // I am changing it to the starting index for this atom. - //num_thb_intrs = j * MAX_TH_BODY; - - //for( j = 0; j < system->N; ++j ) { - // fprintf( out_control->eval, "j: %d\n", j ); - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - p_val3 = sbp[ type_j ].p_val3; - p_val5 = sbp[ type_j ].p_val5; - - SBOp = 0, prod_SBO = 1; - for( t = start_j; t < end_j; ++t ) { - bo_jt = &(bond_list[t].bo_data); - SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2); - temp = SQR( bo_jt->BO ); - temp *= temp; - temp *= temp; - prod_SBO *= EXP( -temp ); - } - - /* modifications to match Adri's code - 09/01/09 */ - if( workspace->vlpex[j] >= 0 ){ - vlpadj = 0; - dSBO2 = prod_SBO - 1; - } - else{ - vlpadj = workspace->nlp[j]; - dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]); - } - - SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj); - dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj ); - - if( SBO <= 0 ) - SBO2 = 0, CSBO2 = 0; - else if( SBO > 0 && SBO <= 1 ) { - SBO2 = POW( SBO, p_val9 ); - CSBO2 = p_val9 * POW( SBO, p_val9 - 1 ); - } - else if( SBO > 1 && SBO < 2 ) { - SBO2 = 2 - POW( 2-SBO, p_val9 ); - CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 ); - } - else - SBO2 = 2, CSBO2 = 0; - - expval6 = EXP( p_val6 * workspace->Delta_boc[j] ); - - /* unlike 2-body intrs where we enforce i<j, we cannot put any such - restrictions here. such a restriction would prevent us from producing - all 4-body intrs correctly */ - for( pi = start_j; pi < end_j; ++pi ) { - - //TODO - //num_thb_intrs = pi * MAX_THREE_BODIES; - //TODO - - //Set_Start_Index( pi, num_thb_intrs, thb_intrs ); - num_thb_intrs = Start_Index (pi, thb_intrs); - - pbond_ij = &(bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; - - - if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = atoms[i].type; - // fprintf( out_control->eval, "i: %d\n", i ); - - - /* first copy 3-body intrs from previously computed ones where i>k. -IMPORTANT: if it is less costly to compute theta and its -derivative, we should definitely re-compute them, -instead of copying! 
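(The copy works because the angle is symmetric in its end atoms: a triple
(i, j, k) found earlier while scanning bond pk was stored as (k, j, i), so
theta is identical and only the end-atom derivatives swap roles. That is
exactly what the rvec_Copy calls below do: dcos_di takes the stored dcos_dk,
dcos_dk takes the stored dcos_di, and dcos_dj is copied unchanged. For a
water molecule, for instance, the H1-O-H2 angle recorded while scanning O-H1
is simply reused when scanning O-H2.)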
-in the second for-loop below, we compute only new 3-body intrs -where i < k */ - for( pk = start_j; pk < pi; ++pk ) { - // fprintf( out_control->eval, "pk: %d\n", pk ); - start_pk = Start_Index( pk, thb_intrs ); - end_pk = End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_list[t].thb == i ) { - p_ijk = &(thb_list[num_thb_intrs]); - p_kji = &(thb_list[t]); - - p_ijk->thb = bond_list[pk].nbr; - p_ijk->pthb = pk; - p_ijk->theta = p_kji->theta; - rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); - rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); - rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); - - ++num_thb_intrs; - break; - } - } - - - /* and this is the second for loop mentioned above */ - for( pk = pi+1; pk < end_j; ++pk ) { - pbond_jk = &(bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; - k = pbond_jk->nbr; - type_k = atoms[k].type; - p_ijk = &( thb_list[num_thb_intrs] ); - - //CHANGE ORIGINAL - if (BOA_jk <= 0) continue; - //CHANGE ORIGINAL - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &theta, &cos_theta ); - - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, - pbond_jk->dvec, pbond_jk->d, - &(p_ijk->dcos_di), &(p_ijk->dcos_dj), - &(p_ijk->dcos_dk) ); - - p_ijk->thb = k; - p_ijk->pthb = pk; - p_ijk->theta = theta; - - sin_theta = SIN( theta ); - if( sin_theta < 1.0e-5 ) - sin_theta = 1.0e-5; - - ++num_thb_intrs; - - - if( BOA_jk > 0.0 && - (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) { - r_jk = pbond_jk->d; - thbh = &( d_thbp[ index_thbp (type_i,type_j,type_k,num_atom_types) ] ); - flag = 0; - - /* if( workspace->orig_id[i] < workspace->orig_id[k] ) - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta ); - else - fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - workspace->orig_id[k], workspace->orig_id[j], - workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ - - //TODO: - //pbond_jk->scratch = thbh->cnt; - - for( cnt = 0; cnt < thbh->cnt; ++cnt ) { - // fprintf( out_control->eval, - // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 ); - - if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) { - thbp = &( thbh->prm[cnt] ); - - /* ANGLE ENERGY */ - p_val1 = thbp->p_val1; - p_val2 = thbp->p_val2; - p_val4 = thbp->p_val4; - p_val7 = thbp->p_val7; - theta_00 = thbp->theta_00; - - exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) ); - f7_ij = 1.0 - exp3ij; - Cf7ij = p_val3 * p_val4 * - POW( BOA_ij, p_val4 - 1.0 ) * exp3ij; - - exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) ); - f7_jk = 1.0 - exp3jk; - Cf7jk = p_val3 * p_val4 * - POW( BOA_jk, p_val4 - 1.0 ) * exp3jk; - - expval7 = EXP( -p_val7 * workspace->Delta_boc[j] ); - trm8 = 1.0 + expval6 + expval7; - f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 ); - Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) * - (p_val6 * expval6 * trm8 - - (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 )); - - theta_0 = 180.0 - - theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2))); - theta_0 = DEG2RAD( theta_0 ); - - expval2theta = EXP(-p_val2 * SQR(theta_0-theta)); - if( p_val1 >= 0 ) - expval12theta = p_val1 * (1.0 - expval2theta); - else // To avoid linear Me-H-Me angles (6/6/06) - expval12theta = p_val1 * -expval2theta; - - CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta; - CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta; - CEval3 = Cf8j * f7_ij * f7_jk * expval12theta; - CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * - 
expval2theta * (theta_0 - theta); - - Ctheta_0 = p_val10 * DEG2RAD(theta_00) * - exp( -p_val10 * (2.0 - SBO2) ); - - CEval5 = -CEval4 * Ctheta_0 * CSBO2; - CEval6 = CEval5 * dSBO1; - CEval7 = CEval5 * dSBO2; - CEval8 = -CEval4 / sin_theta; - - e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; - //PERFORMANCE IMPACT - //atomicAdd (&data->E_Ang, e_ang); - E_Ang [j] += e_ang; - /* END ANGLE ENERGY*/ - - - /* PENALTY ENERGY */ - p_pen1 = thbp->p_pen1; - p_pen2 = g_params.l[19]; - p_pen3 = g_params.l[20]; - p_pen4 = g_params.l[21]; - - exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) ); - exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) ); - exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] ); - exp_pen4 = EXP( p_pen4 * workspace->Delta[j] ); - trm_pen34 = 1.0 + exp_pen3 + exp_pen4; - f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34; - Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - - (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + - p_pen4 * exp_pen4 )) / - SQR( trm_pen34 ); - - e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk; - //PERFORMANCE IMPACT - //atomicAdd (&data->E_Pen, e_pen); - E_Pen [j] += e_pen; - - - CEpen1 = e_pen * Cf9j / f9_Dj; - temp = -2.0 * p_pen2 * e_pen; - CEpen2 = temp * (BOA_ij - 2.0); - CEpen3 = temp * (BOA_jk - 2.0); - /* END PENALTY ENERGY */ - - - /* COALITION ENERGY */ - p_coa1 = thbp->p_coa1; - p_coa2 = g_params.l[2]; - p_coa3 = g_params.l[38]; - p_coa4 = g_params.l[30]; - - exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] ); - e_coa = - p_coa1 / (1. + exp_coa2) * - EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * - EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * - EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * - EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); - - //PERFORMANCE IMPACT - //atomicAdd (&data->E_Coa, e_coa); - E_Coa [j] += e_coa; - - CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; - CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa; - CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2); - CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa; - CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa; - /* END COALITION ENERGY */ - - /* FORCES */ - /* - atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ); - atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ); - atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) ); - atomicAdd (&workspace->CdDelta[i], CEcoa4 ); - atomicAdd (&workspace->CdDelta[k], CEcoa5 ); - */ - - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ; - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ; - workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ; - //atomicAdd (&workspace->CdDelta[i], CEcoa4 ); - pbond_ij->CdDelta_ij += CEcoa4 ; - //atomicAdd (&workspace->CdDelta[k], CEcoa5 ); - pbond_jk->CdDelta_ij += CEcoa5; - - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - // fprintf( out_control->eval, "%6d%12.8f\n", - // workspace->orig_id[ bond_list[t].nbr ], - // (CEval6 * pBOjt7) ); - - /* - atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) ); - atomicAdd (&bo_jt->Cdbopi, CEval5 ); - atomicAdd (&bo_jt->Cdbopi2, CEval5 ); - */ - bo_jt->Cdbo += (CEval6 * pBOjt7) ; - bo_jt->Cdbopi += CEval5 ; - bo_jt->Cdbopi2 += CEval5 ; - } - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - /* - atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di ); - atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk ); - */ - 
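/* Rather than the atomicAdd/atomic_rvecScaledAdd calls sketched above, the
   contributions destined for the end atoms i and k are staged on the bond
   entries (pbond_ij->f, pbond_ij->CdDelta_ij, and likewise for pbond_jk),
   which this thread owns exclusively since both bonds live in j's bond list.
   Three_Body_Interactions_results further below then folds the staged values
   back per atom through the symmetric bond index, roughly:

       for each bond pj in atom i's list:
           sym = bond_list[ bond_list[pj].sym_index ];
           CdDelta[i] += sym.CdDelta_ij;
           atoms[i].f += sym.f;

   trading one extra gather kernel for atomics-free inner loops (the
   "PERFORMANCE IMPACT" flagged above). */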
rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk ); - - - } - else { - /* terms not related to bond order derivatives - are added directly into - forces and pressure vector/tensor */ - rvec_Scale( force, CEval8, p_ijk->dcos_di ); - //atomic_rvecAdd( atoms[i].f, force ); - rvec_Add( pbond_ij->f, force ); - - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //atomic_rvecAdd( data->ext_press, ext_press ); - rvec_Add( aux_ext_press [j], ext_press ); - - //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj ); - - rvec_Scale( force, CEval8, p_ijk->dcos_dk ); - //atomic_rvecAdd( atoms[k].f, force ); - rvec_Add( pbond_jk->f, force ); - rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - //atomic_rvecAdd( data->ext_press, ext_press ); - rvec_Add( aux_ext_press [j], ext_press ); - - - /* This part is for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_di, system->atoms[i].x ); - rtensor_Scale( total_rtensor, +CEval8, temp_rtensor ); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dj, system->atoms[j].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - rvec_OuterProduct( temp_rtensor, - p_ijk->dcos_dk, system->atoms[k].x ); - rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor); - - if( pbond_ij->imaginary || pbond_jk->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, - -1.0, total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } - -#ifdef TEST_ENERGY - //TODO -- check this - // fprintf( out_control->eval, - //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e", - // "%6d%6d%6d%23.15e%23.15e%23.15e\n", - // i+1, j+1, k+1, - //workspace->orig_id[i]+1, - //workspace->orig_id[j]+1, - //workspace->orig_id[k]+1, - //workspace->Delta_boc[j], - // RAD2DEG(theta), /*BOA_ij, BOA_jk, */ - // e_ang, data->E_Ang ); - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - p_val3, p_val4, BOA_ij, BOA_jk ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e", - f7_ij, f7_jk, f8_Dj, expval12theta ); - fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e\n", - CEval1, CEval2, CEval3, CEval4, CEval5 - //CEval6, CEval7, CEval8 );*/ - - /*fprintf( out_control->eval, - "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", - -p_ijk->dcos_di[0]/sin_theta, - -p_ijk->dcos_di[1]/sin_theta, - -p_ijk->dcos_di[2]/sin_theta, - -p_ijk->dcos_dj[0]/sin_theta, - -p_ijk->dcos_dj[1]/sin_theta, - -p_ijk->dcos_dj[2]/sin_theta, - -p_ijk->dcos_dk[0]/sin_theta, - -p_ijk->dcos_dk[1]/sin_theta, - -p_ijk->dcos_dk[2]/sin_theta );*/ - - /* fprintf( out_control->epen, - "%23.15e%23.15e%23.15e\n", - CEpen1, CEpen2, CEpen3 ); - fprintf( out_control->epen, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - workspace->orig_id[k], RAD2DEG(theta), - BOA_ij, BOA_jk, e_pen, data->E_Pen ); */ - - // fprintf( out_control->ecoa, - // "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - // workspace->orig_id[i], - // workspace->orig_id[j], - // workspace->orig_id[k], - // RAD2DEG(theta), BOA_ij, BOA_jk, - // e_coa, data->E_Coa ); -#endif - -#ifdef TEST_FORCES /* angle forces */ - //TODO -- check this - /* - Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang ); - Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang ); - Add_dDelta( system, lists, - j, CEval3 + CEval7, workspace->f_ang ); 
- - for( t = start_j; t < end_j; ++t ) { - pbond_jt = &( bond_list[t] ); - bo_jt = &(pbond_jt->bo_data); - temp_bo_jt = bo_jt->BO; - temp = CUBE( temp_bo_jt ); - pBOjt7 = temp * temp * temp_bo_jt; - - Add_dBO( system, lists, j, t, pBOjt7 * CEval6, - workspace->f_ang ); - Add_dBOpinpi2( system, lists, j, t, - CEval5, CEval5, - workspace->f_ang, workspace->f_ang ); - } - - rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk ); - // end angle forces - - // penalty forces - Add_dDelta( system, lists, j, CEpen1, workspace->f_pen ); - Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen ); - Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen ); - // end penalty forces - - // coalition forces - Add_dBO( system, lists, - j, pi, CEcoa1-CEcoa4, workspace->f_coa ); - Add_dBO( system, lists, - j, pk, CEcoa2-CEcoa5, workspace->f_coa ); - Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa ); - Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa ); - Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa ); - // end coalition forces - - */ -#endif - } - } - } - } - } - - Set_End_Index(pi, num_thb_intrs, thb_intrs ); - } - // } // end of the main for loop here - - - //TODO - to be done on the CPU - /* - - if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) { - workspace->realloc.num_3body = num_thb_intrs; - if( num_thb_intrs > thb_intrs->num_intrs ) { - fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d", - data->step, num_thb_intrs, thb_intrs->num_intrs ); - exit( INSUFFICIENT_SPACE ); - } - } - */ - - //fprintf( stderr,"%d: Number of angle interactions: %d\n", - // data->step, num_thb_intrs ); - -#ifdef TEST_ENERGY - /* - fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs ); - - fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n", - data->E_Ang, data->E_Pen, data->E_Coa ); - - fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); - */ -#endif -} - - -GLOBAL void Three_Body_Interactions_results ( reax_atom *atoms, control_params *control, - static_storage p_workspace, - list p_bonds, int N ) -{ - int i, pj; - - bond_data *pbond; - bond_data *sym_index_bond; - list *bonds = &p_bonds; - static_storage *workspace = &p_workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N) return; - - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - - pbond = &(bonds->select.bond_list[pj]); - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - - workspace->CdDelta [i] += sym_index_bond->CdDelta_ij; - - rvec_Add (atoms[i].f, sym_index_bond->f ); - } -} - - - -////////////////////////////////////////////////////////////////////////// -// Three Body Estimation -////////////////////////////////////////////////////////////////////////// - - -/* this is a 3-body interaction in which the main role is - played by j which sits in the middle of the other two. 
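Unlike the full kernel above, Three_Body_Estimate only counts: for each bond
pi of the central atom j it records in count[pi] how many partner bonds pk
survive the thb_cut test. A plausible host-side use, not shown in this patch,
is to copy count[] back (or prefix-sum it on the device) to obtain per-bond
start offsets and a safe total size for the three-body list before launching
the full interaction kernel.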
*/ -GLOBAL void Three_Body_Estimate ( reax_atom *atoms, - control_params *control, - list p_bonds, int N, - int *count) -{ - int i, j, pi, k, pk, t; - int type_i, type_j, type_k; - int start_j, end_j ; - int flag, cnt, num_thb_intrs; - - real r_ij, r_jk; - real BOA_ij, BOA_jk; - list *bonds; - - bond_order_data *bo_ij, *bo_jk, *bo_jt; - bond_data *bond_list; - bond_data *pbond_ij, *pbond_jk, *pbond_jt; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - - - for( pi = start_j; pi < end_j; ++pi ) { - - num_thb_intrs = 0; - count [pi] = 0; - - pbond_ij = &(bond_list[pi]); - bo_ij = &(pbond_ij->bo_data); - BOA_ij = bo_ij->BO - control->thb_cut; - - if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { - i = pbond_ij->nbr; - r_ij = pbond_ij->d; - type_i = atoms[i].type; - - /* - for( pk = start_j; pk < pi; ++pk ) { - start_pk = Start_Index( pk, thb_intrs ); - end_pk = End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_list[t].thb == i ) { - - ++num_thb_intrs; - break; - } - } - */ - - /* and this is the second for loop mentioned above */ - for( pk = start_j; pk < end_j; ++pk ) { - if (pk == pi) continue; - - pbond_jk = &(bond_list[pk]); - bo_jk = &(pbond_jk->bo_data); - BOA_jk = bo_jk->BO - control->thb_cut; - - if (BOA_jk <= 0) continue; - - ++num_thb_intrs; - } - } - - count [pi] = num_thb_intrs; - } -} - - - -////////////////////////////////////////////////////////////////////// -//End here -////////////////////////////////////////////////////////////////////// - - - - - - - - - - - - - - -void Hydrogen_Bonds( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, j, k, pi, pk, itr, top; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int num_hb_intrs = 0; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - ivec rel_jk; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list; - - bonds = (*lists) + BONDS; - bond_list = bonds->select.bond_list; - - hbonds = (*lists) + HBONDS; - hbond_list = hbonds->select.hbond_list; - - /* loops below discover the Hydrogen bonds between i-j-k triplets. - here j is H atom and there has to be some bond between i and j. - Hydrogen bond is between j and k. 
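The energy accumulated per accepted triple below is

    E_hb = p_hb1 * (1 - exp(-p_hb2 * BO_ij))
                 * exp(-p_hb3 * (r0_hb/r_jk + r_jk/r0_hb - 2))
                 * sin^4(theta/2),

which grows with the i-j bond order, peaks when the j-k distance matches
r0_hb, and favors a straight X-H-Z arrangement since sin^4(theta/2) is
maximal at theta = 180 degrees.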
-   so in this function i->X, j->H, k->Z when we map
-   variables onto the ones in the handout.*/
-  for( j = 0; j < system->N; ++j )
-    if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
-      /*set j's variables */
-      type_j = system->atoms[j].type;
-      start_j = Start_Index(j, bonds);
-      end_j = End_Index(j, bonds);
-      hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-      hb_end_j = End_Index ( workspace->hbond_index[j], hbonds );
-
-      top = 0;
-      for( pi = start_j; pi < end_j; ++pi ) {
-        pbond_ij = &( bond_list[pi] );
-        i = pbond_ij->nbr;
-        bo_ij = &(pbond_ij->bo_data);
-        type_i = system->atoms[i].type;
-
-        if( system->reaxprm.sbp[type_i].p_hbond == 2 &&
-            bo_ij->BO >= HB_THRESHOLD )
-          hblist[top++] = pi;
-      }
-
-      // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n",
-      //          j, top, hb_start_j, hb_end_j );
-
-      for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
-        /* set k's variables */
-        k = hbond_list[pk].nbr;
-        type_k = system->atoms[k].type;
-        nbr_jk = hbond_list[pk].ptr;
-        r_jk = nbr_jk->d;
-        rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-
-        for( itr=0; itr < top; ++itr ) {
-          pi = hblist[itr];
-          pbond_ij = &( bond_list[pi] );
-          i = pbond_ij->nbr;
-
-          if( i != k ) {
-            bo_ij = &(pbond_ij->bo_data);
-            type_i = system->atoms[i].type;
-            r_ij = pbond_ij->d;
-            hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, &system->reaxprm) ]);
-            ++num_hb_intrs;
-
-            Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                             &theta, &cos_theta );
-            /* the derivative of cos(theta) */
-            Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                  &dcos_theta_di, &dcos_theta_dj,
-                                  &dcos_theta_dk );
-
-            /* hydrogen bond energy*/
-            sin_theta2 = SIN( theta/2.0 );
-            sin_xhz4 = SQR(sin_theta2);
-            sin_xhz4 *= sin_xhz4;
-            cos_xhz1 = ( 1.0 - cos_theta );
-            exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-            exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk +
-                                           r_jk / hbp->r0_hb - 2.0 ) );
-
-            data->E_HB += e_hb =
-              hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-
-            CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-            CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-            CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) +
-                                          1.0 / hbp->r0_hb);
-
-            /* hydrogen bond forces */
-            bo_ij->Cdbo += CEhb1;   // dbo term
-
-            if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-              rvec_ScaledAdd( system->atoms[i].f,
-                              +CEhb2, dcos_theta_di ); //dcos terms
-              rvec_ScaledAdd( system->atoms[j].f,
-                              +CEhb2, dcos_theta_dj );
-
-              //TODO
-              rvec_ScaledAdd( system->atoms[k].f,
-                              +CEhb2, dcos_theta_dk );
-
-              //dr terms
-              rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
-              //TODO
-              rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
-            }
-            else
-            {
-              /* for pressure coupling, terms that are not related
-                 to bond order derivatives are added directly into
-                 pressure vector/tensor */
-              rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-              rvec_Add( system->atoms[i].f, force );
-              rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-              rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-
-              rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj );
-
-              ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-              rvec_Scale( force, +CEhb2, dcos_theta_dk );
-
-              //TODO
-              rvec_Add( system->atoms[k].f, force );
-
-              rvec_iMultiply( ext_press, rel_jk, force );
-              rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-
-              //dr terms
-              rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - rvec_Add( system->atoms[k].f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - rvec_ScaledAdd( data->ext_press, 1.0, ext_press ); - - /* This part is intended for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, - dcos_theta_di, system->atoms[i].x ); - rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj, - -CEhb3/r_jk, pbond_jk->dvec ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[j].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk, - +CEhb3/r_jk, pbond_jk->dvec ); - rvec_OuterProduct( temp_rtensor, - temp_rvec, system->atoms[k].x ); - rtensor_Add( total_rtensor, temp_rtensor ); - - if( pbond_ij->imaginary || pbond_jk->imaginary ) - rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); - else - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } - -#ifdef TEST_ENERGY - /*fprintf( out_control->ehb, - "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n", - dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], - dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], - dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]); - fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n", - CEhb1, CEhb2, CEhb3 ); */ - fprintf( stderr, //out_control->ehb, - "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", - workspace->orig_id[i], - workspace->orig_id[j], - workspace->orig_id[k], - r_jk, theta, bo_ij->BO, e_hb, data->E_HB ); - -#endif -#ifdef TEST_FORCES - // dbo term - Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); - // dcos terms - rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); - rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk ); - // dr terms - rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk ); -#endif - } - } - } - } - - /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */ - -#ifdef TEST_FORCES - fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs ); - fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB ); -#endif -} - - - - - - -//////////////////////////////////////////////////////////////////// -// Cuda Function -//////////////////////////////////////////////////////////////////// - -GLOBAL void Hydrogen_Bonds ( reax_atom *atoms, - single_body_parameters *sbp, - hbond_parameters *d_hbp, - control_params *control, - simulation_data *data, - static_storage p_workspace, - list p_bonds, list p_hbonds, - int N, int num_atom_types, - real *E_HB, rvec *aux_ext_press, rvec *atoms_f ) -{ - extern __shared__ real t_hb[]; - extern __shared__ real t_f[]; - //extern __shared__ rvec t_cdbo[]; - //extern __shared__ rvec t_hf []; - - real *sh_hb = t_hb; - rvec *sh_atomf = (rvec *)(t_hb + blockDim.x); - //real *sh_cdbo = t_hb + blockDim.x; - //rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x); - - int i, j, k, pi, pk, itr, top; - int type_i, type_j, type_k; - int start_j, end_j, hb_start_j, hb_end_j; - int hblist[MAX_BONDS]; - int num_hb_intrs = 0; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - ivec rel_jk; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - 
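/* Note on the dynamic shared memory declared above: in CUDA, every
   "extern __shared__" array in a kernel aliases the same base address, so
   t_hb and t_f are two views of one buffer and the partitioning is done by
   hand (sh_hb gets the first blockDim.x reals, sh_atomf is carved out right
   behind it). The launch must therefore pass enough bytes for both pieces,
   along the lines of this illustrative sizing, which is not a line from
   this patch:

       size_t smem = block_size * (sizeof(real) + sizeof(rvec));
       Hydrogen_Bonds<<< blocks, block_size, smem >>>( ... );
*/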
bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list, *hbond_jk; - static_storage *workspace = &p_workspace; - - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - - //j = blockIdx.x; - - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - - hbonds = &p_hbonds; - hbond_list = hbonds->select.hbond_list; - - // loops below discover the Hydrogen bonds between i-j-k triplets. - // here j is H atom and there has to be some bond between i and j. - // Hydrogen bond is between j and k. - // so in this function i->X, j->H, k->Z when we map - // variables onto the ones in the handout. - - //for( j = 0; j < system->N; ++j ) - sh_hb [threadIdx.x] = 0; - rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - - if( sbp[atoms[j].type].p_hbond==1) {// j must be H - //set j's variables - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - - if( sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) - hblist[top++] = pi; - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( pk = hb_start_j; pk < hb_end_j; ++pk ) - //pk = hb_start_j + threadIdx.x; - //while (pk < hb_end_j) - { - // set k's varibles - //TODO - hbond_jk = &( hbond_list[pk] ); - //TODO - k = hbond_list[pk].nbr; - type_k = atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; - rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - - //TODO Double check this Hydrogen Bonds fix - //rvec_MakeZero ( nbr_jk->h_f ); - rvec_MakeZero ( hbond_jk->h_f ); - //TODO Double check this Hydrogen Bonds fix - - //sh_hb [threadIdx.x] = 0; - - - //itr = threadIdx.x; - for( itr=0; itr < top; ++itr ) { - //while (itr < top) { - pi = hblist[itr]; - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - - //TODO - //rvec_MakeZero (sh_hf [threadIdx.x]); - //sh_cdbo [threadIdx.x] = 0; - - //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - - - if( i != k ) { - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - // the derivative of cos(theta) - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - // hydrogen bond energy - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / hbp->r0_hb - 2.0 ) ); - - //PERFORMANCE IMPACT - e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //atomicAdd ( &data->E_HB, e_hb ); - //E_HB [j] += e_hb; - sh_hb [threadIdx.x] += e_hb; - - CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + - 1.0 / hbp->r0_hb); - - //this is the problem here - //TODO - // hydrogen bond forces - bo_ij->Cdbo += CEhb1; // dbo term - //sh_cdbo[threadIdx.x] += CEhb1; - //TODO - - 
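/* With one thread per hydrogen atom j, the plain "bo_ij->Cdbo += CEhb1"
   above is race-free: every bond pi in j's list is visited only by this
   thread, and the pk loop merely accumulates into it repeatedly. In the
   cooperative variant Hydrogen_Bonds_HB further below, several lanes share
   one atom, so the same update becomes sh_cdbo[threadIdx.x] += CEhb1
   followed by a lane-level reduction, with a single lane committing the
   sum to bo_ij->Cdbo. */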
- if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - - //PERFORMANCE IMPACT - /* - atomic_rvecScaledAdd( atoms[i].f, - +CEhb2, dcos_theta_di ); //dcos terms - atomic_rvecScaledAdd( atoms[j].f, - +CEhb2, dcos_theta_dj ); - atomic_rvecScaledAdd( atoms[k].f, - +CEhb2, dcos_theta_dk ); - //dr terms - atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); - */ - - //PERFORMANCE IMPACT - rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms - //rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms - - //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); - - //TODO you forgot here - //TODO Hydrogen bonds fix. -- BE VERY CAREFUL ***** - rvec_ScaledAdd( hbond_jk->h_f, - +CEhb2, dcos_theta_dk ); - - //rvec_ScaledAdd( nbr_jk->h_f, - // +CEhb2, dcos_theta_dk ); - - //dr terms - //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); - - //atoms_f [j] ++; - - //TODO you forgot - rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); - //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); - } - else - { - // for pressure coupling, terms that are not related - // to bond order derivatives are added directly into - // pressure vector/tensor - rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - rvec_Add( pbond_ij->h_f, force ); - rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); - - rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - - ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - rvec_Scale( force, +CEhb2, dcos_theta_dk ); - - //rvec_Add( nbr_jk->h_f, force ); - rvec_Add( hbond_jk->h_f, force ); - - rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - //dr terms - rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - - rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - rvec_Add( hbond_jk->h_f, force ); - rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - } - - //do the reduction for the bond_ij here - /* - if (threadIdx.x < 16){ - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); - } - if (threadIdx.x < 8){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); - } - if (threadIdx.x < 4){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); - } - if (threadIdx.x < 2){ - //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); - - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); - } - if (threadIdx.x < 1){ - 
-        //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-        //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
-
-        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-      }
-      if (threadIdx.x == 0){
-        //bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-        //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
-
-        E_HB [j] += sh_hb [threadIdx.x];
-        //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-      }
-      */
-
-      } // i != k if statement
-
-      //itr += blockDim.x;
-
-    } //itr for statement
-
-    /*
-    __syncthreads ();
-
-    for (int x = 1; x < blockDim.x; x++)
-      sh_hb [0] += sh_hb [x];
-
-    E_HB [j] += sh_hb[0];
-    if (threadIdx.x < 16) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-    if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-    if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-    if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-    if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-    if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x];
-    */
-
-    //pk += blockDim.x;
-
-  } // pk for statement
-  } // main if statement
-
-  //do the reduction for the bond_ij here
-  /*
-  if (threadIdx.x < 16){
-    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-  }
-  if (threadIdx.x < 8){
-    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-  }
-  if (threadIdx.x < 4){
-    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-  }
-  if (threadIdx.x < 2){
-    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-  }
-  if (threadIdx.x < 1){
-    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-  }
-  if (threadIdx.x == 0){
-    E_HB [j] += sh_hb [threadIdx.x];
-    //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-  }
-  */
-
-  E_HB [j] += sh_hb [threadIdx.x];
-  rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-
-  //rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
-}
-
-
-DEVICE void warpReduce(volatile real* sdata, int tid)
-{
-  if (tid < 16) sdata[tid] += sdata[tid + 16];
-  if (tid < 8) sdata[tid] += sdata[tid + 8];
-  if (tid < 4) sdata[tid] += sdata[tid + 4];
-  if (tid < 2) sdata[tid] += sdata[tid + 2];
-  if (tid < 1) sdata[tid] += sdata[tid + 1];
-}
-
-
-GLOBAL void Hydrogen_Bonds_HB ( reax_atom *atoms,
-                                single_body_parameters *sbp,
-                                hbond_parameters *d_hbp,
-                                control_params *control,
-                                simulation_data *data,
-                                static_storage p_workspace,
-                                list p_bonds, list p_hbonds,
-                                int N, int num_atom_types,
-                                real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
-{
-  extern __shared__ real t_hb[];
-  extern __shared__ rvec t__f[];
-  extern __shared__ rvec t_cdbo[];
-  extern __shared__ rvec t_hf [];
-
-  real *sh_hb = t_hb;
-  real *sh_cdbo = t_hb + blockDim.x;
-  rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
-  rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
-
-  int __THREADS_PER_ATOM__ = HBONDS_THREADS_PER_ATOM;
-
-  int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int warp_id = thread_id / __THREADS_PER_ATOM__;
-  int lane_id = thread_id & (__THREADS_PER_ATOM__ -1);
-  int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-  if (warp_id >= N ) return;
-
-  int i, j, k, pi, pk, itr, top;
-  int type_i, type_j, type_k;
-  int start_j, end_j, hb_start_j, hb_end_j;
-  int hblist[MAX_BONDS];
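/* warpReduce above, like the unrolled lane reductions used throughout these
   kernels, is a warp-synchronous tree reduction: each step halves the number
   of active lanes (16, 8, 4, 2, 1), and "volatile" keeps the compiler from
   caching sdata in registers between steps. This is only safe while the
   cooperating lanes fit inside one 32-wide warp executing in lock-step
   (pre-Volta behavior); newer architectures would need explicit __syncwarp()
   calls. A typical call site, mirroring the commented-out reductions above:

       warpReduce( sh_hb, threadIdx.x );
       if (threadIdx.x == 0) E_HB[j] += sh_hb[0];
*/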
- int num_hb_intrs = 0; - real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; - real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3; - rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk; - rvec dvec_jk, force, ext_press; - ivec rel_jk; - // rtensor temp_rtensor, total_rtensor; - hbond_parameters *hbp; - bond_order_data *bo_ij; - bond_data *pbond_ij; - far_neighbor_data *nbr_jk; - list *bonds, *hbonds; - bond_data *bond_list; - hbond_data *hbond_list, *hbond_jk; - static_storage *workspace = &p_workspace; - - /* - j = blockIdx.x * blockDim.x + threadIdx.x; - if (j >= N) return; - */ - - // j = blockIdx.x; - - j = warp_id; - - bonds = &p_bonds; - bond_list = bonds->select.bond_list; - - hbonds = &p_hbonds; - hbond_list = hbonds->select.hbond_list; - - // loops below discover the Hydrogen bonds between i-j-k triplets. - // here j is H atom and there has to be some bond between i and j. - // Hydrogen bond is between j and k. - // so in this function i->X, j->H, k->Z when we map - // variables onto the ones in the handout. - - //for( j = 0; j < system->N; ++j ) - sh_hb [threadIdx.x] = 0; - rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - - if( sbp[atoms[j].type].p_hbond==1) {// j must be H - //set j's variables - type_j = atoms[j].type; - start_j = Start_Index(j, bonds); - end_j = End_Index(j, bonds); - hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - top = 0; - for( pi = start_j; pi < end_j; ++pi ) { - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - - if( sbp[type_i].p_hbond == 2 && - bo_ij->BO >= HB_THRESHOLD ) { - hblist[top++] = pi; - } - } - - // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", - // j, top, hb_start_j, hb_end_j ); - - for( itr=0; itr < top; ++itr ) { - pi = hblist[itr]; - pbond_ij = &( bond_list[pi] ); - i = pbond_ij->nbr; - - //TODO - rvec_MakeZero (sh_hf [threadIdx.x]); - sh_cdbo [threadIdx.x] = 0; - - - //for( pk = hb_start_j; pk < hb_end_j; ++pk ) - int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 
0 : 1); - int count = 0; - //jpk = hb_start_j + threadIdx.x; - pk = hb_start_j + lane_id; - //while (pk < hb_end_j) - while (count < loopcount) - { - - if (pk < hb_end_j) - { - // set k's varibles - //TODO - hbond_jk = &( hbond_list[pk] ); - //TODO - k = hbond_list[pk].nbr; - type_k = atoms[k].type; - nbr_jk = hbond_list[pk].ptr; - r_jk = nbr_jk->d; - rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); - } - else k = -1; - - //TODO Double check this Hydrogen Bonds fix - //rvec_MakeZero ( nbr_jk->h_f ); - //rvec_MakeZero ( hbond_jk->h_f ); - //TODO Double check this Hydrogen Bonds fix - - //sh_hb [threadIdx.x] = 0; - //rvec_MakeZero ( sh_atomf[ threadIdx.x] ); - //__syncthreads (); - - - if(( i != k ) && (k != -1)) { - bo_ij = &(pbond_ij->bo_data); - type_i = atoms[i].type; - r_ij = pbond_ij->d; - hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); - ++num_hb_intrs; - - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &theta, &cos_theta ); - // the derivative of cos(theta) - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, - &dcos_theta_di, &dcos_theta_dj, - &dcos_theta_dk ); - - // hydrogen bond energy - sin_theta2 = SIN( theta/2.0 ); - sin_xhz4 = SQR(sin_theta2); - sin_xhz4 *= sin_xhz4; - cos_xhz1 = ( 1.0 - cos_theta ); - exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO ); - exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + - r_jk / hbp->r0_hb - 2.0 ) ); - - //PERFORMANCE IMPACT - e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4; - //atomicAdd ( &data->E_HB, e_hb ); - //E_HB [j] += e_hb; - sh_hb [threadIdx.x] += e_hb; - - CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; - CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; - CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + - 1.0 / hbp->r0_hb); - - //this is the problem here - //TODO - // hydrogen bond forces - //bo_ij->Cdbo += CEhb1; // dbo term - sh_cdbo[threadIdx.x] += CEhb1; - //TODO - //warpReduce (sh_cdbo, threadIdx.x); - //if (threadIdx.x == 0) - // bo_ij->Cdbo += sh_cdbo [0]; - - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { - - //PERFORMANCE IMPACT - /* - atomic_rvecScaledAdd( atoms[i].f, - +CEhb2, dcos_theta_di ); //dcos terms - atomic_rvecScaledAdd( atoms[j].f, - +CEhb2, dcos_theta_dj ); - atomic_rvecScaledAdd( atoms[k].f, - +CEhb2, dcos_theta_dk ); - //dr terms - atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk ); - */ - - //PERFORMANCE IMPACT - //rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms - rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms - - //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj ); - - - //TODO you forgot here - //TODO Hydrogen bonds fix. 
-- BE VERY CAREFUL ***** - rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk ); - - //rvec_ScaledAdd( nbr_jk->h_f, - // +CEhb2, dcos_theta_dk ); - - //dr terms - //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); - - //TODO you forgot - rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk ); - //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk ); - } - else - { - // for pressure coupling, terms that are not related - // to bond order derivatives are added directly into - // pressure vector/tensor - //rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms - //rvec_Add( pbond_ij->h_f, force ); - //rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press ); - - //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj ); - - //ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); - //rvec_Scale( force, +CEhb2, dcos_theta_dk ); - - //rvec_Add( nbr_jk->h_f, force ); - //rvec_Add( hbond_jk->h_f, force ); - - //rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - //dr terms - //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk ); - - //rvec_Scale( force, CEhb3/r_jk, dvec_jk ); - //rvec_Add( hbond_jk->h_f, force ); - //rvec_iMultiply( ext_press, rel_jk, force ); - //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press ); - //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press ); - - } - - } // i != k if statement - - pk += __THREADS_PER_ATOM__; - count ++; - - } // pk for statement - - //__syncthreads (); - - //at this point done with one bond.... - //do the reduction now - //if (threadIdx.x == 0){ - if (lane_id < 16) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]); - } - if (lane_id < 8) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]); - } - if (lane_id < 4) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]); - } - if (lane_id < 2) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]); - } - if (lane_id < 1) { - sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1]; - rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]); - - bo_ij->Cdbo += sh_cdbo [threadIdx.x]; - rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); - } - /* - if (lane_id == 0){ - for (i = 1; i < 32; i++) - { - //sh_cdbo [threadIdx.x] += sh_cdbo [i]; - //rvec_Add (sh_hf [threadIdx.x], sh_hf [i]); - - sh_cdbo [lane_id] += sh_cdbo [lane_id + i]; - rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]); - } - - //bo_ij->Cdbo += sh_cdbo [threadIdx.x]; - //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]); - - bo_ij->Cdbo += sh_cdbo [lane_id]; - rvec_Add (pbond_ij->h_f, sh_hf [lane_id]); - } - */ - - } //itr for statement - - //__syncthreads (); - } // main if statment - - //__syncthreads (); - - - //do the reduction for the bond_ij here - if (lane_id < 16){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] ); - } - if (lane_id < 8){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] ); - } - if (lane_id < 4){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4]; - rvec_Add ( 
sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] ); - } - if (lane_id < 2){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] ); - } - if (lane_id < 1){ - sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1]; - rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] ); - - E_HB [j] += sh_hb [threadIdx.x]; - rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - } - /* - if (lane == 0){ - //E_HB [j] += sh_hb [threadIdx.x]; - rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]); - } - */ - //if (threadIdx.x == 0){ - /* - if (lane_id == 0){ - for (i = 1; i < 32; i++) - { - //sh_hb [threadIdx.x] += sh_hb [i]; - //rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]); - sh_hb [lane_id] += sh_hb [lane_id + i]; - rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]); - } - - //E_HB [j] += sh_hb [threadIdx.x]; - //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - - E_HB [j] += sh_hb [lane_id]; - rvec_Add (atoms[j].f, sh_atomf [lane_id]); - //rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]); - } - */ - - //E_HB [j] += sh_hb [threadIdx.x]; - //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]); - } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - GLOBAL void Hydrogen_Bonds_Postprocess ( reax_atom *atoms, - single_body_parameters *sbp, - static_storage p_workspace, - list p_bonds, list p_hbonds, list p_far_nbrs, int N, - real *e_hb) - { - - int i, pj, hj, nbr, k, j; - int start, end; - - bond_data *pbond; - bond_data *sym_index_bond; - far_neighbor_data *nbr_pj, *sym_index_nbr; - - list *bonds = &p_bonds; - list *far_nbrs = &p_far_nbrs; - - i = blockIdx.x * blockDim.x + threadIdx.x; - - if ( i >= N) return; - - // For processing ij information - start = Start_Index(i, bonds); - end = End_Index(i, bonds); - - //rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f); - - for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ - - pbond = &(bonds->select.bond_list[pj]); - sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); - - rvec_Add (atoms[i].f, sym_index_bond->h_f ); - } - - /* - for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++) - { - // check if the neighbor is of h_type - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - - sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]); - rvec_Add (atoms[i].f, sym_index_nbr->h_f ); - } - */ - - // if (workspace->hbond_index [j] != -1) - // { - // hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); - // hb_end_j = End_Index ( workspace->hbond_index[j], hbonds ); - - // for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) - // { - // h_bond_data = &( hbonds->select.hbond_list [hj] ); - // nbr = h_bond_data->nbr; - - // if (nbr == i) { - // rvec_Add (atoms[i].f, h_bond_data->h_f ); - // } - // } - // } - } - - GLOBAL void Hydrogen_Bonds_Far_Nbrs ( reax_atom *atoms, - single_body_parameters *sbp, - static_storage p_workspace, - list p_bonds, list p_hbonds, list p_far_nbrs, int N ) - { - - extern __shared__ rvec __f[]; - int i, pj,j; - int start, end; - - far_neighbor_data *nbr_pj, *sym_index_nbr; - list *far_nbrs = &p_far_nbrs; - - i = blockIdx.x; - - start = Start_Index (i, far_nbrs); - end = End_Index (i, far_nbrs); - pj = start + threadIdx.x; - - rvec_MakeZero (__f[threadIdx.x]); - - while (pj < end) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - - //sym_index_nbr = & (far_nbrs->select.far_nbr_list[ 
nbr_pj->sym_index ]); - // - //rvec_Add (atoms[i].f, sym_index_nbr->h_f ); - // - //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); - - pj += blockDim.x; - } - - if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); - if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); - if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); - if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); - if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); - - if (threadIdx.x == 0) - rvec_Add (atoms[i].f, __f[0]); - } - - GLOBAL void Hydrogen_Bonds_HNbrs ( reax_atom *atoms, - single_body_parameters *sbp, - static_storage p_workspace, - list p_bonds, list p_hbonds, list p_far_nbrs, int N ) - { - - extern __shared__ rvec __f[]; - int i, pj,j; - int start, end; - - hbond_data *nbr_pj, *sym_index_nbr; - list *hbonds = &p_hbonds; - - i = blockIdx.x; - - start = Start_Index (i, hbonds); - end = End_Index (i, hbonds); - pj = start + threadIdx.x; - - rvec_MakeZero (__f[threadIdx.x]); - - while (pj < end) - { - nbr_pj = &( hbonds->select.hbond_list[pj] ); - j = nbr_pj->nbr; - - sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]); - rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f ); - - pj += blockDim.x; - } - - if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]); - if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]); - if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]); - if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]); - if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]); - - if (threadIdx.x == 0) - rvec_Add (atoms[i].f, __f[0]); - } - diff --git a/PuReMD-GPU/src/three_body_interactions.h b/PuReMD-GPU/src/three_body_interactions.h index 2aa0d4434a7001793a5065a82e238b6d349f49d7..dcbadb0697f951b11b98d53488bd466a633a3fb6 100644 --- a/PuReMD-GPU/src/three_body_interactions.h +++ b/PuReMD-GPU/src/three_body_interactions.h @@ -23,52 +23,15 @@ #include "mytypes.h" + void Three_Body_Interactions( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); + static_storage*, list**, output_controls* ); void Hydrogen_Bonds( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); - - -//CUDA Functions. 
-HOST_DEVICE void Calculate_Theta( rvec, real, rvec, real, real*, real* );
-
-HOST_DEVICE void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
-
-GLOBAL void Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *,
-        global_parameters , control_params *, simulation_data *,
-        static_storage ,
-        list , list , int , int , real *, real *, real *, rvec *);
-
-GLOBAL void Three_Body_Interactions_results ( reax_atom *,
-        control_params *,
-        static_storage ,
-        list , int );
+        static_storage*, list**, output_controls* );
-GLOBAL void Three_Body_Estimate ( reax_atom *atoms,
-        control_params *control,
-        list p_bonds, int N,
-        int *count);
+void Calculate_Theta( rvec, real, rvec, real, real*, real* );
-GLOBAL void Hydrogen_Bonds ( reax_atom *,
-        single_body_parameters *, hbond_parameters *,
-        control_params *, simulation_data *, static_storage ,
-        list , list , int , int, real *, rvec *, rvec *);
-GLOBAL void Hydrogen_Bonds_HB ( reax_atom *,
-        single_body_parameters *, hbond_parameters *,
-        control_params *, simulation_data *, static_storage ,
-        list , list , int , int, real *, rvec *, rvec *);
+void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
-GLOBAL void Hydrogen_Bonds_Postprocess ( reax_atom *,
-        single_body_parameters *,
-        static_storage , list,
-        list , list , int, real * );
-GLOBAL void Hydrogen_Bonds_Far_Nbrs ( reax_atom *,
-        single_body_parameters *,
-        static_storage , list,
-        list , list , int );
-GLOBAL void Hydrogen_Bonds_HNbrs ( reax_atom *,
-        single_body_parameters *,
-        static_storage , list,
-        list , list , int );
 
 #endif
diff --git a/PuReMD-GPU/src/traj.cu b/PuReMD-GPU/src/traj.c
similarity index 98%
rename from PuReMD-GPU/src/traj.cu
rename to PuReMD-GPU/src/traj.c
index 97496e7f7b8dc4e9add824ee198d0c2907180a98..2844c370ee79702ed0c75d090afe545149aae185 100644
--- a/PuReMD-GPU/src/traj.cu
+++ b/PuReMD-GPU/src/traj.c
@@ -19,13 +19,17 @@
   ----------------------------------------------------------------------*/
 
 #include "traj.h"
+
 #include "list.h"
-#include "cuda_copy.h"
+
+#ifdef __PRINT_CPU_RESULTS__
+  #include "cuda_copy.h"
+#endif
+
 
 /************************************************/
 /*      CUSTOM FORMAT ROUTINES                  */
 /************************************************/
-
 int Write_Custom_Header(reax_system *system, control_params *control,
         static_storage *workspace, output_controls *out_control)
 {
@@ -207,9 +211,9 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
 
     if( write_bonds )
     {
-#ifndef __PRINT_CPU_RESULTS__
+#ifdef __PRINT_CPU_RESULTS__
        //fprintf (stderr, "Synching bonds from device for printing ....\n");
-        Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+        Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
 #endif
 
     for( i = 0; i < system->N; ++i )
@@ -239,12 +243,12 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     num_thb_intrs = 0;
 
     if( write_angles ) {
-#ifndef __PRINT_CPU_RESULTS__
+#ifdef __PRINT_CPU_RESULTS__
        //fprintf (stderr, "Synching three bodies from device for printing ... \n");
-        Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
+        Sync_Host_Device_List( thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
 
        if ( !write_bonds) {
            //fprintf (stderr, "Synching bonds for three bodies from device for printing ... 
\n"); - Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); + Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND ); } #endif diff --git a/PuReMD-GPU/src/traj.h b/PuReMD-GPU/src/traj.h index d8c1792d0f6941d6881939d559bd9003b286bfa1..35d92602eee7c2d0b5ee83889623df2cb2106c71 100644 --- a/PuReMD-GPU/src/traj.h +++ b/PuReMD-GPU/src/traj.h @@ -22,8 +22,8 @@ #define __TRAJ_H__ #include "mytypes.h" -#include "zlib.h" +#include <zlib.h> #define BLOCK_MARK "REAX_BLOCK_MARK " #define BLOCK_MARK_LEN 16 @@ -73,12 +73,14 @@ #define SIZE_INFO_LINE3 "%-10d %-10d %-10d\n" #define SIZE_INFO_LEN3 33 + enum ATOM_LINE_OPTS {OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5, OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7 }; enum BOND_LINE_OPTS {OPT_NOBOND, OPT_BOND_BASIC, OPT_BOND_FULL}; enum ANGLE_LINE_OPTS {OPT_NOANGLE, OPT_ANGLE_BASIC}; + struct { int no_of_sub_blocks; @@ -89,11 +91,11 @@ struct typedef struct __block block; + int Write_Block( gzFile, block* ); int Read_Next_Block( gzFile, block*, int* ); int Skip_Next_Block( gzFile, int*); - /* Format for trajectory file @@ -141,8 +143,6 @@ int Skip_Next_Block( gzFile, int*); No. of torsion entries (int) Torsion info lines as per torsion format. */ - - int Write_Custom_Header( reax_system*, control_params*, static_storage*, output_controls* ); int Write_xyz_Header ( reax_system*, control_params*, diff --git a/PuReMD-GPU/src/two_body_interactions.c b/PuReMD-GPU/src/two_body_interactions.c new file mode 100644 index 0000000000000000000000000000000000000000..2e7a6daf9039ea26c22b2fcfda5913e46255ad75 --- /dev/null +++ b/PuReMD-GPU/src/two_body_interactions.c @@ -0,0 +1,571 @@ +/*---------------------------------------------------------------------- + PuReMD-GPU - Reax Force Field Simulator + + Copyright (2014) Purdue University + Sudhir Kylasa, skylasa@purdue.edu + Hasan Metin Aktulga, haktulga@cs.purdue.edu + Ananth Y Grama, ayg@cs.purdue.edu + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details: + <http://www.gnu.org/licenses/>. 
+ ----------------------------------------------------------------------*/ + +#include "two_body_interactions.h" + +#include "bond_orders.h" +#include "list.h" +#include "lookup.h" +#include "vector.h" +#include "index_utils.h" + + +void Bond_Energy( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, pj; + int start_i, end_i; + int type_i, type_j; + real ebond, pow_BOs_be2, exp_be12, CEbo; + real gp3, gp4, gp7, gp10, gp37; + real exphu, exphua1, exphub1, exphuov, hulpov, estriph; + real decobdbo, decobdboua, decobdboub; + single_body_parameters *sbp_i, *sbp_j; + two_body_parameters *twbp; + bond_order_data *bo_ij; + list *bonds; + + bonds = (*lists) + BONDS; + gp3 = system->reaxprm.gp.l[3]; + gp4 = system->reaxprm.gp.l[4]; + gp7 = system->reaxprm.gp.l[7]; + gp10 = system->reaxprm.gp.l[10]; + gp37 = (int) system->reaxprm.gp.l[37]; + + for( i=0; i < system->N; ++i ) { + start_i = Start_Index(i, bonds); + end_i = End_Index(i, bonds); + //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); + for( pj = start_i; pj < end_i; ++pj ) + if( i < bonds->select.bond_list[pj].nbr ) { + /* set the pointers */ + j = bonds->select.bond_list[pj].nbr; + type_i = system->atoms[i].type; + type_j = system->atoms[j].type; + sbp_i = &( system->reaxprm.sbp[type_i] ); + sbp_j = &( system->reaxprm.sbp[type_j] ); + twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ] ); + bo_ij = &( bonds->select.bond_list[pj].bo_data ); + + /* calculate the constants */ + pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); + exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); + CEbo = -twbp->De_s * exp_be12 * + ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); + + /* calculate the Bond Energy */ + ebond = + -twbp->De_s * bo_ij->BO_s * exp_be12 + -twbp->De_p * bo_ij->BO_pi + -twbp->De_pp * bo_ij->BO_pi2; + + data->E_BE += ebond; + + /* calculate derivatives of Bond Orders */ + bo_ij->Cdbo += CEbo; + bo_ij->Cdbopi -= (CEbo + twbp->De_p); + bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); + +#ifdef TEST_ENERGY + fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + // i+1, j+1, + bo_ij->BO, ebond/*, data->E_BE*/ ); + /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", + workspace->orig_id[i], workspace->orig_id[j], + CEbo, -twbp->De_p, -twbp->De_pp );*/ +#endif +#ifdef TEST_FORCES + Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); + Add_dBOpinpi2( system, lists, i, pj, + -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), + workspace->f_be, workspace->f_be ); +#endif + + /* Stabilisation terminal triple bond */ + if( bo_ij->BO >= 1.00 ) { + if( gp37 == 2 || + (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || + (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { + // ba = SQR(bo_ij->BO - 2.50); + exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); + //oboa=abo(j1)-boa; + //obob=abo(j2)-boa; + exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); + exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); + //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); + exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); + hulpov = 1.0 / (1.0 + 25.0 * exphuov); + + estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); + //estrain(j1) = estrain(j1) + 0.50*estriph; + //estrain(j2) = estrain(j2) + 0.50*estriph; + data->E_BE += estriph; + + decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * + ( gp3 - 2.0 * gp7 * 
(bo_ij->BO-2.50) ); + decobdboua = -gp10 * exphu * hulpov * + (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + decobdboub = -gp10 * exphu * hulpov * + (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); + + bo_ij->Cdbo += decobdbo; + workspace->CdDelta[i] += decobdboua; + workspace->CdDelta[j] += decobdboub; + //loop_j ++; + //fprintf (stderr, "incrementing loopj %d \n", loop_j); +#ifdef TEST_ENERGY + fprintf( out_control->ebond, + "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + //i+1, j+1, + estriph, decobdbo, decobdboua, decobdboub ); +#endif +#ifdef TEST_FORCES + Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); + Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); + Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); +#endif + } + } + } + } +} + + +void vdW_Coulomb_Energy( reax_system *system, control_params *control, + simulation_data *data, static_storage *workspace, + list **lists, output_controls *out_control ) +{ + int i, j, pj; + int start_i, end_i; + real self_coef; + real p_vdW1, p_vdW1i; + real powr_vdW1, powgi_vdW1; + real tmp, r_ij, fn13, exp1, exp2; + real Tap, dTap, dfn13, CEvd, CEclmb; + real dr3gamij_1, dr3gamij_3; + real e_ele, e_vdW, e_core, de_core; + rvec temp, ext_press; + // rtensor temp_rtensor, total_rtensor; + two_body_parameters *twbp; + far_neighbor_data *nbr_pj; + list *far_nbrs; + + p_vdW1 = system->reaxprm.gp.l[28]; + p_vdW1i = 1.0 / p_vdW1; + far_nbrs = (*lists) + FAR_NBRS; + e_ele = 0; + e_vdW = 0; + e_core = 0; + de_core = 0; + + for( i = 0; i < system->N; ++i ) { + start_i = Start_Index(i, far_nbrs); + end_i = End_Index(i, far_nbrs); + // fprintf( stderr, "i: %d, start: %d, end: %d\n", + // i, start_i, end_i ); + + for( pj = start_i; pj < end_i; ++pj ) + if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { + nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); + j = nbr_pj->nbr; + r_ij = nbr_pj->d; + twbp = &(system->reaxprm.tbp[ index_tbp(system->atoms[i].type, system->atoms[j].type, system->reaxprm.num_atom_types) ]); + self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! 
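The seven-line Tap/dTap blocks that follow (and recur in LR_vdW_Coulomb below and throughout the deleted CUDA file at the end of this patch) are Horner's rule on the 7th-degree taper polynomial. One subtlety worth recording: dTap is not Tap'(r) but Tap'(r)/r_ij, because every CEvd/CEclmb coefficient in this file later multiplies the unnormalized displacement nbr_pj->dvec, whose length is r_ij. A compact equivalent, as a sketch only, with tap[0..7] standing in for control->Tap0..Tap7:

    /* Sketch: Horner evaluation of the taper polynomial.
     * Yields Tap(r) and dTap = Tap'(r)/r, matching the inline blocks. */
    static void Taper_Eval( const double tap[8], double r,
            double *Tap, double *dTap )
    {
        double t = tap[7];
        double dt = 7.0 * tap[7];
        int k;

        for ( k = 6; k >= 1; --k )
        {
            t = t * r + tap[k];
            if ( k >= 2 )
                dt = dt * r + (double) k * tap[k];
        }

        *Tap = t * r + tap[0];      /* sum over k of tap[k] * r^k */
        *dTap = dt + tap[1] / r;    /* (d/dr Tap)(r) / r          */
    }

The same convention makes the Coulomb branch check out: with g(r) = (r^3 + gamma)^(1/3) (dr3gamij_3 in the code), the tapered, shielded pair energy is E_ele = C_ele * q_i * q_j * Tap(r) / g(r), and its 1/r-scaled derivative is (1/r) dE_ele/dr = C_ele * q_i * q_j * ( Tap'(r)/r - Tap(r) * r / (r^3 + gamma) ) / g(r), which is exactly the CEclmb = ... ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3 expression below.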
+
+            /* Calculate Taper and its derivative */
+            // Tap = nbr_pj->Tap;  -- precomputed during compute_H
+            Tap = control->Tap7 * r_ij + control->Tap6;
+            Tap = Tap * r_ij + control->Tap5;
+            Tap = Tap * r_ij + control->Tap4;
+            Tap = Tap * r_ij + control->Tap3;
+            Tap = Tap * r_ij + control->Tap2;
+            Tap = Tap * r_ij + control->Tap1;
+            Tap = Tap * r_ij + control->Tap0;
+
+            dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+            dTap = dTap * r_ij + 5*control->Tap5;
+            dTap = dTap * r_ij + 4*control->Tap4;
+            dTap = dTap * r_ij + 3*control->Tap3;
+            dTap = dTap * r_ij + 2*control->Tap2;
+            dTap += control->Tap1/r_ij;
+
+            /*vdWaals Calculations*/
+            if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
+                // shielding
+                powr_vdW1 = POW(r_ij, p_vdW1);
+                powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+                fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+                data->E_vdW += e_vdW =
+                    self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+                dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                    POW(r_ij, p_vdW1 - 2.0);
+
+                CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) -
+                        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                        (exp1 - exp2) * dfn13 );
+            }
+            else{ // no shielding
+                exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+                data->E_vdW += e_vdW =
+                    self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+                CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) *
+                        (exp1 - exp2) );
+            }
+
+            if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
+                // inner wall
+                e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                e_vdW += self_coef * Tap * e_core;
+                data->E_vdW += self_coef * Tap * e_core;
+
+                de_core = -(twbp->acore/twbp->rcore) * e_core;
+                CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+            }
+
+            /*Coulomb Calculations*/
+            dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+            dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+            tmp = Tap / dr3gamij_3;
+            //tmp = Tap * nbr_pj->inv_dr3gamij_3;  -- precomputed during compute_H
+            data->E_Ele += e_ele =
+                self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+
+
+            CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
+                ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+            /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q*
+              ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
+
+
+            if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                rvec_ScaledAdd( system->atoms[i].f,
+                        -(CEvd+CEclmb), nbr_pj->dvec );
+                rvec_ScaledAdd( system->atoms[j].f,
+                        +(CEvd+CEclmb), nbr_pj->dvec );
+            }
+            else { // NPT, iNPT or sNPT
+                /* for pressure coupling, terms not related to bond order
+                   derivatives are added directly into pressure vector/tensor */
+                rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                rvec_Add( system->atoms[j].f, temp );
+
+                rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                rvec_Add( data->ext_press, ext_press );
+
+                /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)",
+                  i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
+
+                  fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
+
+                  fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",
+                  data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+
+
/* This part is intended for a fully-flexible box */ + /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, + system->atoms[i].x ); + rtensor_Scale( total_rtensor, + F_C * -(CEvd + CEclmb), temp_rtensor ); + rvec_OuterProduct( temp_rtensor, + nbr_pj->dvec, system->atoms[j].x ); + rtensor_ScaledAdd( total_rtensor, + F_C * +(CEvd + CEclmb), temp_rtensor ); + + if( nbr_pj->imaginary ) + // This is an external force due to an imaginary nbr + rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); + else + // This interaction is completely internal + rtensor_Add( data->flex_bar.P, total_rtensor ); */ + } + +#ifdef TEST_ENERGY + rvec_MakeZero( temp ); + rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec ); + fprintf( out_control->evdw, + "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + //i+1, j+1, + MIN( workspace->orig_id[i], workspace->orig_id[j] ), + MAX( workspace->orig_id[i], workspace->orig_id[j] ), + r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ ); + + fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", + MIN( workspace->orig_id[i], workspace->orig_id[j] ), + MAX( workspace->orig_id[i], workspace->orig_id[j] ), + r_ij, system->atoms[i].q, system->atoms[j].q, + e_ele/*, data->E_Ele*/ ); +#endif +#ifdef TEST_FORCES + rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); +#endif + } + } + + // fclose( fout ); + + // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", + // data->ext_press[0], data->ext_press[1], data->ext_press[2] ); +} + + +void LR_vdW_Coulomb( reax_system *system, control_params *control, + int i, int j, real r_ij, LR_data *lr ) +{ + real p_vdW1 = system->reaxprm.gp.l[28]; + real p_vdW1i = 1.0 / p_vdW1; + real powr_vdW1, powgi_vdW1; + real tmp, fn13, exp1, exp2; + real Tap, dTap, dfn13; + real dr3gamij_1, dr3gamij_3; + real e_core, de_core; + two_body_parameters *twbp; + + twbp = &(system->reaxprm.tbp[ index_tbp(i,j,system->reaxprm.num_atom_types) ]); + e_core = 0; + de_core = 0; + + /* calculate taper and its derivative */ + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; + dTap = dTap * r_ij + 5*control->Tap5; + dTap = dTap * r_ij + 4*control->Tap4; + dTap = dTap * r_ij + 3*control->Tap3; + dTap = dTap * r_ij + 2*control->Tap2; + dTap += control->Tap1/r_ij; + + + /* vdWaals calculations */ + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ +Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n", +Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), +powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */ + + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); + + lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; + + /*vdWaals Calculations*/ + 
if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
+    { // shielding
+        powr_vdW1 = POW(r_ij, p_vdW1);
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+            POW(r_ij, p_vdW1 - 2.0);
+
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    }
+    else{ // no shielding
+        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+    }
+
+    if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
+    { // inner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+        lr->e_vdW += Tap * e_core;
+
+        de_core = -(twbp->acore/twbp->rcore) * e_core;
+        lr->CEvd += dTap * e_core + Tap * de_core;
+    }
+
+    /* Coulomb calculations */
+    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+    tmp = Tap / dr3gamij_3;
+    lr->H = EV_to_KCALpMOL * tmp;
+    lr->e_ele = C_ele * tmp;
+    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
+Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+i, system->atoms[i].type, j, system->atoms[j].type,
+twbp->gamma, Tap, dr3gamij_3,
+system->atoms[i].q, system->atoms[j].q ); */
+
+    lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+    /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n",
+       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
+       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+
+    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
+       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
+}
+
+
+void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
+        simulation_data *data,
+        static_storage *workspace, list **lists,
+        output_controls *out_control )
+{
+    int i, j, pj, r, steps, update_freq, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real e_vdW, e_ele;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs = (*lists) + FAR_NBRS;
+    LR_lookup_table *t;
+
+    steps = data->step - data->prev_steps;
+    update_freq = out_control->energy_update_freq;
+    update_energies = update_freq > 0 && steps % update_freq == 0;
+
+    for( i = 0; i < system->N; ++i ) {
+        type_i = system->atoms[i].type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i = End_Index(i,far_nbrs);
+
+        for( pj = start_i; pj < end_i; ++pj )
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
+                type_j = system->atoms[j].type;
+                r_ij = nbr_pj->d;
+                self_coef = (i == j) ?
0.5 : 1.0; + tmin = MIN( type_i, type_j ); + tmax = MAX( type_i, type_j ); + t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); + + /* Cubic Spline Interpolation */ + r = (int)(r_ij * t->inv_dx); + if( r == 0 ) ++r; + base = (real)(r+1) * t->dx; + dif = r_ij - base; + //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); + + if( update_energies ) { + e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + + t->vdW[r].a; + e_vdW *= self_coef; + + e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + + t->ele[r].a; + e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q; + + data->E_vdW += e_vdW; + data->E_Ele += e_ele; + } + + CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + + t->CEvd[r].a; + CEvd *= self_coef; + //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b; + + CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + + t->CEclmb[r].a; + CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q; + + if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { + rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); + rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec ); + } + else { // NPT, iNPT or sNPT + /* for pressure coupling, terms not related to bond order + derivatives are added directly into pressure vector/tensor */ + rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( system->atoms[i].f, -1., temp ); + rvec_Add( system->atoms[j].f, temp ); + rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); + rvec_Add( data->ext_press, ext_press ); + } + +#ifdef TEST_ENERGY + fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + r_ij, e_vdW, data->E_vdW ); + fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", + workspace->orig_id[i], workspace->orig_id[j], + r_ij, system->atoms[i].q, system->atoms[j].q, + e_ele, data->E_Ele ); +#endif +#ifdef TEST_FORCES + rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); + rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); +#endif + } + } +} + + +#if defined(OLD) + /* Linear extrapolation */ + /*p = (r_ij * t->inv_dx; + r = (int) p; + prev = &( t->y[r] ); + next = &( t->y[r+1] ); + + tmp = p - r; + e_vdW = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW )); + CEvd = self_coef * (prev->CEvd + tmp*(next->CEvd - prev->CEvd )); + + e_ele = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele )); + e_ele = e_ele * system->atoms[i].q * system->atoms[j].q; + CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb)); + CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/ +#endif diff --git a/PuReMD-GPU/src/two_body_interactions.cu b/PuReMD-GPU/src/two_body_interactions.cu deleted file mode 100644 index f53b0cfb0fb1c23626032e7a257fce5b1447953e..0000000000000000000000000000000000000000 --- a/PuReMD-GPU/src/two_body_interactions.cu +++ /dev/null @@ -1,1630 +0,0 @@ -/*---------------------------------------------------------------------- - PuReMD-GPU - Reax Force Field Simulator - - Copyright (2014) Purdue University - Sudhir Kylasa, skylasa@purdue.edu - Hasan Metin Aktulga, haktulga@cs.purdue.edu - Ananth Y Grama, ayg@cs.purdue.edu - - This program is free software; you can redistribute it and/or - modify it under the terms of 
the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details: - <http://www.gnu.org/licenses/>. - ----------------------------------------------------------------------*/ - -#include "two_body_interactions.h" -#include "bond_orders.h" -#include "list.h" -#include "lookup.h" -#include "vector.h" -#include "index_utils.h" - -#include "cuda_helpers.h" - - -void Bond_Energy( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - real ebond, pow_BOs_be2, exp_be12, CEbo; - real gp3, gp4, gp7, gp10, gp37; - real exphu, exphua1, exphub1, exphuov, hulpov, estriph; - real decobdbo, decobdboua, decobdboub; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_order_data *bo_ij; - list *bonds; - - bonds = (*lists) + BONDS; - gp3 = system->reaxprm.gp.l[3]; - gp4 = system->reaxprm.gp.l[4]; - gp7 = system->reaxprm.gp.l[7]; - gp10 = system->reaxprm.gp.l[10]; - gp37 = (int) system->reaxprm.gp.l[37]; - - for( i=0; i < system->N; ++i ) { - start_i = Start_Index(i, bonds); - end_i = End_Index(i, bonds); - //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); - for( pj = start_i; pj < end_i; ++pj ) - if( i < bonds->select.bond_list[pj].nbr ) { - /* set the pointers */ - j = bonds->select.bond_list[pj].nbr; - type_i = system->atoms[i].type; - type_j = system->atoms[j].type; - sbp_i = &( system->reaxprm.sbp[type_i] ); - sbp_j = &( system->reaxprm.sbp[type_j] ); - twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] ); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - - /* calculate the constants */ - pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); - exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); - CEbo = -twbp->De_s * exp_be12 * - ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); - - /* calculate the Bond Energy */ - ebond = - -twbp->De_s * bo_ij->BO_s * exp_be12 - -twbp->De_p * bo_ij->BO_pi - -twbp->De_pp * bo_ij->BO_pi2; - - data->E_BE += ebond; - - /* calculate derivatives of Bond Orders */ - bo_ij->Cdbo += CEbo; - bo_ij->Cdbopi -= (CEbo + twbp->De_p); - bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); - -#ifdef TEST_ENERGY - fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - // i+1, j+1, - bo_ij->BO, ebond/*, data->E_BE*/ ); - /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", - workspace->orig_id[i], workspace->orig_id[j], - CEbo, -twbp->De_p, -twbp->De_pp );*/ -#endif -#ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); - Add_dBOpinpi2( system, lists, i, pj, - -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), - workspace->f_be, workspace->f_be ); -#endif - - /* Stabilisation terminal triple bond */ - if( bo_ij->BO >= 1.00 ) { - if( gp37 == 2 || - (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || - (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { - // ba = SQR(bo_ij->BO - 2.50); - exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); - //oboa=abo(j1)-boa; - //obob=abo(j2)-boa; - exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); - exphub1 = 
EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); - //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); - exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); - hulpov = 1.0 / (1.0 + 25.0 * exphuov); - - estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); - //estrain(j1) = estrain(j1) + 0.50*estriph; - //estrain(j2) = estrain(j2) + 0.50*estriph; - data->E_BE += estriph; - - decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * - ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); - decobdboua = -gp10 * exphu * hulpov * - (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - decobdboub = -gp10 * exphu * hulpov * - (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - - bo_ij->Cdbo += decobdbo; - workspace->CdDelta[i] += decobdboua; - workspace->CdDelta[j] += decobdboub; - //loop_j ++; - //fprintf (stderr, "incrementing loopj %d \n", loop_j); -#ifdef TEST_ENERGY - fprintf( out_control->ebond, - "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - //i+1, j+1, - estriph, decobdbo, decobdboua, decobdboub ); -#endif -#ifdef TEST_FORCES - Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); - Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); - Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); -#endif - } - } - } - } -} - - - - - - - -GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, - single_body_parameters *sbp, two_body_parameters *tbp, - simulation_data *data, - static_storage p_workspace, list p_bonds, - int N, int num_atom_types, real *E_BE) -{ - int i, j, pj; - int start_i, end_i; - int type_i, type_j; - real ebond, pow_BOs_be2, exp_be12, CEbo; - real gp3, gp4, gp7, gp10, gp37; - real exphu, exphua1, exphub1, exphuov, hulpov, estriph; - real decobdbo, decobdboua, decobdboub; - single_body_parameters *sbp_i, *sbp_j; - two_body_parameters *twbp; - bond_order_data *bo_ij; - list *bonds; - static_storage *workspace; - - i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i >= N ) return; - - bonds = &p_bonds; - workspace = &p_workspace; - - gp3 = g_params.l[3]; - gp4 = g_params.l[4]; - gp7 = g_params.l[7]; - gp10 = g_params.l[10]; - gp37 = (int) g_params.l[37]; - - //for( i=0; i < system->N; ++i ) - start_i = Start_Index(i, bonds); - end_i = End_Index(i, bonds); - //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i ); - for( pj = start_i; pj < end_i; ++pj ) - { - //TODO - //if( i < bonds->select.bond_list[pj].nbr ) - if( i < bonds->select.bond_list[pj].nbr ) - { - //TODO - /* set the pointers */ - j = bonds->select.bond_list[pj].nbr; - type_i = atoms[i].type; - type_j = atoms[j].type; - sbp_i = &( sbp[type_i] ); - sbp_j = &( sbp[type_j] ); - twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] ); - bo_ij = &( bonds->select.bond_list[pj].bo_data ); - - /* calculate the constants */ - pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 ); - exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) ); - CEbo = -twbp->De_s * exp_be12 * - ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 ); - - /* calculate the Bond Energy */ - ebond = - -twbp->De_s * bo_ij->BO_s * exp_be12 - -twbp->De_p * bo_ij->BO_pi - -twbp->De_pp * bo_ij->BO_pi2; - - //PERFORMANCE IMAPCT - //atomicAdd (&data->E_BE, ebond); - //TODO - //E_BE [ i ] += ebond/2.0; - E_BE [ i ] += ebond; - //data->E_BE += ebond; - - /* calculate derivatives of Bond Orders */ - bo_ij->Cdbo += CEbo; - bo_ij->Cdbopi -= (CEbo + twbp->De_p); - bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp); - -#ifdef TEST_ENERGY - //TODO - //fprintf( out_control->ebond, 
"%6d%6d%24.15e%24.15e\n", - // workspace->orig_id[i], workspace->orig_id[j], - // i+1, j+1, - // bo_ij->BO, ebond/*, data->E_BE*/ ); - /* - fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", - workspace->orig_id[i], workspace->orig_id[j], - CEbo, -twbp->De_p, -twbp->De_pp );*/ -#endif -#ifdef TEST_FORCES - //TODO - /* - Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); - Add_dBOpinpi2( system, lists, i, pj, - -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), - workspace->f_be, workspace->f_be ); - */ - //TODO -#endif - - /* Stabilisation terminal triple bond */ - if( bo_ij->BO >= 1.00 ) { - if( gp37 == 2 || - (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || - (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) { - // ba = SQR(bo_ij->BO - 2.50); - exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) ); - //oboa=abo(j1)-boa; - //obob=abo(j2)-boa; - exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO)); - exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO)); - //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2); - exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j])); - hulpov = 1.0 / (1.0 + 25.0 * exphuov); - - estriph = gp10 * exphu * hulpov * (exphua1 + exphub1); - //estrain(j1) = estrain(j1) + 0.50*estriph; - //estrain(j2) = estrain(j2) + 0.50*estriph; - - //PERFORMANCE IMPACT - //atomicAdd (&data->E_BE, estriph); - E_BE [ i] += estriph; - //data->E_BE += estriph; - - decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * - ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) ); - decobdboua = -gp10 * exphu * hulpov * - (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - decobdboub = -gp10 * exphu * hulpov * - (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1)); - - bo_ij->Cdbo += decobdbo; - - //PERFORMANCE IMAPCT - workspace->CdDelta[i] += decobdboua; - //atomicAdd (&workspace->CdDelta[j], decobdboub); - //CdDelta [ i * N + i ] += decobdboua; - //CdDelta [ i * N + j ] += decobdboua; - //workspace->CdDelta [i] += decobdboua; - //workspace->CdDelta [j] += decobdboub; - -#ifdef TEST_ENERGY - /* - fprintf( out_control->ebond, - "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - //i+1, j+1, - estriph, decobdbo, decobdboua, decobdboub ); - */ -#endif -#ifdef TEST_FORCES - /* - Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be ); - Add_dDelta( system, lists, i, decobdboua, workspace->f_be ); - Add_dDelta( system, lists, j, decobdboub, workspace->f_be ); - */ -#endif - } - } - } - } //TODO commented out the if statement for processing i < j. 
- // we process all teh bonds and add only half the energy -} - - -void vdW_Coulomb_Energy( reax_system *system, control_params *control, - simulation_data *data, static_storage *workspace, - list **lists, output_controls *out_control ) -{ - int i, j, pj; - int start_i, end_i; - real self_coef; - real p_vdW1, p_vdW1i; - real powr_vdW1, powgi_vdW1; - real tmp, r_ij, fn13, exp1, exp2; - real Tap, dTap, dfn13, CEvd, CEclmb; - real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core, de_core; - rvec temp, ext_press; - // rtensor temp_rtensor, total_rtensor; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - list *far_nbrs; - - p_vdW1 = system->reaxprm.gp.l[28]; - p_vdW1i = 1.0 / p_vdW1; - far_nbrs = (*lists) + FAR_NBRS; - e_ele = 0; - e_vdW = 0; - e_core = 0; - de_core = 0; - - for( i = 0; i < system->N; ++i ) { - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - // fprintf( stderr, "i: %d, start: %d, end: %d\n", - // i, start_i, end_i ); - - for( pj = start_i; pj < end_i; ++pj ) - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - r_ij = nbr_pj->d; - twbp = &(system->reaxprm.tbp[ index_tbp (system->atoms[i].type, system->atoms[j].type, &system->reaxprm) ]); - self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! - - /* Calculate Taper and its derivative */ - // Tap = nbr_pj->Tap; -- precomputed during compte_H - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; - dTap = dTap * r_ij + 5*control->Tap5; - dTap = dTap * r_ij + 4*control->Tap4; - dTap = dTap * r_ij + 3*control->Tap3; - dTap = dTap * r_ij + 2*control->Tap2; - dTap += control->Tap1/r_ij; - - /*vdWaals Calculations*/ - if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - data->E_vdW += e_vdW = - self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) * dfn13 ); - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - data->E_vdW += e_vdW = - self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) ); - } - - if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - e_vdW += self_coef * Tap * e_core; - data->E_vdW += self_coef * Tap * e_core; - - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += self_coef * ( dTap * e_core + Tap * de_core ); - } - - /*Coulomb Calculations*/ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - tmp = Tap / dr3gamij_3; - //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- 
precomputed during compte_H - data->E_Ele += e_ele = - self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp; - - - CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * - ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* - ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/ - - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - rvec_ScaledAdd( system->atoms[i].f, - -(CEvd+CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( system->atoms[j].f, - +(CEvd+CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - /* for pressure coupling, terms not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - rvec_ScaledAdd( system->atoms[i].f, -1., temp ); - rvec_Add( system->atoms[j].f, temp ); - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - - /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", - i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] ); - - fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] ); - - fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n", - data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/ - - /* This part is intended for a fully-flexible box */ - /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, - system->atoms[i].x ); - rtensor_Scale( total_rtensor, - F_C * -(CEvd + CEclmb), temp_rtensor ); - rvec_OuterProduct( temp_rtensor, - nbr_pj->dvec, system->atoms[j].x ); - rtensor_ScaledAdd( total_rtensor, - F_C * +(CEvd + CEclmb), temp_rtensor ); - - if( nbr_pj->imaginary ) - // This is an external force due to an imaginary nbr - rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor ); - else - // This interaction is completely internal - rtensor_Add( data->flex_bar.P, total_rtensor ); */ - } - -#ifdef TEST_ENERGY - rvec_MakeZero( temp ); - rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec ); - fprintf( out_control->evdw, - "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - //i+1, j+1, - MIN( workspace->orig_id[i], workspace->orig_id[j] ), - MAX( workspace->orig_id[i], workspace->orig_id[j] ), - r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ ); - - fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n", - MIN( workspace->orig_id[i], workspace->orig_id[j] ), - MAX( workspace->orig_id[i], workspace->orig_id[j] ), - r_ij, system->atoms[i].q, system->atoms[j].q, - e_ele/*, data->E_Ele*/ ); -#endif -#ifdef TEST_FORCES - rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); -#endif - } - } - - // fclose( fout ); - - // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", - // data->ext_press[0], data->ext_press[1], data->ext_press[2] ); -} - - -/* - - GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, - two_body_parameters *tbp, - global_parameters g_p, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - int num_atom_types, int N ) - { - int i, j, pj; - int start_i, end_i; - real self_coef; - real p_vdW1, p_vdW1i; - real powr_vdW1, powgi_vdW1; - real tmp, r_ij, fn13, exp1, exp2; - real Tap, dTap, dfn13, CEvd, CEclmb; - real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, 
e_core, de_core; - rvec temp, ext_press; -// rtensor temp_rtensor, total_rtensor; -two_body_parameters *twbp; -far_neighbor_data *nbr_pj; -list *far_nbrs = &p_far_nbrs; - -i = blockIdx.x * blockDim.x + threadIdx.x; -if ( i >= N ) return; - -p_vdW1 = g_p.l[28]; -p_vdW1i = 1.0 / p_vdW1; -e_ele = 0; -e_vdW = 0; -e_core = 0; -de_core = 0; - -//for( i = 0; i < system->N; ++i ) { -start_i = Start_Index(i, far_nbrs); -end_i = End_Index(i, far_nbrs); -// fprintf( stderr, "i: %d, start: %d, end: %d\n", -// i, start_i, end_i ); - -for( pj = start_i; pj < end_i; ++pj ) -if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { -nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); -j = nbr_pj->nbr; -r_ij = nbr_pj->d; -twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]); -self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! - -//CHANGE ORIGINAL -//if (i <= j) continue; -//CHANGE ORIGINAL - -// Calculate Taper and its derivative -// Tap = nbr_pj->Tap; -- precomputed during compte_H -Tap = control->Tap7 * r_ij + control->Tap6; -Tap = Tap * r_ij + control->Tap5; -Tap = Tap * r_ij + control->Tap4; -Tap = Tap * r_ij + control->Tap3; -Tap = Tap * r_ij + control->Tap2; -Tap = Tap * r_ij + control->Tap1; -Tap = Tap * r_ij + control->Tap0; - -dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; -dTap = dTap * r_ij + 5*control->Tap5; -dTap = dTap * r_ij + 4*control->Tap4; -dTap = dTap * r_ij + 3*control->Tap3; -dTap = dTap * r_ij + 2*control->Tap2; -dTap += control->Tap1/r_ij; - -//vdWaals Calculations -if(g_p.vdw_type==1 || g_p.vdw_type==3) { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - E_vdW [i] += e_vdW / 2.0; - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) * dfn13 ); -} -else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - E_vdW [i] += e_vdW / 2.0; - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) ); -} - -if(g_p.vdw_type==2 || g_p.vdw_type==3) { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - e_vdW = self_coef * Tap * e_core; - - //TODO check this - E_vdW [i] += e_vdW / 2.0; - //TODO check this - - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += self_coef * ( dTap * e_core + Tap * de_core ); -} - -//Coulomb Calculations -dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); -dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - -tmp = Tap / dr3gamij_3; -//tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H -e_ele = -self_coef * C_ele * atoms[i].q * atoms[j].q * tmp; -E_Ele [i] += e_ele / 2.0; - -CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q * -( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; -//CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* -// ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3; - -if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - if (i >= j) - 
rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); - else - rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); -} -else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - if ( i >= j) - rvec_ScaledAdd( atoms[i].f, -1., temp ); - else - rvec_Add( atoms[i].f, temp ); - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - - //rvec_Add( data->ext_press, ext_press ); - rvec_Copy (aux_ext_press[i], ext_press); - - //TODO CHECK THIS calculation here, it should be divided by two somehow. -} -} -//} -} - -*/ - - - - -GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, - two_body_parameters *tbp, - global_parameters g_p, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - int num_atom_types, int N ) -{ - extern __shared__ real _vdw[]; - extern __shared__ real _ele[]; - extern __shared__ rvec _force []; - - real *sh_vdw; - real *sh_ele; - rvec *sh_force; - - int i, j, pj; - int start_i, end_i; - real self_coef; - real p_vdW1, p_vdW1i; - real powr_vdW1, powgi_vdW1; - real tmp, r_ij, fn13, exp1, exp2; - real Tap, dTap, dfn13, CEvd, CEclmb; - real dr3gamij_1, dr3gamij_3; - real e_ele, e_vdW, e_core, de_core; - rvec temp, ext_press; - // rtensor temp_rtensor, total_rtensor; - two_body_parameters *twbp; - far_neighbor_data *nbr_pj; - list *far_nbrs = &p_far_nbrs; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warpid = thread_id / VDW_THREADS_PER_ATOM; - int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); - - i = warpid; - - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - sh_force = (rvec *)( _vdw + 2*blockDim.x); - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); - - if (i < N) - { - - p_vdW1 = g_p.l[28]; - p_vdW1i = 1.0 / p_vdW1; - e_ele = 0; - e_vdW = 0; - e_core = 0; - de_core = 0; - - //for( i = 0; i < system->N; ++i ) { - start_i = Start_Index(i, far_nbrs); - end_i = End_Index(i, far_nbrs); - // fprintf( stderr, "i: %d, start: %d, end: %d\n", - // i, start_i, end_i ); - - pj = start_i + laneid; - //for( pj = start_i; pj < end_i; ++pj ) - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - r_ij = nbr_pj->d; - twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]); - self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes! 
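The deleted Cuda_vdW_Coulomb_Energy kernel here assigns one group of VDW_THREADS_PER_ATOM lanes per atom and strides the lanes across that atom's neighbor list (pj += VDW_THREADS_PER_ATOM). Stripped of the physics, the traversal plus the final per-group combine look like the sketch below. It is illustrative only: K, nbr_start, nbr_end, pair_val, and per_atom_sum are stand-ins, and the shuffle-based combine is a modern substitute for the kernel's shared-memory ladder with __syncthreads().

    // Sketch: K lanes cooperate on one atom's neighbor list.
    #define K 32    // stand-in for VDW_THREADS_PER_ATOM; one full warp here

    __global__ void Per_Atom_Sweep( const int *nbr_start, const int *nbr_end,
            const double *pair_val, double *per_atom_sum, int n_atoms )
    {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
        int atom = thread_id / K;          // group id selects the atom
        int lane = thread_id & (K - 1);    // lane id within the group
        double acc = 0.0;

        if ( atom < n_atoms )
        {
            // lanes take neighbors lane, lane + K, lane + 2K, ...
            for ( int pj = nbr_start[atom] + lane; pj < nbr_end[atom]; pj += K )
                acc += pair_val[pj];       // placeholder for the pair work
        }

        // combine the K lane partials (idle lanes contribute 0.0)
        for ( int off = K / 2; off > 0; off >>= 1 )
            acc += __shfl_down_sync( 0xffffffff, acc, off );

        if ( atom < n_atoms && lane == 0 )
            per_atom_sum[atom] = acc;
    }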
- - //CHANGE ORIGINAL - //if (i <= j) continue; - //CHANGE ORIGINAL - - // Calculate Taper and its derivative - // Tap = nbr_pj->Tap; -- precomputed during compte_H - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; - dTap = dTap * r_ij + 5*control->Tap5; - dTap = dTap * r_ij + 4*control->Tap4; - dTap = dTap * r_ij + 3*control->Tap3; - dTap = dTap * r_ij + 2*control->Tap2; - dTap += control->Tap1/r_ij; - - //vdWaals Calculations - if(g_p.vdw_type==1 || g_p.vdw_type==3) { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - - //E_vdW [i] += e_vdW / 2.0; - sh_vdw [threadIdx.x] += e_vdW/2.0; - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) * dfn13 ); - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2); - - - //E_vdW [i] += e_vdW / 2.0; - sh_vdw [threadIdx.x] += e_vdW/2.0; - - CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * - (exp1 - exp2) ); - } - - if(g_p.vdw_type==2 || g_p.vdw_type==3) { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - e_vdW = self_coef * Tap * e_core; - - //TODO check this - //E_vdW [i] += e_vdW / 2.0; - sh_vdw [threadIdx.x] += e_vdW / 2.0; - //TODO check this - - de_core = -(twbp->acore/twbp->rcore) * e_core; - CEvd += self_coef * ( dTap * e_core + Tap * de_core ); - } - - //Coulomb Calculations - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - tmp = Tap / dr3gamij_3; - //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H - e_ele = - self_coef * C_ele * atoms[i].q * atoms[j].q * tmp; - - //E_Ele [i] += e_ele / 2.0; - sh_ele [threadIdx.x] += e_ele / 2.0; - - CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q * - ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - //CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* - // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - if (i >= j){ - //rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec ); - } - else - { - //rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec ); - } - } - else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - - if ( i >= j) - { - //rvec_ScaledAdd( atoms[i].f, -1., temp ); - rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp ); - } - else - { - //rvec_Add( 
atoms[i].f, temp ); - rvec_Add( sh_force[threadIdx.x], temp ); - } - - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - - //rvec_Add( data->ext_press, ext_press ); - rvec_Copy (aux_ext_press[i], ext_press); - - //TODO CHECK THIS calculation here, it should be divided by two somehow. - } - } // if condition for far neighbors - - - pj += VDW_THREADS_PER_ATOM; - - } // end of while loop for pj < end_i condition - } // if (i < N ) condition - //} - - __syncthreads (); - - if (laneid < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - __syncthreads (); - if (laneid < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - __syncthreads (); - if (laneid < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - __syncthreads (); - if (laneid < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - __syncthreads (); - if (laneid < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - __syncthreads (); - if (laneid == 0) { - E_vdW [i] += sh_vdw[threadIdx.x]; - E_Ele [i] += sh_ele[threadIdx.x]; - rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); - } - - -} - -void LR_vdW_Coulomb( reax_system *system, control_params *control, - int i, int j, real r_ij, LR_data *lr ) -{ - real p_vdW1 = system->reaxprm.gp.l[28]; - real p_vdW1i = 1.0 / p_vdW1; - real powr_vdW1, powgi_vdW1; - real tmp, fn13, exp1, exp2; - real Tap, dTap, dfn13; - real dr3gamij_1, dr3gamij_3; - real e_core, de_core; - two_body_parameters *twbp; - - twbp = &(system->reaxprm.tbp[ index_tbp (i,j,&system->reaxprm) ]); - e_core = 0; - de_core = 0; - - /* calculate taper and its derivative */ - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7*control->Tap7 * r_ij + 6*control->Tap6; - dTap = dTap * r_ij + 5*control->Tap5; - dTap = dTap * r_ij + 4*control->Tap4; - dTap = dTap * r_ij + 3*control->Tap3; - dTap = dTap * r_ij + 2*control->Tap2; - dTap += control->Tap1/r_ij; - - - /* vdWaals calculations */ - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ -Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n", -Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), -powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */ - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); - - lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - - /*vdWaals Calculations*/ - 
if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) - { // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - } - else{ // no shielding - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); - } - - if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) - { // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore))); - lr->e_vdW += Tap * e_core; - - de_core = -(twbp->acore/twbp->rcore) * e_core; - lr->CEvd += dTap * e_core + Tap * de_core; - } - - /* Coulomb calculations */ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - tmp = Tap / dr3gamij_3; - lr->H = EV_to_KCALpMOL * tmp; - lr->e_ele = C_ele * tmp; - /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\ -Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n", -i, system->atoms[i].type, j, system->atoms[j].type, -twbp->gamma, Tap, dr3gamij_3, -system->atoms[i].q, system->atoms[j].q ); */ - - lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n", - i+1, j+1, r_ij, e_vdW, CEvd * r_ij, - system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */ - - /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n", - i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */ -} - - -void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control, - simulation_data *data, - static_storage *workspace, list **lists, - output_controls *out_control ) -{ - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - list *far_nbrs = (*lists) + FAR_NBRS; - LR_lookup_table *t; - - steps = data->step - data->prev_steps; - update_freq = out_control->energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - for( i = 0; i < system->N; ++i ) { - type_i = system->atoms[i].type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - for( pj = start_i; pj < end_i; ++pj ) - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - type_j = system->atoms[j].type; - r_ij = nbr_pj->d; - self_coef = (i == j) ? 
0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif); - - if( update_energies ) { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q; - - data->E_vdW += e_vdW; - data->E_Ele += e_ele; - } - - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - CEvd *= self_coef; - //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b; - - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - /* for pressure coupling, terms not related to bond order - derivatives are added directly into pressure vector/tensor */ - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( system->atoms[i].f, -1., temp ); - rvec_Add( system->atoms[j].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - rvec_Add( data->ext_press, ext_press ); - } - -#ifdef TEST_ENERGY - fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - r_ij, e_vdW, data->E_vdW ); - fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n", - workspace->orig_id[i], workspace->orig_id[j], - r_ij, system->atoms[i].q, system->atoms[j].q, - e_ele, data->E_Ele ); -#endif -#ifdef TEST_FORCES - rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec ); - rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec ); -#endif - } - } -} - -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *atoms, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - LR_lookup_table *d_LR, - int num_atom_types, - int energy_update_freq, - int N ) -{ - - extern __shared__ real _vdw[]; - extern __shared__ real _ele[]; - extern __shared__ rvec _force []; - - real *sh_vdw; - real *sh_ele; - rvec *sh_force; - - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - list *far_nbrs = &p_far_nbrs; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int warpid = thread_id / VDW_THREADS_PER_ATOM; - int laneid = thread_id & (VDW_THREADS_PER_ATOM -1); - - i = warpid; - - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - sh_force = (rvec *)( _vdw + 2*blockDim.x); - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); - - if ( i < N ) - { - - reax_atom local_atom ; - local_atom.q = atoms[i].q; - //local_atom.q = d_far_data.q[i]; - 
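/* [Editor's sketch -- not part of this patch.] The warpid/laneid arithmetic
 * above assigns one group of VDW_THREADS_PER_ATOM consecutive threads to each
 * atom's neighbor list; the bitmask used for laneid requires
 * VDW_THREADS_PER_ATOM to be a power of two. The three extern __shared__
 * declarations alias a single dynamic buffer, which sh_vdw/sh_ele/sh_force
 * then partition, so a matching launch (the call site is not shown in this
 * patch; BLOCK_SIZE is an assumed constant) has to reserve room for all
 * three views:
 *
 *     int threads = N * VDW_THREADS_PER_ATOM;
 *     int blocks = (threads + BLOCK_SIZE - 1) / BLOCK_SIZE;
 *     size_t shmem = BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec));
 *     Cuda_Tabulated_vdW_Coulomb_Energy <<< blocks, BLOCK_SIZE, shmem >>>
 *         ( atoms, control, data, far_nbrs, E_vdW, E_Ele, aux_ext_press,
 *           d_LR, num_atom_types, energy_update_freq, N );
 */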
local_atom.type = atoms[i].type; - //local_atom.type = d_far_data.type[i]; - - /* - sh_vdw = _vdw; - sh_ele = _vdw + warpid; - sh_force = (rvec *)( _vdw + 2*warpid); - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - rvec_MakeZero ( sh_force [threadIdx.x] ); - */ - - - steps = data->step - data->prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - //for( i = 0; i < system->N; ++i ) { - type_i = local_atom.type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - pj = start_i + laneid; - - //for( pj = start_i; pj < end_i; ++pj ) - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) - //if( d_far_data.d[pj] <= control->r_cut ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - //j = d_far_data.nbrs[pj]; - type_j = atoms[j].type; - //type_j = d_far_data.type[j]; - r_ij = nbr_pj->d; - //r_ij = d_far_data.d[pj]; - self_coef = (i == j) ? 0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); - - //TODO - //CHANGE ORIGINAL - //if (i <= j) { pj += blockDim.x; continue; } - //CHANGE ORIGINAL - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - - if(( update_energies )) - { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a; - e_ele *= self_coef * local_atom.q * atoms[j].q; - - - //data->E_vdW += e_vdW; - //TODO - //E_vdW [i] += e_vdW / 2.0; - //E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0); - sh_vdw [threadIdx.x] += e_vdW/2.0; - //E_vdW [i] += e_vdW; - - //TODO - //data->E_Ele += e_ele; - //E_Ele [i] += e_ele / 2.0; - //E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0); - sh_ele [threadIdx.x] += e_ele/2.0; - //E_Ele [i] += e_ele; - } - - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - CEvd *= self_coef; - - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= self_coef * local_atom.q * atoms[j].q; - //CEclmb *= self_coef * local_atom.q * d_far_data.q[j]; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) { - if ( i >= j) - //rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); - //rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] ); - else - //rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec ); - rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); - //rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] ); - } - else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor / - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - if (i >= j) - rvec_ScaledAdd( atoms[i].f, -1., temp ); - else - rvec_Add( atoms[i].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - - //rvec_Add( data->ext_press, ext_press ); - rvec_Copy (aux_ext_press [i], ext_press ); - - //TODO CHECK THIS - } - - - - } - - pj += VDW_THREADS_PER_ATOM; - } - - }// if i < n condition - - __syncthreads (); - - if (laneid < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] 
+= sh_ele[threadIdx.x + 16]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - __syncthreads (); - if (laneid < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - __syncthreads (); - if (laneid < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - __syncthreads (); - if (laneid < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - __syncthreads (); - if (laneid < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - __syncthreads (); - if (laneid == 0) { - E_vdW [i] += sh_vdw[threadIdx.x]; - E_Ele [i] += sh_ele[threadIdx.x]; - rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]); - } - - - } - - - - - GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *atoms, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - LR_lookup_table *d_LR, - int num_atom_types, - int energy_update_freq, - int N ) - { - - extern __shared__ real _vdw[]; - extern __shared__ real _ele[]; - - real *sh_vdw; - real *sh_ele; - - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - list *far_nbrs = &p_far_nbrs; - - i = blockIdx.x; - - reax_atom local_atom; - local_atom.q = atoms[i].q; - local_atom.type = atoms[i].type; - - sh_vdw = _vdw; - sh_ele = _vdw + blockDim.x; - - sh_vdw[threadIdx.x] = 0.0; - sh_ele[threadIdx.x] = 0.0; - - - steps = data->step - data->prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - type_i = local_atom.type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - pj = start_i + threadIdx.x; - - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - type_j = atoms[j].type; - r_ij = nbr_pj->d; - self_coef = (i == j) ? 0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - - if(( update_energies )) - { - e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + - t->vdW[r].a; - e_vdW *= self_coef; - - e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + - t->ele[r].a; - e_ele *= self_coef * local_atom.q * atoms[j].q; - - sh_vdw [threadIdx.x] += e_vdW/2.0; - sh_ele [threadIdx.x] += e_ele/2.0; - } - } - - pj += blockDim.x; - } - - // now do a reduce inside the warp for E_vdW, E_Ele and force. 
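/* [Editor's sketch -- not part of this patch.] Unlike the combined kernel
 * above, the tree reduction below has no __syncthreads() between steps: it
 * relies on the historical implicit warp-synchronous idiom, which is only
 * plausible when the block is a single 32-thread warp and is unsafe under
 * the independent thread scheduling of Volta and newer GPUs. A register-based
 * equivalent using warp shuffles (CUDA 9 or later), assuming a 32-thread
 * block and double-precision real:
 */
__device__ static void warp_sum_energies( double *vdw, double *ele )
{
    /* halve the stride each step; after the loop, lane 0 holds the totals */
    for ( int offset = 16; offset > 0; offset >>= 1 )
    {
        *vdw += __shfl_down_sync( 0xffffffff, *vdw, offset );
        *ele += __shfl_down_sync( 0xffffffff, *ele, offset );
    }
}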
- if (threadIdx.x < 16) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16]; - } - if (threadIdx.x < 8) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8]; - } - if (threadIdx.x < 4) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4]; - } - if (threadIdx.x < 2) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2]; - } - if (threadIdx.x < 1) { - sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1]; - sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1]; - } - if (threadIdx.x == 0) { - E_vdW [i] += sh_vdw[0]; - E_Ele [i] += sh_ele[0]; - } - - } - - - - - - - GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *atoms, - control_params *control, - simulation_data *data, - list p_far_nbrs, - real *E_vdW, real *E_Ele, rvec *aux_ext_press, - LR_lookup_table *d_LR, - int num_atom_types, - int energy_update_freq, - int N ) - { - - extern __shared__ rvec _force []; - - rvec *sh_force; - - int i, j, pj, r, steps, update_freq, update_energies; - int type_i, type_j, tmin, tmax; - int start_i, end_i; - real r_ij, self_coef, base, dif; - real e_vdW, e_ele; - real CEvd, CEclmb; - rvec temp, ext_press; - far_neighbor_data *nbr_pj; - LR_lookup_table *t; - list *far_nbrs = &p_far_nbrs; - - i = blockIdx.x; - - reax_atom local_atom; - local_atom.q = atoms[i].q; - local_atom.type = atoms[i].type; - - sh_force = _force; - rvec_MakeZero ( sh_force [threadIdx.x] ); - - - steps = data->step - data->prev_steps; - update_freq = energy_update_freq; - update_energies = update_freq > 0 && steps % update_freq == 0; - - //for( i = 0; i < system->N; ++i ) { - type_i = local_atom.type; - start_i = Start_Index(i,far_nbrs); - end_i = End_Index(i,far_nbrs); - - pj = start_i + threadIdx.x; - - while (pj < end_i) - { - if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) - { - nbr_pj = &( far_nbrs->select.far_nbr_list[pj] ); - j = nbr_pj->nbr; - type_j = atoms[j].type; - r_ij = nbr_pj->d; - self_coef = (i == j) ? 0.5 : 1.0; - tmin = MIN( type_i, type_j ); - tmax = MAX( type_i, type_j ); - t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); - - /* Cubic Spline Interpolation */ - r = (int)(r_ij * t->inv_dx); - if( r == 0 ) ++r; - base = (real)(r+1) * t->dx; - dif = r_ij - base; - - CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + - t->CEvd[r].a; - CEvd *= self_coef; - - CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + - t->CEclmb[r].a; - CEclmb *= self_coef * local_atom.q * atoms[j].q; - - if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) { - if ( i >= j) - rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec ); - else - rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec ); - } - else { // NPT, iNPT or sNPT - // for pressure coupling, terms not related to bond order - // derivatives are added directly into pressure vector/tensor / - rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec ); - if (i >= j) - rvec_ScaledAdd( atoms[i].f, -1., temp ); - else - rvec_Add( atoms[i].f, temp ); - rvec_iMultiply( ext_press, nbr_pj->rel_box, temp ); - - rvec_Copy (aux_ext_press [i], ext_press ); - } - } - - pj += blockDim.x; - } - - - // now do a reduce inside the warp for E_vdW, E_Ele and force. 
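/* [Editor's sketch -- not part of this patch.] Splitting the tabulated
 * interaction into ..._1 (energies: two reals of shared memory per thread)
 * and ..._2 (forces: one rvec per thread) keeps each launch's dynamic
 * shared-memory footprint small. The force reduction below carries the same
 * missing-synchronization caveat as the energy reduction in ..._1; a
 * shuffle-based equivalent reduces each component separately (assuming rvec
 * is real[3], real is double, and a 32-thread block):
 */
__device__ static void warp_sum_force( double f[3] )
{
    for ( int offset = 16; offset > 0; offset >>= 1 )
    {
        f[0] += __shfl_down_sync( 0xffffffff, f[0], offset );
        f[1] += __shfl_down_sync( 0xffffffff, f[1], offset );
        f[2] += __shfl_down_sync( 0xffffffff, f[2], offset );
    }
    /* lane 0 now holds the summed force for atom i == blockIdx.x */
}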
- if (threadIdx.x < 16) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] ); - } - if (threadIdx.x < 8) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] ); - } - if (threadIdx.x < 4) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] ); - } - if (threadIdx.x < 2) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] ); - } - if (threadIdx.x < 1) { - rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] ); - } - if (threadIdx.x == 0) { - rvec_Add (atoms[i].f, sh_force [ 0 ]); - } - - - } - - - - - - - - - - - - - - - - - - - - - - - - -#if defined(OLD) - /* Linear extrapolation */ - /*p = (r_ij * t->inv_dx; - r = (int) p; - prev = &( t->y[r] ); - next = &( t->y[r+1] ); - - tmp = p - r; - e_vdW = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW )); - CEvd = self_coef * (prev->CEvd + tmp*(next->CEvd - prev->CEvd )); - - e_ele = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele )); - e_ele = e_ele * system->atoms[i].q * system->atoms[j].q; - CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb)); - CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/ -#endif diff --git a/PuReMD-GPU/src/two_body_interactions.h b/PuReMD-GPU/src/two_body_interactions.h index 41483222a1d1e66eb21805f7e1659b5eecd3f40b..f689290e9e680ca62352b6f627737b8a8b006472 100644 --- a/PuReMD-GPU/src/two_body_interactions.h +++ b/PuReMD-GPU/src/two_body_interactions.h @@ -21,156 +21,19 @@ #ifndef __TWO_BODY_INTERACTIONS_H_ #define __TWO_BODY_INTERACTIONS_H_ -#include <mytypes.h> -#include "index_utils.h" +#include "mytypes.h" -void Bond_Energy( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); -void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); -void LR_vdW_Coulomb( reax_system*, control_params*, int, int, real, LR_data* ); -void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*, - static_storage*, list**, output_controls* ); - -//CUDA functions -GLOBAL void Cuda_Bond_Energy ( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *, - simulation_data *, static_storage , list , int , int, real *); - -GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *, - two_body_parameters *, - global_parameters , - control_params *, - simulation_data *, - list , - real *, real *, rvec *, - int , int ); - -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy ( reax_atom *, control_params *, simulation_data *, - list , real *, real *, rvec *, - LR_lookup_table *, int , int , int ) ; -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1 ( reax_atom *, control_params *, simulation_data *, - list , real *, real *, rvec *, - LR_lookup_table *, int , int , int ) ; -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2 ( reax_atom *, control_params *, simulation_data *, - list , real *, real *, rvec *, - LR_lookup_table *, int , int , int ) ; - -HOST_DEVICE void LR_vdW_Coulomb( global_parameters , two_body_parameters *, - control_params *, int , int , real , LR_data * , int); - -HOST_DEVICE inline void LR_vdW_Coulomb( global_parameters g_params, two_body_parameters *tbp, - control_params *control, - int i, int j, real r_ij, LR_data *lr, int num_atom_types ) -{ - real p_vdW1 = g_params.l[28]; - real p_vdW1i = 1.0 / p_vdW1; - real powr_vdW1, powgi_vdW1; - real tmp, fn13, exp1, exp2; - real Tap, dTap, dfn13; - real dr3gamij_1, dr3gamij_3; - real e_core, de_core; - two_body_parameters *twbp; - - twbp = &(tbp[ 
index_tbp (i, j, num_atom_types) ]); - e_core = 0; - de_core = 0; - - /* calculate taper and its derivative */ - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6; - dTap = dTap * r_ij + 5 * control->Tap5; - dTap = dTap * r_ij + 4 * control->Tap4; - dTap = dTap * r_ij + 3 * control->Tap3; - dTap = dTap * r_ij + 2 * control->Tap2; - dTap += control->Tap1 / r_ij; - - - /* vdWaals calculations */ - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ - Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n", - Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), - powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */ - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); - - lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - - /*vdWaals Calculations*/ - if (g_params.vdw_type == 1 || g_params.vdw_type == 3) - { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - } - else // no shielding - { - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); - } - - if (g_params.vdw_type == 2 || g_params.vdw_type == 3) - { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore))); - lr->e_vdW += Tap * e_core; - - de_core = -(twbp->acore / twbp->rcore) * e_core; - lr->CEvd += dTap * e_core + Tap * de_core; - } +void Bond_Energy( reax_system*, control_params*, simulation_data*, + static_storage*, list**, output_controls* ); - /* Coulomb calculations */ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); +void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*, + static_storage*, list**, output_controls* ); - tmp = Tap / dr3gamij_3; - lr->H = EV_to_KCALpMOL * tmp; - lr->e_ele = C_ele * tmp; - /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\ - Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n", - i, system->atoms[i].type, j, system->atoms[j].type, - twbp->gamma, Tap, dr3gamij_3, - system->atoms[i].q, system->atoms[j].q ); */ +void LR_vdW_Coulomb( reax_system*, control_params*, int, int, real, LR_data* ); - lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g 
%g\t%g %g\n", - i+1, j+1, r_ij, e_vdW, CEvd * r_ij, - system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */ +void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*, + static_storage*, list**, output_controls* ); - /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n", - i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */ -} #endif diff --git a/PuReMD-GPU/src/validation.cu b/PuReMD-GPU/src/validation.cu index f8261555a595a06dcbdfbffe5d62c7a0375c4c97..21cd2145e689621ee0b3827889b106ed7c05af7f 100644 --- a/PuReMD-GPU/src/validation.cu +++ b/PuReMD-GPU/src/validation.cu @@ -18,7 +18,6 @@ <http://www.gnu.org/licenses/>. ----------------------------------------------------------------------*/ - #include "validation.h" #include "cuda_utils.h" @@ -27,33 +26,37 @@ #include "sort.h" #include "index_utils.h" -bool check_zero (real p1, real p2) + +int check_zero (real p1, real p2) { if (abs (p1 - p2) >= GPU_TOLERANCE) - return true; + return TRUE; else - return false; + return FALSE; } -bool check_zero (rvec p1, rvec p2) + +int check_zero (rvec p1, rvec p2) { if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) || ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) || ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE )) - return true; - else return false; + return TRUE; + else return FALSE; } -bool check_same (ivec p1, ivec p2) + +int check_same (ivec p1, ivec p2) { if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) ) - return true; + return TRUE; else - return false; + return FALSE; } -bool validate_box (simulation_box *host, simulation_box *dev) + +int validate_box (simulation_box *host, simulation_box *dev) { simulation_box test; @@ -62,14 +65,15 @@ bool validate_box (simulation_box *host, simulation_box *dev) if (memcmp (&test, host, SIMULATION_BOX_SIZE)) { fprintf (stderr, " Simulation box is not in synch between host and device \n"); - return false; + return FALSE; } fprintf (stderr, " Simulation box is in **synch** between host and device \n"); - return true; + return TRUE; } -bool validate_atoms (reax_system *system, list **lists) + +int validate_atoms (reax_system *system, list **lists) { int start, end, index, count, miscount; @@ -154,9 +158,10 @@ bool validate_atoms (reax_system *system, list **lists) //fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount); free (test); - return true; + return TRUE; } + void Print_Matrix( sparse_matrix *A ) { int i, j; @@ -170,6 +175,7 @@ void Print_Matrix( sparse_matrix *A ) } } + void Print_Matrix_L( sparse_matrix *A ) { int i, j; @@ -184,7 +190,7 @@ void Print_Matrix_L( sparse_matrix *A ) } -bool validate_sort_matrix (reax_system *system, static_storage *workspace) +int validate_sort_matrix (reax_system *system, static_storage *workspace) { sparse_matrix test; int index, count; @@ -221,7 +227,7 @@ bool validate_sort_matrix (reax_system *system, static_storage *workspace) } -bool validate_sparse_matrix( reax_system *system, static_storage *workspace ) +int validate_sparse_matrix( reax_system *system, static_storage *workspace ) { sparse_matrix test; int index, count; @@ -287,10 +293,10 @@ bool validate_sparse_matrix( reax_system *system, static_storage *workspace ) free (test.start); free (test.end); free (test.entries); - return true; + return TRUE; } -bool validate_lu (static_storage *workspace) +int validate_lu (static_storage *workspace) { sparse_matrix test; int index, count; @@ -354,7 +360,7 @@ bool validate_lu (static_storage 
*workspace) } //fprintf (stderr, "L and U match on device and host \n"); - return true; + return TRUE; } void print_sparse_matrix (reax_system *system, static_storage *workspace) @@ -405,7 +411,7 @@ void print_sparse_matrix (reax_system *system, static_storage *workspace) } -bool validate_bonds (reax_system *system, static_storage *workspace, list **lists) +int validate_bonds (reax_system *system, static_storage *workspace, list **lists) { int start, end, index, count, miscount; int *d_start, *d_end; @@ -601,10 +607,10 @@ bool validate_bonds (reax_system *system, static_storage *workspace, list **list free (d_start); free (d_end); free (d_bond_data); - return true; + return TRUE; } -bool validate_sym_dbond_indices (reax_system *system, static_storage *workspace, list **lists) +int validate_sym_dbond_indices (reax_system *system, static_storage *workspace, list **lists) { int start, end, index, count, miscount; int *d_start, *d_end; @@ -660,10 +666,11 @@ bool validate_sym_dbond_indices (reax_system *system, static_storage *workspace, free (d_start); free (d_end); free (d_bond_data); - return true; + return TRUE; } -bool analyze_hbonds (reax_system *system, static_storage *workspace, list **lists) + +int analyze_hbonds (reax_system *system, static_storage *workspace, list **lists) { int hindex, nbr_hindex; int pj, hj, hb_start_j, hb_end_j, j, nbr; @@ -748,7 +755,7 @@ bool analyze_hbonds (reax_system *system, static_storage *workspace, list **list } -bool validate_hbonds (reax_system *system, static_storage *workspace, list **lists) +int validate_hbonds (reax_system *system, static_storage *workspace, list **lists) { int *hbond_index, count; int *d_start, *d_end, index, d_index; @@ -858,10 +865,10 @@ bool validate_hbonds (reax_system *system, static_storage *workspace, list **lis free (d_start); free (d_end); free (data); - return true; + return TRUE; } -bool validate_neighbors (reax_system *system, list **lists) +int validate_neighbors (reax_system *system, list **lists) { list *far_nbrs = *lists + FAR_NBRS; list *d_nbrs = dev_lists + FAR_NBRS; @@ -989,971 +996,975 @@ bool validate_neighbors (reax_system *system, list **lists) start[i], end[i]); exit (10); } + } + + //fprintf (stderr, "FAR Neighbors match between device and host \n"); + free (start); + free (end); + free (data); + return TRUE; +} + + +int validate_workspace (reax_system *system, static_storage *workspace, list **lists) +{ + real *total_bond_order; + int count, tcount; + + total_bond_order = (real *) malloc ( system->N * REAL_SIZE ); + copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++) { + + //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){ + if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){ + fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n", + i, workspace->total_bond_order[i], total_bond_order[i]); + exit (-1); + count ++; } + } + free (total_bond_order); + //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count); - //fprintf (stderr, "FAR Neighbors match between device and host \n"); - free (start); - free (end); - free (data); - return true; + + rvec *dDeltap_self; + dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE); + copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__); + + count = 0; + for (int i = 0; i < system->N; i++ ) + { + 
if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i])) + { + fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, + workspace->dDeltap_self[i][0], + workspace->dDeltap_self[i][1], + workspace->dDeltap_self[i][2], + dDeltap_self[i][0], + dDeltap_self[i][1], + dDeltap_self[i][2] ); + exit (-1); + count ++; } + } + free (dDeltap_self); + //fprintf (stderr, "dDeltap_self mismatch count %d\n", count); - bool validate_workspace (reax_system *system, static_storage *workspace, list **lists) + //exit for init_forces + + real *test; + test = (real *) malloc (system->N * REAL_SIZE); + + copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) + { + if (check_zero (workspace->Deltap[i], test[i])) { - real *total_bond_order; - int count, tcount; + fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Deltap mismatch count %d\n", count); - total_bond_order = (real *) malloc ( system->N * REAL_SIZE ); - copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) + { + if (check_zero (workspace->Deltap_boc[i], test[i])) + { + fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count); - count = 0; - for (int i = 0; i < system->N; i++) { + copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta[i], test[i])) { + fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta mismatch count %d\n", count); - //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){ - if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){ - fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n", - i, workspace->total_bond_order[i], total_bond_order[i]); - exit (-1); - count ++; - } - } - free (total_bond_order); - //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count); + copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta_e[i], test[i])) { + fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta_e mismatch count %d\n", count); + copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->vlpex[i], test[i])) { + fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "vlpex mismatch count %d\n", count); - rvec *dDeltap_self; - dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE); - copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, 
__LINE__); + copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->nlp[i], test[i])) { + fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "nlp mismatch count %d\n", count); - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i])) - { - fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, - workspace->dDeltap_self[i][0], - workspace->dDeltap_self[i][1], - workspace->dDeltap_self[i][2], - dDeltap_self[3*i+0], - dDeltap_self[3*i+1], - dDeltap_self[3*i+2] ); - exit (-1); - count ++; - } - } - free (dDeltap_self); - //fprintf (stderr, "dDeltap_self mismatch count %d\n", count); + copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta_lp[i], test[i])) { + fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta_lp mismatch count %d\n", count); - //exit for init_forces + copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Clp[i], test[i])) { + fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Clp mismatch count %d\n", count); - real *test; - test = (real *) malloc (system->N * REAL_SIZE); + copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->dDelta_lp[i], test[i])) { + fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "dDelta_lp mismatch count %d\n", count); - copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->Deltap[i], test[i])) - { - fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Deltap mismatch count %d\n", count); + copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->nlp_temp[i], test[i])) { + fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "nlp_temp mismatch count %d\n", count); - copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) - { - if (check_zero (workspace->Deltap_boc[i], test[i])) - { - fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count); + copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + 
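/* [Editor's sketch -- not part of this patch.] Each block of
 * validate_workspace repeats one idiom: copy a device array back, compare it
 * element-wise with check_zero, and abort on the first mismatch. As written,
 * the count ++ after each exit (-1) is unreachable, so the commented-out
 * mismatch totals can only ever report zero. A file-scope helper that
 * collapses the repetition and actually counts (names hypothetical):
 */
static int compare_real_array( const char *label, const real *host_arr,
        void *dev_arr, int n )
{
    int mismatches = 0;
    real *tmp = (real *) malloc( n * REAL_SIZE );

    copy_host_device( tmp, dev_arr, n * REAL_SIZE,
            cudaMemcpyDeviceToHost, __LINE__ );
    for ( int i = 0; i < n; i++ )
    {
        if ( check_zero( host_arr[i], tmp[i] ) )
        {
            fprintf( stderr, "%s: mismatch at %d (%f %f)\n",
                    label, i, host_arr[i], tmp[i] );
            mismatches ++;
        }
    }
    free( tmp );
    return mismatches;
}
/* usage: compare_real_array( "Delta_lp_temp", workspace->Delta_lp_temp,
 * dev_workspace->Delta_lp_temp, system->N ); */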
count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->Delta_lp_temp[i], test[i])) { + fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count); - copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta[i], test[i])) { - fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta mismatch count %d\n", count); + copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->dDelta_lp_temp[i], test[i])) { + fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count); - copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta_e[i], test[i])) { - fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta_e mismatch count %d\n", count); + //exit for Bond order calculations - copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->vlpex[i], test[i])) { - fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "vlpex mismatch count %d\n", count); - copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->nlp[i], test[i])) { - fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "nlp mismatch count %d\n", count); + copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->CdDelta[i], test[i])) { + fprintf (stderr, " CdDelta does NOT match (%f %f) for atom %d \n", workspace->CdDelta[i], test[i], i); + exit (-1); + count ++; + } + } + //fprintf (stderr, "CdDelta mismatch count %d\n", count); + //exit for Bond Energy calculations - copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta_lp[i], test[i])) { - fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta_lp mismatch count %d\n", count); + /* + copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->droptol[i], test[i])) { + fprintf (stderr, " Droptol Does not match (%f %f) \n", 
workspace->droptol[i], test[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "droptol mismatch count %d\n", count); + */ - copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Clp[i], test[i])) { - fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Clp mismatch count %d\n", count); - copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->dDelta_lp[i], test[i])) { - fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "dDelta_lp mismatch count %d\n", count); + //exit for QEa calculations + /* + real *t_s; - copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->nlp_temp[i], test[i])) { - fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "nlp_temp mismatch count %d\n", count); + t_s = (real *) malloc (REAL_SIZE * (system->N * 2) ); + copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__); - copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->Delta_lp_temp[i], test[i])) { - fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count); + count = 0; + for (int i = 0; i < (system->N * 2); i++ ) { + if (check_zero (workspace->b_prm[i], t_s[i])) { + fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]); + exit (-1); + count ++; + } + } + //fprintf (stderr, "b_prm mismatch count %d\n", count); - copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->dDelta_lp_temp[i], test[i])) { - fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count); + t_s = (real *) malloc (REAL_SIZE * 5 * system->N); + copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); - //exit for Bond order calculations + count = 0; + for (int i = 0; i < 5*system->N; i++ ) { + if (check_zero (workspace->s[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index %d \n", workspace->s[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "s mismatch count %d\n", count); - copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->CdDelta[i], test[i])) { - fprintf (stderr, " CdDelta does NOT match (%f %f) for atom %d \n", workspace->CdDelta[i], test[i], i); - exit (-1); - count ++; - } - } - //fprintf (stderr, "CdDelta mismatch count 
%d\n", count); - //exit for Bond Energy calculations + t_s = (real *) malloc (REAL_SIZE * 5 * system->N); + copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); - /* - copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->droptol[i], test[i])) { - fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "droptol mismatch count %d\n", count); - */ + count = 0; + for (int i = 0; i < 5*system->N; i++ ) { + if (check_zero (workspace->t[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "t mismatch count %d\n", count); - //exit for QEa calculations - /* - real *t_s; + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N); + copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - t_s = (real *) malloc (REAL_SIZE * (system->N * 2) ); - copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < (RESTART + 1)*system->N; i++ ) { + if (check_zero (workspace->v[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i); + count ++; + } + } + fprintf (stderr, "v mismatch count %d\n", count); - count = 0; - for (int i = 0; i < (system->N * 2); i++ ) { - if (check_zero (workspace->b_prm[i], t_s[i])) { - fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]); - exit (-1); - count ++; - } - } - //fprintf (stderr, "b_prm mismatch count %d\n", count); + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - t_s = (real *) malloc (REAL_SIZE * 5 * system->N); - copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->y[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]); + count ++; + } + } + fprintf (stderr, "y mismatch count %d\n", count); - count = 0; - for (int i = 0; i < 5*system->N; i++ ) { - if (check_zero (workspace->s[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index %d \n", workspace->s[i], t_s[i], i); - count ++; - } - } - fprintf (stderr, "s mismatch count %d\n", count); + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->hc[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]); + count ++; + } + } + fprintf (stderr, "hc mismatch count %d\n", count); - t_s = (real *) malloc (REAL_SIZE * 5 * system->N); - copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__); + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < 5*system->N; i++ ) { - if (check_zero (workspace->t[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i); + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if 
(check_zero (workspace->hs[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]); count ++; - } - } - fprintf (stderr, "t mismatch count %d\n", count); - + } + } + fprintf (stderr, "hs mismatch count %d\n", count); - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N); - copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < (RESTART + 1)*system->N; i++ ) { - if (check_zero (workspace->v[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i); + count = 0; + for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) { + if (check_zero (workspace->h[i], t_s[i])) { + //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]); count ++; - } - } - fprintf (stderr, "v mismatch count %d\n", count); + } + } + fprintf (stderr, "h mismatch count %d\n", count); - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); + copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->y[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]); + count = 0; + for (int i = 0; i < (RESTART + 1); i++ ) { + if (check_zero (workspace->g[i], t_s[i])) { + //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i); count ++; - } - } - fprintf (stderr, "y mismatch count %d\n", count); + } + } + fprintf (stderr, "g mismatch count %d\n", count); + */ - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N ); + copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->hc[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "hc mismatch count %d\n", count); + count = 0; + for (int i = 0; i < system->N; i++ ) { + if (check_zero (workspace->v_const[i], r_s[i])) { + fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", + workspace->v_const[i][0], + workspace->v_const[i][1], + workspace->v_const[i][2], + r_s[i][0], + r_s[i][1], + r_s[i][2], + i); + exit (-1); + count ++; + } + } + //fprintf (stderr, "v_const mismatch count %d\n", count); - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + free (test); + free (r_s); + return TRUE; +} - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->hs[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "hs mismatch count %d\n", count); - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__); +int validate_data (reax_system *system, simulation_data *host) +{ + simulation_data device; 
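/* [Editor's note -- not part of this patch.] Two caveats for the energy
 * comparisons that follow. First, ext_press is an rvec, so passing it
 * straight to a %4.10e conversion prints a pointer rather than the tensor
 * components; the three entries would need to be printed individually, as
 * the v_const report above does. Second, check_zero applies the absolute
 * threshold GPU_TOLERANCE, which is brittle when host and device accumulate
 * in different orders and the energy terms span orders of magnitude. A mixed
 * absolute/relative test is the usual alternative (REL_TOLERANCE is
 * hypothetical):
 */
static int differs( real a, real b )
{
    real diff = fabs( a - b );
    real scale = fmax( fabs( a ), fabs( b ) );

    /* flag only when both the absolute and the relative gap are large */
    return diff >= GPU_TOLERANCE && diff >= REL_TOLERANCE * scale;
}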
- count = 0; - for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) { - if (check_zero (workspace->h[i], t_s[i])) { - //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]); - count ++; - } - } - fprintf (stderr, "h mismatch count %d\n", count); + copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__); - t_s = (real *) malloc (REAL_SIZE * (RESTART+1) ); - copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__); + if (check_zero (host->E_BE, device.E_BE)){ + fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE); + exit (-1); + } - count = 0; - for (int i = 0; i < (RESTART + 1); i++ ) { - if (check_zero (workspace->g[i], t_s[i])) { - //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i); - count ++; - } - } - fprintf (stderr, "g mismatch count %d\n", count); - */ - - rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N ); - copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); - - count = 0; - for (int i = 0; i < system->N; i++ ) { - if (check_zero (workspace->v_const[i], r_s[i])) { - fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", - workspace->v_const[i][0], - workspace->v_const[i][1], - workspace->v_const[i][2], - r_s[i][0], - r_s[i][1], - r_s[i][2], - i); - exit (-1); - count ++; - } - } - //fprintf (stderr, "v_const mismatch count %d\n", count); + if (check_zero (host->E_Lp, device.E_Lp)){ + fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp); + exit (-1); + } - free (test); - free (r_s); - return true; - } + if (check_zero (host->E_Ov, device.E_Ov)){ + fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov); + exit (-1); + } - bool validate_data (reax_system *system, simulation_data *host) - { - simulation_data device; + if (check_zero (host->E_Un, device.E_Un)){ + fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un); + exit (-1); + } - copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__); + if (check_zero (host->E_Tor, device.E_Tor)) { + fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor); + exit (-1); + } - if (check_zero (host->E_BE, device.E_BE)){ - fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE); - exit (-1); - } + if (check_zero (host->E_Con, device.E_Con)) { + fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con); + exit (-1); + } - if (check_zero (host->E_Lp, device.E_Lp)){ - fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp); - exit (-1); - } + if (check_zero (host->ext_press, device.ext_press)) { + fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press); + exit (-1); + } - if (check_zero (host->E_Ov, device.E_Ov)){ - fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov); - exit (-1); - } + if (check_zero (host->E_HB, device.E_HB)) { + fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB); + exit (-1); + } - if (check_zero (host->E_Un, device.E_Un)){ - fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un); - exit (-1); - } + if (check_zero (host->E_Ang, device.E_Ang)) { + fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang); + 
exit (-1);
+    }
-        if (check_zero (host->E_Tor, device.E_Tor)) {
-            fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor);
-            exit (-1);
-        }
+    if (check_zero (host->E_Pen, device.E_Pen)) {
+        fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen);
+        exit (-1);
+    }
-        if (check_zero (host->E_Con, device.E_Con)) {
-            fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con);
-            exit (-1);
-        }
+    if (check_zero (host->E_Coa, device.E_Coa)) {
+        fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa);
+        exit (-1);
+    }
-        if (check_zero (host->ext_press, device.ext_press)) {
-            fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press);
-            exit (-1);
-        }
+    if (check_zero (host->E_vdW, device.E_vdW)) {
+        fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW);
+        exit (-1);
+    }
-        if (check_zero (host->E_HB, device.E_HB)) {
-            fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB);
-            exit (-1);
-        }
+    if (check_zero (host->E_Ele, device.E_Ele)) {
+        fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele);
+        exit (-1);
+    }
-        if (check_zero (host->E_Ang, device.E_Ang)) {
-            fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang);
-            exit (-1);
-        }
+    if (check_zero (host->E_Pol, device.E_Pol)) {
+        fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol);
+        exit (-1);
+    }
-        if (check_zero (host->E_Pen, device.E_Pen)) {
-            fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen);
-            exit (-1);
-        }
-        if (check_zero (host->E_Coa, device.E_Coa)) {
-            fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa);
-            exit (-1);
-        }
+    //fprintf (stderr, "Simulation Data match between host and device \n");
+    return TRUE;
+}
-        if (check_zero (host->E_vdW, device.E_vdW)) {
-            fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW);
-            exit (-1);
-        }
-        if (check_zero (host->E_Ele, device.E_Ele)) {
-            fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele);
-            exit (-1);
-        }
+void print_bond_data (bond_order_data *s)
+{
+    /*
+    fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ",
+            s->BO,
+            s->BO_s,
+            s->BO_pi,
+            s->BO_pi2 );
+    */
+    fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
+    fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
+    fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
+}
-        if (check_zero (host->E_Pol, device.E_Pol)) {
-            fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol);
-            exit (-1);
-        }
+void print_bond_list (reax_system *system, static_storage *workspace, list **lists)
+{
+    list *bonds = *lists + BONDS;
-        //fprintf (stderr, "Simulation Data match between host and device \n");
-        return true;
-    }
+    for (int i = 1; i < 2; i++)
+    {
+        fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
+        for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
+        {
+            bond_data *data = &bonds->select.bond_list [j];
+            fprintf (stderr, " %d, ", data->nbr );
+            print_bond_data (&data->bo_data);
+            fprintf (stderr, ")\n");
+        }
+    }
-    void print_bond_data (bond_order_data *s)
-    {
-        /*
-        fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ",
-                s->BO,
-                s->BO_s,
-                s->BO_pi,
-                s->BO_pi2 );
-        */
-        fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
-        fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
-        fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
-    }
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
+    list *d_bonds = dev_lists + BONDS;
+    bond_data *d_bond_data;
-    void print_bond_list (reax_system *system, static_storage *workspace, list **lists)
-    {
-        list *bonds = *lists + BONDS;
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-        for (int i = 1; i < 2; i++)
-        {
-            fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
-            for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
-            {
-                bond_data *data = &bonds->select.bond_list [j];
-                fprintf (stderr, " %d, ", data->nbr );
-                print_bond_data (&data->bo_data);
-                fprintf (stderr, ")\n");
-            }
-        }
+    copy_host_device ( b_start, d_bonds->index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < 2; i++)
+    {
+        fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
+            bond_data *src = &d_bond_data[j];
+            fprintf (stderr, " %d, ", src->nbr );
+            print_bond_data (&src->bo_data);
+            fprintf (stderr, ")\n");
+        }
+    }
+}
-        int *b_start = (int *) malloc (INT_SIZE * system->N);
-        int *b_end = (int *) malloc (INT_SIZE * system->N);
-        list *d_bonds = dev_lists + BONDS;
-        bond_data *d_bond_data;
-        d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+void count_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+{
+    list *three = *lists + THREE_BODIES;
+    list *bonds = *lists + BONDS;
-        copy_host_device ( b_start, d_bonds->index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( b_end, d_bonds->end_index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        for (int i = 0; i < 2; i++)
-        {
-            fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
-            for (int j = b_start[i]; j < b_end[i]; j ++) {
-                bond_data *src = &d_bond_data[j];
-                fprintf (stderr, " %d, ", src->nbr );
-                print_bond_data (&src->bo_data);
-                fprintf (stderr, ")\n");
-            }
-        }
-    }
+    list *d_three = dev_lists + THREE_BODIES;
+    list *d_bonds = dev_lists + BONDS;
+    bond_data *d_bond_data;
+    real *test;
+
+    three_body_interaction_data *data = (three_body_interaction_data *)
+        malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+    int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
+    int count;
+    int hcount, dcount;
+
+    copy_host_device ( start, d_three->index,
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( end, d_three->end_index,
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( data, d_three->select.three_body_list,
+            sizeof (three_body_interaction_data) * system->num_thbodies,
+            cudaMemcpyDeviceToHost, __LINE__);
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+    copy_host_device ( b_start, d_bonds->index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-    void count_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-    {
-        list *three = *lists + THREE_BODIES;
-        list *bonds = *lists + BONDS;
-
-        list *d_three = dev_lists + THREE_BODIES;
-        list *d_bonds = dev_lists + BONDS;
-        bond_data *d_bond_data;
-        real *test;
-
-        three_body_interaction_data *data = (three_body_interaction_data *)
-            malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-        int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-        int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-
-        int *b_start = (int *) malloc (INT_SIZE * system->N);
-        int *b_end = (int *) malloc (INT_SIZE * system->N);
-        int count;
-        int hcount, dcount;
-
-        copy_host_device ( start, d_three->index,
-                INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( end, d_three->end_index,
-                INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( data, d_three->select.three_body_list,
-                sizeof (three_body_interaction_data) * system->num_thbodies,
-                cudaMemcpyDeviceToHost, __LINE__);
-
-        d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-        copy_host_device ( b_start, d_bonds->index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( b_end, d_bonds->end_index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-        count = 0;
-        hcount = dcount = 0;
-        for (int i = 0; i < system->N; i++)
-        {
-            for (int j = b_start[i]; j < b_end[i]; j ++) {
-                dcount += end[j] - start[j];
-            }
-        }
+    count = 0;
+    hcount = dcount = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
+            dcount += end[j] - start[j];
+        }
+    }
-        fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount);
+    fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount);
-        free (data);
-        free (start);
-        free (end);
-        free (b_start);
-        free (b_end);
-        free (d_bond_data);
-    }
+    free (data);
+    free (start);
+    free (end);
+    free (b_start);
+    free (b_end);
+    free (d_bond_data);
+}
+int validate_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+{
+    list *three = *lists + THREE_BODIES;
+    list *bonds = *lists + BONDS;
-    bool validate_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-    {
-        list *three = *lists + THREE_BODIES;
-        list *bonds = *lists + BONDS;
+    list *d_three = dev_lists + THREE_BODIES;
+    list *d_bonds = dev_lists + BONDS;
+    bond_data *d_bond_data;
+    real *test;
-        list *d_three = dev_lists + THREE_BODIES;
-        list *d_bonds = dev_lists + BONDS;
-        bond_data *d_bond_data;
-        real *test;
+    three_body_interaction_data *data = (three_body_interaction_data *)
+        malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+    int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-        three_body_interaction_data *data = (three_body_interaction_data *)
-            malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-        int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-        int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
+    int count;
+    int hcount, dcount;
-        int *b_start = (int *) malloc (INT_SIZE * system->N);
-        int *b_end = (int *) malloc (INT_SIZE * system->N);
-        int count;
-        int hcount, dcount;
+    copy_host_device ( start, d_three->index,
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( end, d_three->end_index,
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( data, d_three->select.three_body_list,
+            sizeof (three_body_interaction_data) * system->num_thbodies,
+            cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( start, d_three->index,
-                INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( end, d_three->end_index,
-                INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( data, d_three->select.three_body_list,
-                sizeof (three_body_interaction_data) * system->num_thbodies,
-                cudaMemcpyDeviceToHost, __LINE__);
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-        d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+    copy_host_device ( b_start, d_bonds->index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( b_start, d_bonds->index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( b_end, d_bonds->end_index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    //test = (real *) malloc (REAL_SIZE * system->num_bonds);
+    //memset (test, 0, REAL_SIZE * system->num_bonds);
+    //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        //test = (real *) malloc (REAL_SIZE * system->num_bonds);
-        //memset (test, 0, REAL_SIZE * system->num_bonds);
-        //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++)
-        count = 0;
-        for (int i = 0; i < system->N; i++)
-        {
-            //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++)
+        hcount = dcount = 0;
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
+            dcount += end[j] - start[j];
+            hcount += Num_Entries (j, three);
-            hcount = dcount = 0;
-            for (int j = b_start[i]; j < b_end[i]; j ++) {
-                dcount += end[j] - start[j];
-                hcount += Num_Entries (j, three);
+            /*
+            if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three)))
+            {
+                fprintf (stderr, " Three body count does not match between host and device\n");
+                fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three));
+                fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]);
+            }
+            */
+        }
-            /*
-            if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three)))
-            {
-                fprintf (stderr, " Three body count does not match between host and device\n");
-                fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three));
-                fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]);
-            }
-            */
-        }
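[Editor's sketch, not part of the commit] The copy_host_device() helper leaned on throughout these checks lives elsewhere in the tree (cuda_utils); only its call shape is visible here. A minimal CUDA C sketch consistent with the call sites above, assuming abort-on-error semantics:

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    /* Sketch only: stage bytes between host and device, aborting on failure.
     * Argument order mirrors the call sites above: host pointer first, then
     * the device pointer, byte count, direction, and a caller tag (usually
     * __LINE__). The error handling is an assumption, not the project code. */
    void copy_host_device( void *host, void *dev, size_t size,
            cudaMemcpyKind dir, int tag )
    {
        cudaError_t ret;

        if ( dir == cudaMemcpyDeviceToHost )
        {
            ret = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost );
        }
        else
        {
            ret = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice );
        }

        if ( ret != cudaSuccess )
        {
            fprintf( stderr, "copy_host_device failed at tag %d: %s\n",
                    tag, cudaGetErrorString( ret ) );
            exit( -1 );
        }
    }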
+        if ((dcount != hcount)) {
-            if ((dcount != hcount)) {
+            fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount);
-                fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount);
+            for (int j = b_start[i]; j < b_end[i]; j ++) {
+                bond_order_data *src = &d_bond_data[j].bo_data;
+                dcount = end[j] - start[j];
+                hcount = Num_Entries (j, three);
+                fprintf (stderr, "device \n");
+                print_bond_data (src);
-                for (int j = b_start[i]; j < b_end[i]; j ++) {
-                    bond_order_data *src = &d_bond_data[j].bo_data;
-                    dcount = end[j] - start[j];
-                    hcount = Num_Entries (j, three);
-                    fprintf (stderr, "device \n");
-                    print_bond_data (src);
+                fprintf (stderr, "\n");
+                src = &bonds->select.bond_list[j].bo_data;
+                fprintf (stderr, "host \n");
+                print_bond_data (src);
+                fprintf (stderr, "\n");
-                    fprintf (stderr, "\n");
-                    src = &bonds->select.bond_list[j].bo_data;
-                    fprintf (stderr, "host \n");
-                    print_bond_data (src);
-                    fprintf (stderr, "\n");
+                //fprintf (stderr, "--- Device bo is %f \n", test[j]);
+                fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j], b_start[i], b_end[i],
+                        Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
+                fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
+                fprintf (stderr, "------\n");
+            }
+            fprintf (stderr, " Three Bodies count does not match between host and device \n");
+            exit (-1);
+        }
+    }
-                    //fprintf (stderr, "--- Device bo is %f \n", test[j]);
-                    fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],
-                            Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
-                    fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
-                    fprintf (stderr, "------\n");
-                }
-                fprintf (stderr, " Three Bodies count does not match between host and device \n");
-                exit (-1);
-            }
-        }
+    //fprintf (stderr, "Three body count on DEVICE %d HOST %d \n", dcount, hcount);
-        //fprintf (stderr, "Three body count on DEVICE %d HOST %d \n", dcount, hcount);
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        int x, y, z;
+        for (x = b_start[i]; x < b_end[i]; x++)
+        {
+            int t_start = start[x];
+            int t_end = end[x];
-        count = 0;
-        for (int i = 0; i < system->N; i++)
+            bond_data *dev_bond = &d_bond_data [x];
+            bond_data *host_bond;
+            for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
+            {
+                host_bond = &bonds->select.bond_list [z];
+                if ((dev_bond->nbr == host_bond->nbr) &&
+                        check_same (dev_bond->rel_box, host_bond->rel_box) &&
+                        !check_zero (dev_bond->dvec, host_bond->dvec) &&
+                        !check_zero (dev_bond->d, host_bond->d) )
-        {
-            int x, y, z;
-            for (x = b_start[i]; x < b_end[i]; x++)
-            {
-                int t_start = start[x];
-                int t_end = end[x];
-
-                bond_data *dev_bond = &d_bond_data [x];
-                bond_data *host_bond;
-                for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
-                {
-                    host_bond = &bonds->select.bond_list [z];
-                    if ((dev_bond->nbr == host_bond->nbr) &&
-                            check_same (dev_bond->rel_box, host_bond->rel_box) &&
-                            !check_zero (dev_bond->dvec, host_bond->dvec) &&
-                            !check_zero (dev_bond->d, host_bond->d) )
-                    {
-                        break;
-                    }
-                }
-                if (z >= End_Index (i, bonds)){
-                    fprintf (stderr, "Could not find the matching bond on host and device \n");
-                    exit (-1);
-                }
-
-                //find this bond in the bonds on the host side.
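[Editor's sketch, not part of the commit] The bond matching above rests on the check_zero()/check_same() predicates declared in validation.h further below, now as int-returning functions. A plausible tolerance-based sketch, assuming a small fixed epsilon rather than the project's actual threshold (the rvec overload is given a distinct name here to keep the sketch plain C):

    #include <math.h>

    typedef double real;
    typedef real rvec[3];
    typedef int ivec[3];

    #define GPU_TOLERANCE 1.0e-5   /* assumed threshold, not the project's value */

    /* nonzero (TRUE) when two scalars disagree beyond the tolerance */
    int check_zero( real p1, real p2 )
    {
        return fabs( p1 - p2 ) >= GPU_TOLERANCE;
    }

    /* the rvec overload in the real header; nonzero when any component differs */
    int check_zero_rvec( rvec p1, rvec p2 )
    {
        return fabs( p1[0] - p2[0] ) >= GPU_TOLERANCE
            || fabs( p1[1] - p2[1] ) >= GPU_TOLERANCE
            || fabs( p1[2] - p2[2] ) >= GPU_TOLERANCE;
    }

    /* nonzero (TRUE) only when two integer triples agree exactly */
    int check_same( ivec p1, ivec p2 )
    {
        return p1[0] == p2[0] && p1[1] == p2[1] && p1[2] == p2[2];
    }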
-
-                for (y = t_start; y < t_end; y++)
-                {
-
-                    three_body_interaction_data *device = data + y;
-                    three_body_interaction_data *host;
-
-                    //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
-
-                    int xx;
-                    for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
-                    {
-                        host = &three->select.three_body_list [xx];
-                        //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
-                        //if ((host->thb == device->thb) && (host->pthb == device->pthb))
-                        if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
-                        {
-                            count ++;
-                            break;
-                        }
-                    }
-
-                    if ( xx >= End_Index (z, three) ) {
-                        fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z,
-                                Start_Index (z, three), End_Index (z, three), start[x], end[x] );
-                        exit (-1);
-                    }// else fprintf (stderr, "----------------- \n");
-                }
-            }
+                {
+                    break;
                 }
-        free (data);
-        free (start);
-        free (end);
-        free (b_start);
-        free (b_end);
-        free (d_bond_data);
-
-        //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
-        return true;
             }
+            if (z >= End_Index (i, bonds)){
+                fprintf (stderr, "Could not find the matching bond on host and device \n");
+                exit (-1);
+            }
+
+            //find this bond in the bonds on the host side.
-    bool bin_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+            for (y = t_start; y < t_end; y++)
             {
-        list *d_three = dev_lists + THREE_BODIES;
-        list *d_bonds = dev_lists + BONDS;
-        list *three = *lists + THREE_BODIES;
-        list *bonds = *lists + BONDS;
-        bond_data *d_bond_data;
-
-        three_body_interaction_data *data = (three_body_interaction_data *)
-            malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-        int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-        int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-
-        int *b_start = (int *) malloc (INT_SIZE * system->N);
-        int *b_end = (int *) malloc (INT_SIZE * system->N);
-
-        int *a = (int *) malloc (2 * INT_SIZE * system->N );
-        int *b = (int *) malloc (2 * INT_SIZE * system->N );
-        int *c = (int *) malloc (2 * INT_SIZE * system->N );
-        int *d = (int *) malloc (2 * INT_SIZE * system->N );
-
-        for (int i = 0; i < 2 * system->N; i++)
-            a[i] = b[i] = c[i] = d[i] = -1;
-
-        int count;
-        int hcount, dcount;
-        int index_a, index_b, index_c, index_d;
-        index_a = index_b = index_c = index_d = 0;
-
-        copy_host_device ( start, d_three->index,
-                INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( end, d_three->end_index,
-                INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( data, d_three->select.three_body_list,
-                sizeof (three_body_interaction_data) * system->num_thbodies,
-                cudaMemcpyDeviceToHost, __LINE__);
-
-        d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-        copy_host_device ( b_start, d_bonds->index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device ( b_end, d_bonds->end_index,
-                INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-        copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-        count = 0;
-        hcount = dcount = 0;
-
-        /*
-        for (int i = 0; i < 20; i++)
-        {
-            for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
-            {
-                for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++)
-                {
-                    three_body_interaction_data *host = &three->select.three_body_list [k];
-                    fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d)\n",
-                            i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb );
-
-                }
-            }
-        }
-        exit (-1);
-        */
-
-        count = 0;
-        for (int i = 0; i < system->N; i++)
-        {
-            for (int j = b_start[i]; j < b_end[i]; j ++) {
-                /*
-                bond_data *src;
-                src = &d_bond_data[j];
-                fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr );
-                */
+                three_body_interaction_data *device = data + y;
+                three_body_interaction_data *host;
-                for (int x = start[j]; x < end[j]; x ++)
-                {
-                    three_body_interaction_data *device = data + x;
-
-                    int center = device->j;
-                    int d_i = device->i;
-                    int d_k = device->k;
-
-
-                    //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n",
-                    //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, device->pthb);
-
-                    if ((a[system->N + center] != -1)) {
-                        a[d_i] = a[d_k] = 1;
-                        continue;
-                    } else if ((b[system->N + center] != -1)) {
-                        b[d_i] = b[d_k] = 1;
-                        continue;
-                    } else if ((c[system->N + center] != -1)) {
-                        c[d_i] = c[d_k] = 1;
-                        continue;
-                    } else if ((d[system->N + center] != -1)) {
-                        d[d_i] = d[d_k] = 1;
-                        continue;
-                    }
-
-                    if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) {
-                        a[center] = a[d_i] = a[d_k] = 1;
-                        a[system->N + center] = 1;
-                    } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) {
-                        b[center] = b[d_i] = b[d_k] = 1;
-                        b[system->N + center] = 1;
-                    } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) {
-                        c[center] = c[d_i] = c[d_k] = 1;
-                        c[system->N + center] = 1;
-                    } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) {
-                        d[center] = d[d_i] = d[d_k] = 1;
-                        d[system->N + center]= 1;
-                    }
-                    else {
-                        count ++;
-                        break;
-                        fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n",
-                                i, b_start[i], b_end[i], j, center, d_i, d_k);
-                        fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n",
-                                a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]);
-                        fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n",
-                                b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]);
-                        fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n",
-                                c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]);
-                        fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n",
-                                d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]);
-
-                    }
-                }
-            }
-        }
-        fprintf (stderr, "Miscount is %d \n", count);
-        exit (-1);
+                //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
-        count = 0;
-        for (int i = 0; i < system->N; i++)
+                int xx;
+                for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
                 {
-            if (a[system->N + i] != -1) count ++;
-            if (b[system->N + i] != -1) count ++;
-            if (c[system->N + i] != -1) count ++;
-            if (d[system->N + i] != -1) count ++;
+                    host = &three->select.three_body_list [xx];
+                    //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
+                    //if ((host->thb == device->thb) && (host->pthb == device->pthb))
+                    if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
+                    {
+                        count ++;
+                        break;
+                    }
                 }
-        fprintf (stderr, "binned so many atoms --> %d \n", count );
+                if ( xx >= End_Index (z, three) ) {
+                    fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z,
+                            Start_Index (z, three), End_Index (z, three), start[x], end[x] );
+                    exit (-1);
+                }// else fprintf (stderr, "----------------- \n");
             }
+        }
+    }
+    free (data);
+    free (start);
+    free (end);
+    free (b_start);
+    free (b_end);
+    free (d_bond_data);
-    bool validate_grid (reax_system *system)
-    {
-        int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2];
-        int count = 0;
+    //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
+    return TRUE;
+}
-        int *dtop = (int *) malloc (INT_SIZE * total );
-        copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__);
-        for (int i = 0; i < total; i++){
-            if (system->g.top[i] != dtop[i]){
-                fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i );
-                exit (-1);
-            }
-        }
-        free (dtop);
+int bin_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+{
+    list *d_three = dev_lists + THREE_BODIES;
+    list *d_bonds = dev_lists + BONDS;
+    list *three = *lists + THREE_BODIES;
+    list *bonds = *lists + BONDS;
+    bond_data *d_bond_data;
-        int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms);
-        copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__);
-        for (int i = 0; i < total*system->d_g.max_atoms; i++){
-            if (system->g.atoms[i] != datoms[i]){
-                fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i );
-                exit (-1);
-            }
-        }
-        free (datoms);
+    three_body_interaction_data *data = (three_body_interaction_data *)
+        malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+    int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-        ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs);
-        copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
-        for (int i = 0; i < total*system->d_g.max_nbrs; i++){
-            if (!check_same (system->g.nbrs[i], dnbrs[i])){
-                fprintf (stderr, " nbrs count does not match @ index %d \n", i );
-                exit (-1);
-            }
-        }
-        free (dnbrs);
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
-        rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs);
-        copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
-        for (int i = 0; i < total*system->d_g.max_nbrs; i++){
-            if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){
-                fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i );
-                exit (-1);
-            }
-        }
-        free (dnbrs_cp);
+    int *a = (int *) malloc (2 * INT_SIZE * system->N );
+    int *b = (int *) malloc (2 * INT_SIZE * system->N );
+    int *c = (int *) malloc (2 * INT_SIZE * system->N );
+    int *d = (int *) malloc (2 * INT_SIZE * system->N );
-        //fprintf (stderr, " Grid match between device and host \n");
-        return true;
-    }
+    for (int i = 0; i < 2 * system->N; i++)
+        a[i] = b[i] = c[i] = d[i] = -1;
-    void print_atoms (reax_system *system)
-    {
-        int start, end, index;
+    int count;
+    int hcount, dcount;
+    int index_a, index_b, index_c, index_d;
+    index_a = index_b = index_c = index_d = 0;
-        reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
-        copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+    copy_host_device ( start, d_three->index,
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( end, d_three->end_index,
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( data, d_three->select.three_body_list,
+            sizeof (three_body_interaction_data) * system->num_thbodies,
+            cudaMemcpyDeviceToHost, __LINE__);
-        //for (int i = 0; i < system->N; i++)
-        for (int i = 0; i < 10; i++)
-        {
-            fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type);
-            fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] );
-            fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] );
-            fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] );
-            fprintf (stderr, " q(%6.10f) \n", test[i].q );
-        }
-    }
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+
+    copy_host_device ( b_start, d_bonds->index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index,
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+    count = 0;
+    hcount = dcount = 0;
+
+    /*
+    for (int i = 0; i < 20; i++)
+    {
+        for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
+        {
+            for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++)
+            {
+                three_body_interaction_data *host = &three->select.three_body_list [k];
+                fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d)\n",
+                        i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb );
+
+            }
+        }
+    }
+    exit (-1);
+    */
+
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
-    void print_sys_atoms (reax_system *system)
+            /*
+            bond_data *src;
+            src = &d_bond_data[j];
+            fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr );
+            */
+
+            for (int x = start[j]; x < end[j]; x ++)
             {
-        for (int i = 0; i < 10; i++)
-        {
-            fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type);
-            fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );
-            fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] );
-            fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
-            fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q );
+                three_body_interaction_data *device = data + x;
+
+                int center = device->j;
+                int d_i = device->i;
+                int d_k = device->k;
+
+
+                //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n",
+                //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, device->pthb);
+
+                if ((a[system->N + center] != -1)) {
+                    a[d_i] = a[d_k] = 1;
+                    continue;
+                } else if ((b[system->N + center] != -1)) {
+                    b[d_i] = b[d_k] = 1;
+                    continue;
+                } else if ((c[system->N + center] != -1)) {
+                    c[d_i] = c[d_k] = 1;
+                    continue;
+                } else if ((d[system->N + center] != -1)) {
+                    d[d_i] = d[d_k] = 1;
+                    continue;
                 }
-        }
+                if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) {
+                    a[center] = a[d_i] = a[d_k] = 1;
+                    a[system->N + center] = 1;
+                } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) {
+                    b[center] = b[d_i] = b[d_k] = 1;
+                    b[system->N + center] = 1;
+                } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) {
+                    c[center] = c[d_i] = c[d_k] = 1;
+                    c[system->N + center] = 1;
+                } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) {
+                    d[center] = d[d_i] = d[d_k] = 1;
+                    d[system->N + center]= 1;
+                }
+                else {
+                    count ++;
+                    break;
+                    fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n",
+                            i, b_start[i], b_end[i], j, center, d_i, d_k);
+                    fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n",
+                            a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]);
+                    fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n",
+                            b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]);
+                    fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n",
+                            c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]);
+                    fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n",
+                            d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]);
-    void print_grid (reax_system *system)
-    {
-        int i, j, k, x;
-        grid *g = &system->g;
-
-        for( i = 0; i < g->ncell[0]; i++ )
-            for( j = 0; j < g->ncell[1]; j++ )
-                for( k = 0; k < g->ncell[2]; k++ ){
-                    fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k);
-                    for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){
-                        fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]);
-                    }
-                    fprintf (stderr, ")\n");
-                }
+                }
             }
+        }
+    }
+    fprintf (stderr, "Miscount is %d \n", count);
+    exit (-1);
+
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        if (a[system->N + i] != -1) count ++;
+        if (b[system->N + i] != -1) count ++;
+        if (c[system->N + i] != -1) count ++;
+        if (d[system->N + i] != -1) count ++;
+    }
+
+    fprintf (stderr, "binned so many atoms --> %d \n", count );
+}
+
+
+int validate_grid (reax_system *system)
+{
+    int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2];
+    int count = 0;
+    int *dtop = (int *) malloc (INT_SIZE * total );
+    copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__);
+
+    for (int i = 0; i < total; i++){
+        if (system->g.top[i] != dtop[i]){
+            fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i );
+            exit (-1);
+        }
+    }
+    free (dtop);
+
+    int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms);
+    copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total*system->d_g.max_atoms; i++){
+        if (system->g.atoms[i] != datoms[i]){
+            fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i );
+            exit (-1);
+        }
+    }
+    free (datoms);
+    ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs);
+    copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total*system->d_g.max_nbrs; i++){
+        if (!check_same (system->g.nbrs[i], dnbrs[i])){
+            fprintf (stderr, " nbrs count does not match @ index %d \n", i );
+            exit (-1);
+        }
+    }
+    free (dnbrs);
+
+    rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs);
+    copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total*system->d_g.max_nbrs; i++){
+        if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){
+            fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i );
+            exit (-1);
+        }
+    }
+    free (dnbrs_cp);
+
+    //fprintf (stderr, " Grid match between device and host \n");
+    return TRUE;
+}
+
+
+void print_atoms (reax_system *system)
+{
+    int start, end, index;
+
+    reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
+    copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+
+    //for (int i = 0; i < system->N; i++)
+    for (int i = 0; i < 10; i++)
+    {
+        fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type);
+        fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] );
+        fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] );
+        fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] );
+        fprintf (stderr, " q(%6.10f) \n", test[i].q );
+    }
+}
+
+
+void print_sys_atoms (reax_system *system)
+{
+    for (int i = 0; i < 10; i++)
+    {
+        fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type);
+        fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );
+        fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] );
+        fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
+        fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q );
+    }
+}
+
+
+void print_grid (reax_system *system)
+{
+    int i, j, k, x;
+    grid *g = &system->g;
+
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ){
+                fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k);
+                for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){
+                    fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]);
+                }
+                fprintf (stderr, ")\n");
+            }
+}
diff --git a/PuReMD-GPU/src/validation.h b/PuReMD-GPU/src/validation.h
index 5ef9d2f37393c37eb4430238bdf0c68734842bcc..5eccf7d94f8c7716d45b45305155b0492b70dda2 100644
--- a/PuReMD-GPU/src/validation.h
+++ b/PuReMD-GPU/src/validation.h
@@ -23,35 +23,35 @@
 #include "mytypes.h"

-bool check_zero (real , real );
-bool check_zero (rvec , rvec );
-bool check_same (ivec , ivec );
+int check_zero (real , real );
+int check_zero (rvec , rvec );
+int check_same (ivec , ivec );

-bool validate_box (simulation_box *host, simulation_box *dev);
-bool validate_atoms (reax_system *, list **);
-bool validate_grid (reax_system *);
+int validate_box (simulation_box *host, simulation_box *dev);
+int validate_atoms (reax_system *, list **);
+int validate_grid (reax_system *);

-bool validate_bonds (reax_system *, static_storage *, list **);
-bool validate_hbonds (reax_system *, static_storage *, list **);
-bool validate_sym_dbond_indices (reax_system *, static_storage *, list **);
-bool validate_three_bodies (reax_system *, static_storage *, list **);
+int validate_bonds (reax_system *, static_storage *, list **);
+int validate_hbonds (reax_system *, static_storage *, list **);
+int validate_sym_dbond_indices (reax_system *, static_storage *, list **);
+int validate_three_bodies (reax_system *, static_storage *, list **);

 void count_three_bodies (reax_system *system, static_storage *workspace, list **lists);

-bool bin_three_bodies (reax_system *, static_storage *, list **);
+int bin_three_bodies (reax_system *, static_storage *, list **);

-bool validate_sort_matrix (reax_system *, static_storage *);
-bool validate_sparse_matrix (reax_system *, static_storage *);
-bool validate_lu (static_storage *);
+int validate_sort_matrix (reax_system *, static_storage *);
+int validate_sparse_matrix (reax_system *, static_storage *);
+int validate_lu (static_storage *);

 void print_sparse_matrix (reax_system *, static_storage *);
 void print_bond_list (reax_system *, static_storage *, list **);

-bool validate_workspace (reax_system *, static_storage *, list **);
-bool validate_neighbors (reax_system *, list **lists);
+int validate_workspace (reax_system *, static_storage *, list **);
+int validate_neighbors (reax_system *, list **lists);

-bool validate_data (reax_system *, simulation_data *);
+int validate_data (reax_system *, simulation_data *);

-bool analyze_hbonds (reax_system *, static_storage *, list **);
+int analyze_hbonds (reax_system *, static_storage *, list **);

 void Print_Matrix (sparse_matrix *);
 void Print_Matrix_L (sparse_matrix *);
diff --git a/PuReMD-GPU/src/vector.cu b/PuReMD-GPU/src/vector.c
similarity index 100%
rename from PuReMD-GPU/src/vector.cu
rename to PuReMD-GPU/src/vector.c
diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h
index 336534784e50fd5552aaf27ff0c9531c3b97a02a..e1111e514928e79fc79197a0f2486d5eefb1cfa3 100644
--- a/PuReMD-GPU/src/vector.h
+++ b/PuReMD-GPU/src/vector.h
@@ -22,8 +22,14 @@
 #define __VECTOR_H_

 #include "mytypes.h"
+
 #include "random.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+

 int Vector_isZero( real*, int );
 void Vector_MakeZero( real*, int );
 void Vector_Copy( real*, real*, int );
@@ -33,8 +39,6 @@ void Vector_Copy( real*, real*, int );
 void Vector_Print( FILE*, char*, real*, int );
 real Norm( real*, int );

-HOST_DEVICE inline real Dot( real*, real*, int );
-
 void rvec_Sum( rvec, rvec, rvec );
 real rvec_ScaledDot( real, rvec, real, rvec );
 void rvec_Multiply( rvec, rvec, rvec );
@@ -44,19 +48,6 @@ void rvec_Invert( rvec, rvec );
 void rvec_OuterProduct( rtensor, rvec, rvec );
 int rvec_isZero( rvec );

-HOST_DEVICE inline real rvec_Dot( rvec, rvec );
-HOST_DEVICE inline void rvec_Scale( rvec, real, rvec );
-HOST_DEVICE inline real rvec_Norm_Sqr( rvec );
-HOST_DEVICE inline void rvec_Random( rvec );
-HOST_DEVICE inline void rvec_MakeZero( rvec );
-HOST_DEVICE inline void rvec_Add( rvec, rvec );
-HOST_DEVICE inline void rvec_Copy( rvec, rvec );
-HOST_DEVICE inline void rvec_Cross( rvec, rvec, rvec );
-HOST_DEVICE inline void rvec_ScaledAdd( rvec, real, rvec );
-HOST_DEVICE inline void rvec_ScaledSum( rvec, real, rvec, real, rvec );
-HOST_DEVICE inline void rvec_iMultiply( rvec, ivec, rvec );
-HOST_DEVICE inline real rvec_Norm( rvec );
-
 void rtensor_MakeZero( rtensor );
 void rtensor_Multiply( rtensor, rtensor, rtensor );
 void rtensor_MatVec( rvec, rtensor, rvec );
@@ -80,16 +71,7 @@ void ivec_MakeZero( ivec );

 void ivec_rScale( ivec, real, rvec );

-HOST_DEVICE inline void ivec_Copy( ivec, ivec );
-HOST_DEVICE inline void ivec_Scale( ivec, real, ivec );
-HOST_DEVICE inline void ivec_Sum( ivec, ivec, ivec );
-
-/*
- * Code which is common to multiple HOST and DEVICE
- *
- */
-
-HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
+static inline HOST_DEVICE real Dot( real* v1, real* v2, int k )
 {
     real ret = 0;
@@ -100,102 +82,109 @@ HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
 }

-
-
 /////////////////////////////
 //rvec functions
 /////////////////////////////
-
-HOST_DEVICE inline void rvec_MakeZero( rvec v )
+static inline HOST_DEVICE void rvec_MakeZero( rvec v )
 {
     v[0] = v[1] = v[2] = ZERO;
 }

-HOST_DEVICE inline void rvec_Add( rvec ret, rvec v )
+
+static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
 {
     ret[0] += v[0];
     ret[1] += v[1];
     ret[2] += v[2];
 }

-HOST_DEVICE inline void rvec_Copy( rvec dest, rvec src )
+
+static inline HOST_DEVICE void rvec_Copy( rvec dest, rvec src )
 {
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
 }

-HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
+
+static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
 {
     ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
     ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
     ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
 }

-HOST_DEVICE inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
+
+static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, real c, rvec v )
 {
     ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
 }

-HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
+
+static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
 {
     ret[0] = c1 * v1[0] + c2 * v2[0];
     ret[1] = c1 * v1[1] + c2 * v2[1];
     ret[2] = c1 * v1[2] + c2 * v2[2];
 }

-HOST_DEVICE inline void rvec_Random( rvec v )
+
+static inline HOST_DEVICE void rvec_Random( rvec v )
 {
     v[0] = Random(2.0) - 1.0;
     v[1] = Random(2.0) - 1.0;
     v[2] = Random(2.0) - 1.0;
 }

-HOST_DEVICE inline real rvec_Norm_Sqr( rvec v )
+
+static inline HOST_DEVICE real rvec_Norm_Sqr( rvec v )
 {
     return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
 }

-HOST_DEVICE inline void rvec_Scale( rvec ret, real c, rvec v )
+
+static inline HOST_DEVICE void rvec_Scale( rvec ret, real c, rvec v )
 {
     ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
 }

-HOST_DEVICE inline real rvec_Dot( rvec v1, rvec v2 )
+
+static inline HOST_DEVICE real rvec_Dot( rvec v1, rvec v2 )
 {
     return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
 }

-HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
+
+static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
 {
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
     r[2] = v1[2] * v2[2];
 }

-HOST_DEVICE inline real rvec_Norm( rvec v )
+
+static inline HOST_DEVICE real rvec_Norm( rvec v )
 {
     return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
 }

-
 /////////////////
 //ivec functions
 /////////////////
-
-HOST_DEVICE inline void ivec_Copy( ivec dest , ivec src )
+static inline HOST_DEVICE void ivec_Copy( ivec dest , ivec src )
 {
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
 }

-HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src )
+
+static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
 {
     dest[0] = C * src[0];
     dest[1] = C * src[1];
     dest[2] = C * src[2];
 }

-HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
+
+static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 {
     dest[0] = v1[0] + v2[0];
     dest[1] = v1[1] + v2[1];
@@ -203,26 +192,32 @@ HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 }

-
 /////////////////
 //vector functions
 /////////////////
-HOST_DEVICE inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+static inline HOST_DEVICE void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] = c * v[k] + d * y[k];
 }

-HOST_DEVICE inline void Vector_Scale( real* dest, real c, real* v, int k )
+
+static inline HOST_DEVICE void Vector_Scale( real* dest, real c, real* v, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] = c * v[k];
 }

-HOST_DEVICE inline void Vector_Add( real* dest, real c, real* v, int k )
+
+static inline HOST_DEVICE void Vector_Add( real* dest, real c, real* v, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] += c * v[k];
 }

+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 835ffa358e4e6ff995e2f69dc694bb1c3f7798c1..d08dfe1ed75780bc90b5aa65bb13462393d9af47 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -36,7 +36,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
     real H;

     for ( i = 0; i < N; ++i )
+    {
         b[i][0] = b[i][1] = 0;
+    }

     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -64,7 +66,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-             rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
+        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
@@ -86,13 +88,17 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         t_start = Get_Time( );
     }
 #endif
+
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
     dual_Sparse_MatVec( H, x, workspace->q2, N );
     // tryQEq
     Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
 #endif

     for ( j = 0; j < system->n; ++j )
@@ -126,6 +132,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
     //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
         Update_Timing_Info( &t_start, &dot_time );
@@ -137,9 +144,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
         // tryQEq
         Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif

         /* dot product: d.q */
@@ -174,12 +184,18 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
         //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
+
         if ( sqrt(sig_new[0]) / b_norm[0] <= tol || sqrt(sig_new[1]) / b_norm[1] <= tol )
+        {
             break;
+        }

         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
@@ -194,30 +210,41 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     if ( sqrt(sig_new[0]) / b_norm[0] <= tol )
     {
         for ( j = 0; j < n; ++j )
+        {
             workspace->t[j] = workspace->x[j][1];
-        matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
-                      mpi_data, fout );
+        }
+        matvecs = CG( system, workspace, H, workspace->b_t, tol,
+                workspace->t, mpi_data, fout );
         for ( j = 0; j < n; ++j )
+        {
             workspace->x[j][1] = workspace->t[j];
+        }
     }
     else if ( sqrt(sig_new[1]) / b_norm[1] <= tol )
     {
         for ( j = 0; j < n; ++j )
+        {
             workspace->s[j] = workspace->x[j][0];
+        }
         matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-                      mpi_data, fout );
+                mpi_data, fout );
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->x[j][0] = workspace->s[j];
+        }
     }
-
     if ( i >= 300 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
+    }

 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n",
-                 i + 1, matvecs, matvec_time, dot_time );
+    {
+        fprintf( fout, "QEq %d + %d iters. matvecs: %f dot: %f\n", i + 1,
+                matvecs, matvec_time, dot_time );
+    }
 #endif

     return (i + 1) + matvecs;
@@ -230,7 +257,9 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
     real H;

     for ( i = 0; i < N; ++i )
+    {
         b[i] = 0;
+    }

     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -249,8 +278,8 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 }

-int CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, scale;
     real tmp, alpha, beta, b_norm;
@@ -269,21 +298,29 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
     Sparse_MatVec( H, x, workspace->q, system->N );
     // tryQEq
     Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
 #endif

     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
     for ( j = 0; j < system->n; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    }

     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
     sig0 = sig_new;
+
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
 #endif

     for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i )
@@ -292,9 +329,12 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Sparse_MatVec( H, workspace->d, workspace->q, system->N );
         //tryQEq
         Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
 #endif

         tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
@@ -303,15 +343,20 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );

         /* pre-conditioning */
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
         beta = sig_new / sig_old;
         Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
+
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
 #endif
     }

@@ -323,8 +368,10 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
-        fprintf( fout, "QEq %d iters. matvecs: %f dot: %f\n",
-                 i, matvec_time, dot_time );
+    {
+        fprintf( fout, "QEq %d iters. matvecs: %f dot: %f\n", i, matvec_time,
+                dot_time );
+    }
 #endif

     return i;
@@ -332,7 +379,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,

 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
-             real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
     int i, j, scale;
     real tmp, alpha, beta, b_norm;
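[Editor's sketch, not part of the commit] For orientation: underneath the Dist/Coll exchanges and the MPI_Allreduce/Parallel_Dot reductions, CG() and dual_CG() above implement ordinary Jacobi-preconditioned conjugate gradient, with workspace->Hdia_inv holding the inverse matrix diagonal. A dense, serial sketch of the same iteration (an illustration under those assumptions, not the project's sparse MPI implementation):

    #include <math.h>
    #include <stdlib.h>

    typedef double real;

    /* Solve A x = b for a symmetric positive definite, row-major, dense A.
     * Returns the iteration count, mirroring the return value of CG(). */
    int cg_dense_sketch( const real *A, const real *b, real *x, int n,
            real tol, int max_iter )
    {
        real *r = (real *) malloc( n * sizeof(real) );
        real *d = (real *) malloc( n * sizeof(real) );
        real *q = (real *) malloc( n * sizeof(real) );
        real alpha, beta, sig_new, sig_old, b_norm, dq;
        int i, j, iter;

        /* r = b - A*x; d = M^{-1} r with M = diag(A) (the Hdia_inv role) */
        for ( i = 0; i < n; ++i )
        {
            real Ax_i = 0.0;
            for ( j = 0; j < n; ++j )
                Ax_i += A[i * n + j] * x[j];
            r[i] = b[i] - Ax_i;
            d[i] = r[i] / A[i * n + i];
        }

        b_norm = 0.0;
        for ( i = 0; i < n; ++i )
            b_norm += b[i] * b[i];
        b_norm = sqrt( b_norm );

        sig_new = 0.0;
        for ( i = 0; i < n; ++i )
            sig_new += r[i] * d[i];

        /* same convergence test as the loops above: sqrt(sig_new)/|b| > tol */
        for ( iter = 1; iter < max_iter && sqrt(sig_new) / b_norm > tol; ++iter )
        {
            /* q = A*d, then the step length alpha = sig_new / (d . q) */
            dq = 0.0;
            for ( i = 0; i < n; ++i )
            {
                q[i] = 0.0;
                for ( j = 0; j < n; ++j )
                    q[i] += A[i * n + j] * d[j];
                dq += d[i] * q[i];
            }
            alpha = sig_new / dq;

            sig_old = sig_new;
            sig_new = 0.0;
            for ( i = 0; i < n; ++i )
            {
                x[i] += alpha * d[i];
                r[i] -= alpha * q[i];
                /* re-apply the Jacobi preconditioner: p = M^{-1} r */
                sig_new += r[i] * ( r[i] / A[i * n + i] );
            }

            /* d = p + beta * d, as in the Vector_Sum calls above */
            beta = sig_new / sig_old;
            for ( i = 0; i < n; ++i )
                d[i] = r[i] / A[i * n + i] + beta * d[i];
        }

        free( r );
        free( d );
        free( q );
        return iter;
    }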