diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index bb4ebe45fd54575213cc93ccf18ae9031ce9846f..016114db2dfaf3b9af623069b2f5f3ffc677907a 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -28,6 +28,7 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
 	src/reset_utils.c src/single_body_interactions.c \
 	src/system_props.c src/three_body_interactions.c \
 	src/traj.c src/two_body_interactions.c src/vector.c \
+	src/testmd.c \
 	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/cuda_reduction.cu \
 	src/cuda_center_mass.cu src/cuda_box.cu src/validation.cu \
 	src/cuda_allocate.cu src/cuda_bond_orders.cu \
@@ -38,8 +39,7 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
 	src/cuda_reset_utils.cu src/cuda_single_body_interactions.cu \
 	src/cuda_system_props.cu src/cuda_three_body_interactions.cu \
 	src/cuda_two_body_interactions.cu src/cuda_environment.cu \
-	src/cuda_post_evolve.cu \
-	src/testmd.c
+	src/cuda_post_evolve.cu
 include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
 	src/restart.h src/param.h src/pdb_tools.h src/box.h \
 	src/lin_alg.h src/QEq.h src/allocate.h src/bond_orders.h \
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index 9736714d5d49615e9d6e387feed9a16c37f65348..30e7e0bff6bc3115193c75d3b2d0dcbe29a5cf4b 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -42,19 +42,6 @@ AC_SEARCH_LIBS([gzeof], [z])
 AC_SEARCH_LIBS([gzgets], [z])
 AC_SEARCH_LIBS([gzseek], [z])
 AC_SEARCH_LIBS([gzclose, [z]])
-AC_SEARCH_LIBS([cublasCheckError], [cublas])
-AC_SEARCH_LIBS([cublasDnrm2], [cublas])
-AC_SEARCH_LIBS([cublasDaxpy], [cublas])
-AC_SEARCH_LIBS([cublasDscal], [cublas])
-AC_SEARCH_LIBS([cublasDdot], [cublas])
-AC_SEARCH_LIBS([cudaThreadSynchronize], [cuda])
-AC_SEARCH_LIBS([cudaCheckError], [cuda])
-# FIXME: Replace `main' with a function in `-lcudart':
-AC_CHECK_LIB([cudart], [main])
-AC_SEARCH_LIBS([cusparseCheckError], [cusparse])
-AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_CHECK_HEADER_STDBOOL
@@ -78,10 +65,26 @@ then
 fi
 
 AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
-if test "BUILD_PROF" = "true"
-then
-    NVCCFLAGS+=" --compiler-options ${gprof_flags}"
-fi
+AC_SEARCH_LIBS([cublasDnrm2], [cublas])
+AC_SEARCH_LIBS([cublasDaxpy], [cublas])
+AC_SEARCH_LIBS([cublasDscal], [cublas])
+AC_SEARCH_LIBS([cublasDdot], [cublas])
+AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
+AC_SEARCH_LIBS([cudaGetLastError], [cudart])
+AC_CHECK_LIB([cudart], [cudaMalloc])
+AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
+
+AC_SEARCH_LIBS([cublasDnrm2], [cublas],
+    [CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
+AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
+    [AC_MSG_ERROR([Unable to find CUBLAS library.])])
+
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
+    [CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
+AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
+    [AC_MSG_ERROR([Unable to find CUSPARSE library.])])
 AC_CHECK_TYPES([cublasHandle_t], [],
     [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])],
     [#include<cublas_v2.h>])
@@ -90,6 +93,12 @@ AC_CHECK_TYPES([cusparseHandle_t], [],
 AC_CHECK_TYPES([cusparseMatDescr_t], [],
     [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])],
     [#include<cusparse_v2.h>])
 
+if test "BUILD_PROF" = "true"
+then
+    NVCCFLAGS+=" --compiler-options ${gprof_flags}"
+fi
+
+
 AC_CONFIG_FILES([Makefile])
 AC_OUTPUT
diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c
index e24a67b5ba2aa1ddfb17858d5977b2ec57000411..65f0eb2a872673259d508f17fc0da43530a7426f 100644
--- a/PuReMD-GPU/src/allocate.c
+++ b/PuReMD-GPU/src/allocate.c
@@ -41,7 +41,7 @@ void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 }
 
 
-int Allocate_Matrix( sparse_matrix *H, int n, int m )
+HOST int Allocate_Matrix( sparse_matrix *H, int n, int m )
 {
     H->n = n;
     H->m = m;
diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h
index 4ec4b541ec2f764cbbf583c4262b2793253a4e99..b03ed80b34f153b9929ccaa80bc5c27fbf6ce540 100644
--- a/PuReMD-GPU/src/allocate.h
+++ b/PuReMD-GPU/src/allocate.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Reallocate( reax_system*, static_storage*, list**, int );
 
 int Allocate_Matrix( sparse_matrix*, int, int );
@@ -32,5 +37,9 @@ int Allocate_HBond_List( int, int, int*, int*, list* );
 
 int Allocate_Bond_List( int, int*, list* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h
index a2b6b1a9f1c4d68d1e4e94c6718aa0ddd46d915a..418aa6208a81fb05ee56ff09afc6ff76751f75c9 100644
--- a/PuReMD-GPU/src/box.h
+++ b/PuReMD-GPU/src/box.h
@@ -63,7 +63,7 @@ real Metric_Product( rvec, rvec, simulation_box* );
 void Print_Box_Information( simulation_box*, FILE* );
 
 
-HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
+static inline HOST_DEVICE real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
 {
     real norm = 0.0;
 
@@ -97,7 +97,7 @@ HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box
 }
 
 
-HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
+static inline HOST_DEVICE void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
 {
     int i;
     real tmp;
diff --git a/PuReMD-GPU/src/cuda_QEq.cu b/PuReMD-GPU/src/cuda_QEq.cu
index f11271eecd8112d33d53451c62e02e586e1dbf38..033945338aa76aa4c02909f90d2ca3eb1dacad58 100644
--- a/PuReMD-GPU/src/cuda_QEq.cu
+++ b/PuReMD-GPU/src/cuda_QEq.cu
@@ -38,7 +38,7 @@
 #include "validation.h"
 
 
-GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
+GLOBAL void Cuda_Sort_Matrix_Rows( sparse_matrix A )
 {
     int i;
     int si, ei;
@@ -54,7 +54,7 @@ GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
 }
 
 
-GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol )
+GLOBAL void Cuda_Calculate_Droptol( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int k, j, offset, x, diagonal;
@@ -78,7 +78,7 @@ GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol
 }
 
 
-GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol )
+GLOBAL void Cuda_Calculate_Droptol_js( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int k, j, offset, x, diagonal;
@@ -98,7 +98,7 @@ GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real d
 }
 
 
-GLOBAL void Cuda_Calculate_Droptol_diagonal ( sparse_matrix p_A, real *droptol, real dtol )
+GLOBAL void Cuda_Calculate_Droptol_diagonal( sparse_matrix p_A, real *droptol, real dtol )
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     int k, j, offset, x, diagonal;
@@ -118,7 +118,7 @@ GLOBAL void Cuda_Calculate_Droptol_diagonal ( sparse_matrix p_A, real *droptol,
 }
 
 
-GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin)
+GLOBAL void Cuda_Estimate_LU_Fill( sparse_matrix p_A, real *droptol, int *fillin )
 {
     int i, j, pj;
     real val;
@@ -402,7 +402,7 @@ void Cuda_Fill_U ( sparse_matrix *A, real *droptol,
 */
 
 
-void Cuda_Init_MatVec(reax_system *system, control_params *control,
+void Cuda_Init_MatVec( reax_system *system, control_params *control,
         simulation_data *data, static_storage *workspace, list *far_nbrs )
 {
     int i, fillin;
@@ -416,8 +416,8 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
     {
         Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H );
-        cudaThreadSynchronize();
-        cudaCheckError();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, "Sorting done... \n");
@@ -425,8 +425,8 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
 
         Cuda_Calculate_Droptol<<<BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H, dev_workspace->droptol, control->droptol );
-        cudaThreadSynchronize();
-        cudaCheckError();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
         fprintf (stderr, "Droptol done... \n");
@@ -434,24 +434,24 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
 
     if( dev_workspace->L.entries == NULL )
     {
-        cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
+        cuda_memset( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
 
         Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
             ( dev_workspace->H, dev_workspace->droptol, spad );
-        cudaThreadSynchronize();
-        cudaCheckError();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
         //Reduction for fill in
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>
+        Cuda_reduction_int<<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>
             (spad, spad + system->N, system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
-        Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>>
+        Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>>
            (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
+        cudaThreadSynchronize( );
+        cudaCheckError( );
 
-        copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
+        copy_host_device( &fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
         fillin += dev_workspace->H.n;
 
 #ifdef __DEBUG_CUDA__
@@ -497,47 +497,47 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
         sparse_matrix t_H, t_L, t_U;
         real *t_droptol;
 
-        t_droptol = (real *) malloc (REAL_SIZE * system->N);
+        t_droptol = (real *) malloc( REAL_SIZE * system->N );
 
 #ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
+        fprintf( stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
 #endif
 
-        start = Get_Time ();
-        if(!Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m))
+        start = Get_Time( );
+        if( !Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m) )
         {
            fprintf(stderr, "No space for H matrix \n");
-           exit(0);
+           exit( 0 );
        }
-        if(!Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m))
+        if( !Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m) )
        {
-           fprintf(stderr, "No space for L matrix \n");
-           exit(0);
+           fprintf( stderr, "No space for L matrix \n" );
+           exit( 0 );
        }
-        if(!Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m))
+        if( !Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m) )
        {
-           fprintf(stderr, "No space for U matrix \n");
-           exit(0);
+           fprintf( stderr, "No space for U matrix \n" );
+           exit( 0 );
        }
 
-        copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE *
+        copy_host_device( t_H.start, dev_workspace->H.start, INT_SIZE *
                (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE *
+        copy_host_device( t_H.end, dev_workspace->H.end, INT_SIZE *
                (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.entries, dev_workspace->H.entries,
+        copy_host_device( t_H.entries, dev_workspace->H.entries,
                SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
-        copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE *
+        copy_host_device( t_droptol, dev_workspace->droptol, REAL_SIZE *
                system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
 
        //fprintf (stderr, " Done copying LUH .. \n");
-        Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U);
+        Cuda_ICHOLT( &t_H, t_droptol, &t_L, &t_U );
 
-        Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice);
-        end += Get_Timing_Info (start);
+        Sync_Host_Device_Mat( &t_L, &t_U, cudaMemcpyHostToDevice );
+        end += Get_Timing_Info( start );
 
        /*
        fprintf (stderr, "Done syncing .... \n");
@@ -552,7 +552,7 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
        */
 
        //#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end);
+        fprintf( stderr, "Done copying the L/U matrices to the device ---> %f \n", end );
        //#endif
 
        //#ifdef __BUILD_DEBUG__
@@ -562,7 +562,7 @@ void Cuda_Init_MatVec(reax_system *system, control_params *control,
 }
 
 
-GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
+GLOBAL void Init_MatVec_Postprocess( static_storage p_workspace, int N )
 {
     static_storage *workspace = &p_workspace;
 
@@ -608,7 +608,7 @@ GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
 }
 
 
-GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N)
+GLOBAL void Cuda_Update_Atoms_q( reax_atom *atoms, real *s, real u, real *t, int N )
 {
     int i = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -621,39 +621,39 @@ GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, in
 }
 
 
-void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
+void Cuda_Calculate_Charges( reax_system *system, static_storage *workspace )
 {
     real *spad = (real *) scratch;
     real u, s_sum, t_sum;
 
-    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
 
     //s_sum
-    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
+    Cuda_reduction<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
        (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( &s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
 
     //t_sum
-    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
     Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
        (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( &t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
 
     //fraction here
     u = s_sum / t_sum;
@@ -662,10 +662,10 @@ void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
     fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
 #endif
 
-    Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Update_Atoms_q<<< BLOCKS, BLOCK_SIZE >>>
        ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 }
 
 
@@ -677,33 +677,34 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
     real t_start, t_elapsed;
 
 #ifdef __DEBUG_CUDA__
-    t_start = Get_Time ();
+    t_start = Get_Time( );
 #endif
 
     /*
    //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs );
-    Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
        ( dev_workspace->H );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize();
+    cudaCheckError();
 
     t_elapsed = Get_Timing_Info (t_start);
     fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed);
     */
 
-    Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>>
+
+    Init_MatVec_Postprocess<<< BLOCKS, BLOCK_SIZE >>>
        (*dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
-    t_elapsed = Get_Timing_Info (t_start);
-    fprintf (stderr, "Done with post processing of init_matvec --> %d with time ---> %f \n", cudaGetLastError (), t_elapsed);
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, "Done with post processing of init_matvec --> %d with time ---> %f \n", cudaGetLastError (), t_elapsed );
 #endif
 
     //Here goes the GMRES part of the program ()
     //#ifdef __DEBUG_CUDA__
-    t_start = Get_Time ();
+    t_start = Get_Time( );
     //#endif
 
     //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
@@ -715,10 +716,9 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
     d_timing.matvecs += matvecs;
 
 #ifdef __DEBUG_CUDA__
-    t_elapsed = Get_Timing_Info ( t_start );
-    fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
 #endif
 
-    //Here cuda calculate charges
-    Cuda_Calculate_Charges (system, workspace);
+    Cuda_Calculate_Charges( system, workspace );
 }
diff --git a/PuReMD-GPU/src/cuda_QEq.h b/PuReMD-GPU/src/cuda_QEq.h
index 2215510c07edc4d5b3497ec30b5be1a73aa2d411..f62ab1157e3812837cd3a8a65e07856a78b22bf2 100644
--- a/PuReMD-GPU/src/cuda_QEq.h
+++ b/PuReMD-GPU/src/cuda_QEq.h
@@ -24,8 +24,16 @@
 
 #include "mytypes.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Cuda_QEq( reax_system*, control_params*, simulation_data*,
        static_storage*, list*, output_controls* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_allocate.cu b/PuReMD-GPU/src/cuda_allocate.cu
index fb9795c5f198b5de4ec3029dfd485710f8d491aa..d0e1f22b6f970038820f82c504dc3472d50d1d7a 100644
--- a/PuReMD-GPU/src/cuda_allocate.cu
+++ b/PuReMD-GPU/src/cuda_allocate.cu
@@ -302,7 +302,7 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH );
     cudaThreadSynchronize ();
     cudaCheckError ();
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>>
+    Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>>
        (d_bond_top + n, d_bond_top + n + BLOCKS_POW_2, BLOCKS_POW_2);
 
     cudaThreadSynchronize ();
diff --git a/PuReMD-GPU/src/cuda_allocate.h b/PuReMD-GPU/src/cuda_allocate.h
index 1de435a0f70a3cca0f3a39d63d3ddccf685774a1..dc672d3bd76c1afa7a468aa2fdda3dd0ca3d3ec9 100644
--- a/PuReMD-GPU/src/cuda_allocate.h
+++ b/PuReMD-GPU/src/cuda_allocate.h
@@ -23,9 +23,19 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Cuda_Allocate_Matrix( sparse_matrix*, int, int );
 int Cuda_Allocate_HBond_List( int, int, int*, int*, list* );
 int Cuda_Allocate_Bond_List( int, int*, list* );
 void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_bond_orders.h b/PuReMD-GPU/src/cuda_bond_orders.h
index bee988ff209563097d0c511e36dfbe41028990c0..4015b9fa34aa1e66843efc8dc1a49f1c75da749c 100644
--- a/PuReMD-GPU/src/cuda_bond_orders.h
+++ b/PuReMD-GPU/src/cuda_bond_orders.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 GLOBAL void Cuda_Calculate_Bond_Orders_Init ( reax_atom *, global_parameters , single_body_parameters *,
        static_storage , int , int );
 GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *, global_parameters , single_body_parameters *,
@@ -35,4 +40,9 @@ GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *, simulation_data *
 //HOST_DEVICE void Cuda_Add_dBond_to_Forces( int, int, reax_atom *, static_storage*, list* );
 //HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int, int, reax_atom *, simulation_data*, static_storage*, list* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_box.h b/PuReMD-GPU/src/cuda_box.h
index 7186506fc64dd2e5c7535af1211b489dc02522f9..913abc15867cf5ec9d8e87dad0fa56198de9b1ff 100644
--- a/PuReMD-GPU/src/cuda_box.h
+++ b/PuReMD-GPU/src/cuda_box.h
@@ -23,7 +23,9 @@
 
 #include "mytypes.h"
 
+
 GLOBAL void k_compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box *box, real d1, real d2, real d3);
 
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_center_mass.cu b/PuReMD-GPU/src/cuda_center_mass.cu
index a82c0b0303aec6f5878df03d9493936604ae9aaa..158d3a16489f20362bd854312eb39aa0dcec57e8 100644
--- a/PuReMD-GPU/src/cuda_center_mass.cu
+++ b/PuReMD-GPU/src/cuda_center_mass.cu
@@ -23,11 +23,8 @@
 #include "vector.h"
 
 
-GLOBAL void k_center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
+GLOBAL void k_center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
 {
     extern __shared__ rvec xcm[];
     extern __shared__ rvec vcm[];
@@ -76,13 +73,8 @@ GLOBAL void k_center_of_mass_blocks (single_body_parameters *sbp, reax_atom *ato
 }
 
 
-GLOBAL void k_center_of_mass (rvec *xcm,
-        rvec *vcm,
-        rvec *amcm,
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
+GLOBAL void k_center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
 {
     extern __shared__ rvec sh_xcm[];
     extern __shared__ rvec sh_vcm[];
@@ -132,11 +124,8 @@ GLOBAL void k_center_of_mass (rvec *xcm,
 }
 
 
-GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
+GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp, reax_atom *atoms,
+        real *results, real xcm0, real xcm1, real xcm2, size_t n )
 {
     extern __shared__ real xx[];
     extern __shared__ real xy[];
@@ -161,11 +150,11 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
     xcm[1] = xcm1;
     xcm[2] = xcm2;
 
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-    if (i < n){
+    if (i < n)
+    {
         m = sbp[ atoms[i].type ].mass;
         rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
         xx[ xx_i ] = diff[0] * diff[0] * m;
@@ -177,8 +166,10 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
     }
     __syncthreads ();
 
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
             index = threadIdx.x + offset;
             xx[ threadIdx.x ] += xx[ index ];
             xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
@@ -190,7 +181,8 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
         __syncthreads ();
     }
 
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         results [ blockIdx.x*6 ] = xx [ 0 ];
         results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
         results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
@@ -201,7 +193,7 @@ GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
 }
 
 
-GLOBAL void k_compute_center_mass (real *input, real *output, size_t n)
+GLOBAL void k_compute_center_mass( real *input, real *output, size_t n )
 {
     extern __shared__ real xx[];
     extern __shared__ real xy[];
diff --git a/PuReMD-GPU/src/cuda_center_mass.h b/PuReMD-GPU/src/cuda_center_mass.h
index 887377340eacca0479fffc352499a456dba5a177..0c1d76ec63defe3cb6a0738a2843b10d02104bb5 100644
--- a/PuReMD-GPU/src/cuda_center_mass.h
+++ b/PuReMD-GPU/src/cuda_center_mass.h
@@ -23,12 +23,22 @@
 
 #include "mytypes.h"
 
-GLOBAL void k_center_of_mass_blocks (single_body_parameters *, reax_atom *,
-        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n);
-GLOBAL void k_center_of_mass (rvec *xcm,
-        rvec *vcm, rvec *amcm, rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n);
-GLOBAL void k_compute_center_mass (single_body_parameters *sbp,
-        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, size_t n);
-GLOBAL void k_compute_center_mass (real *input, real *output, size_t n);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GLOBAL void k_center_of_mass_blocks( single_body_parameters *, reax_atom *,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n );
+GLOBAL void k_center_of_mass( rvec *xcm,
+        rvec *vcm, rvec *amcm, rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n );
+GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, size_t n );
+GLOBAL void k_compute_center_mass( real *input, real *output, size_t n );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu
index 88ad09502bcd26d2b6b6d53103513c5cd73f7f9d..1f50dbf3c74fae4d9ab73149d02c51dbec8ca10e 100644
--- a/PuReMD-GPU/src/cuda_copy.cu
+++ b/PuReMD-GPU/src/cuda_copy.cu
@@ -25,7 +25,7 @@
 #include "vector.h"
 
 
-void Sync_Host_Device( grid *host, grid *dev, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Grid( grid *host, grid *dev, enum cudaMemcpyKind dir )
 {
     copy_host_device( host->top, dev->top,
            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP );
@@ -50,7 +50,7 @@ void Sync_Host_Device( grid *host, grid *dev, enum cudaMemcpyKind dir )
 }
 
 
-void Sync_Host_Device( reax_system *sys, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Sys( reax_system *sys, enum cudaMemcpyKind dir )
 {
     copy_host_device( sys->atoms, sys->d_atoms,
@@ -78,13 +78,13 @@ void Sync_Host_Device( reax_system *sys, enum cudaMemcpyKind dir )
 }
 
 
-void Sync_Host_Device( simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Data( simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir )
 {
     copy_host_device( host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA );
 }
 
 
-void Sync_Host_Device( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Mat( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
 {
     copy_host_device( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
     copy_host_device( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
@@ -96,12 +96,12 @@ void Sync_Host_Device( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind d
 }
 
 
-void Sync_Host_Device( output_controls *, control_params *, enum cudaMemcpyKind )
+void Sync_Host_Device_Control( output_controls *, control_params *, enum cudaMemcpyKind )
 {
 }
 
 
-void Sync_Host_Device( control_params *host, control_params *device, enum cudaMemcpyKind )
+void Sync_Host_Device_Params( control_params *host, control_params *device, enum cudaMemcpyKind )
 {
     copy_host_device( host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
 }
@@ -117,7 +117,7 @@ void Prep_Device_For_Output( reax_system *system, simulation_data *data )
     //fprintf (stderr, "size to copy --> %d \n", size );
     //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
 
     /*
        copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE,
       REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
@@ -151,16 +151,18 @@ void Prep_Device_For_Output( reax_system *system, simulation_data *data )
     data->kin_press = local_data.kin_press;
     data->therm.T = local_data.therm.T;
 
-    //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost );
-    Sync_Host_Device( system, cudaMemcpyDeviceToHost );
+    //Sync_Host_Device_Sys( &system.g, &system.d_g, cudaMemcpyDeviceToHost );
+    Sync_Host_Device_Sys( system, cudaMemcpyDeviceToHost );
 }
 
 
-void Sync_Host_Device( list *host, list *device, int type )
+void Sync_Host_Device_List( list *host, list *device, int type )
 {
     //list is already allocated -- discard it first
     if (host->n > 0)
+    {
         Cuda_Delete_List( host );
+    }
 
     //memory is allocated on the host
     Cuda_Make_List( device->n, device->num_intrs, type, host );
diff --git a/PuReMD-GPU/src/cuda_copy.h b/PuReMD-GPU/src/cuda_copy.h
index 6f2509e8318c391bd1af6c9eb5eeca8354016029..6b6f38ae3d12d7d00d8cd38eef996d61cb765bd5 100644
--- a/PuReMD-GPU/src/cuda_copy.h
+++ b/PuReMD-GPU/src/cuda_copy.h
@@ -18,8 +18,6 @@ <http://www.gnu.org/licenses/>.
 ----------------------------------------------------------------------*/
 
-
-
 #ifndef __CUDA_COPY_H_
 #define __CUDA_COPY_H_
 
@@ -28,14 +26,24 @@
 #include "mytypes.h"
 #include "list.h"
 
-void Sync_Host_Device (grid *, grid *, enum cudaMemcpyKind);
-void Sync_Host_Device (reax_system *, enum cudaMemcpyKind);
-void Sync_Host_Device (control_params *, control_params *, enum cudaMemcpyKind);
-void Sync_Host_Device (simulation_data *, simulation_data *, enum cudaMemcpyKind);
-void Sync_Host_Device (sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind);
-void Sync_Host_Device (output_controls *, enum cudaMemcpyKind);
-void Prep_Device_For_Output (reax_system *, simulation_data *);
-void Sync_Host_Device (list *host, list *device, int type);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Sync_Host_Device_Grid( grid *, grid *, enum cudaMemcpyKind );
+void Sync_Host_Device_Sys( reax_system *, enum cudaMemcpyKind );
+void Sync_Host_Device_Params( control_params *, control_params *, enum cudaMemcpyKind );
+void Sync_Host_Device_Data( simulation_data *, simulation_data *, enum cudaMemcpyKind );
+void Sync_Host_Device_Mat( sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind );
+void Sync_Host_Device_Control( output_controls *, enum cudaMemcpyKind );
+
+void Prep_Device_For_Output( reax_system *, simulation_data * );
+void Sync_Host_Device_List( list *host, list *device, int type );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_environment.cu b/PuReMD-GPU/src/cuda_environment.cu
index 05eaf07df6faf5b9f9e561273247ab8b9db519b6..cd6ae50d82b4716aef1d16f2f957dc5f4976cca7 100644
--- a/PuReMD-GPU/src/cuda_environment.cu
+++ b/PuReMD-GPU/src/cuda_environment.cu
@@ -23,20 +23,18 @@
 #include "cuda_utils.h"
 
 
-extern "C" void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node )
+void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node )
 {
-    int deviceCount;
+    int deviceCount = 0;
     cudaError_t flag;
-    cublasStatus_t cublasStatus;
     cublasHandle_t cublasHandle;
     cusparseHandle_t cusparseHandle;
-    cusparseStatus_t cusparseStatus;
     cusparseMatDescr_t matdescriptor;
 
     flag = cudaGetDeviceCount( &deviceCount );
 
-    if ( flag != cudaSuccess )
+    if ( flag != cudaSuccess || deviceCount < 1 )
     {
         fprintf( stderr, "ERROR: no CUDA capable device(s) found. Terminating...\n" );
         exit( 1 );
@@ -45,28 +43,28 @@ extern "C" void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node
     //Calculate the # of GPUs per processor
     //and assign the GPU for each process
     //TODO: handle condition where # CPU procs > # GPUs
-    cudaSetDevice( (rank % (deviceCount)) );
+    cudaSetDevice( rank % deviceCount );
 
 #if defined(__CUDA_DEBUG__)
-    fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
+    fprintf( stderr, "p:%d is using GPU: %d \n", rank, rank % deviceCount );
 #endif
 
     //CHANGE ORIGINAL
     //cudaDeviceSetLimit( cudaLimitStackSize, 8192 );
     //cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
-    //cudaCheckError();
+    //cudaCheckError( );
 
-    cublasCheckError( cublasStatus = cublasCreate(&cublasHandle) );
+    cublasCheckError( cublasCreate(&cublasHandle) );
 
-    cusparseCheckError( cusparseStatus = cusparseCreate (&cusparseHandle) );
-    cusparseCheckError( cusparseCreateMatDescr (&matdescriptor) );
+    cusparseCheckError( cusparseCreate(&cusparseHandle) );
+    cusparseCheckError( cusparseCreateMatDescr(&matdescriptor) );
     cusparseSetMatType( matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL );
     cusparseSetMatIndexBase( matdescriptor, CUSPARSE_INDEX_BASE_ZERO );
 }
 
 
-extern "C" void Cleanup_Cuda_Environment( )
+void Cleanup_Cuda_Environment( )
 {
     cudaDeviceReset( );
     cudaDeviceSynchronize( );
diff --git a/PuReMD-GPU/src/cuda_environment.h b/PuReMD-GPU/src/cuda_environment.h
index 3d7987193df15d92c9c1c9fab71aad36a3e5a6e5..61f811db3354628a3067a3423442f875f16e6871 100644
--- a/PuReMD-GPU/src/cuda_environment.h
+++ b/PuReMD-GPU/src/cuda_environment.h
@@ -28,8 +28,8 @@ extern "C" {
 #endif
 
-void Setup_Cuda_Environment(int, int, int);
-void Cleanup_Cuda_Environment();
+void Setup_Cuda_Environment( int, int, int );
+void Cleanup_Cuda_Environment( );
 
 #ifdef __cplusplus
 }
diff --git a/PuReMD-GPU/src/cuda_forces.cu b/PuReMD-GPU/src/cuda_forces.cu
index 72e8fbe2dd6559062999e82635ae8a3168a27f40..bf277b391ce0df0c5336ea0a0653b6863ca14fec 100644
--- a/PuReMD-GPU/src/cuda_forces.cu
+++ b/PuReMD-GPU/src/cuda_forces.cu
@@ -59,35 +59,35 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE);
 #endif
 
-    Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Calculate_Bond_Orders_Init<<< BLOCKS, BLOCK_SIZE >>>
        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
          *dev_workspace, system->reaxprm.num_atom_types, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>>
+    Cuda_Calculate_Bond_Orders<<< BLOCKS, BLOCK_SIZE >>>
        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
         system->reaxprm.d_tbp, *dev_workspace,
         *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO),
         system->reaxprm.num_atom_types, system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>>
+    Cuda_Update_Uncorrected_BO<<<BLOCKS, BLOCK_SIZE>>>
        (*dev_workspace, *(dev_lists + BONDS), system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>>
+    Cuda_Update_Workspace_After_Bond_Orders<<<BLOCKS, BLOCK_SIZE>>>
        (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
         *dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
     t_elapsed = Get_Timing_Info( t_start );
-    fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-    fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
+    fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
 #endif
 
     //Step 2.
@@ -197,7 +197,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 #endif
 
     cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH);
-    Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>>
+    k_Three_Body_Estimate<<<BLOCKS, BLOCK_SIZE>>>
        (system->d_atoms,
        (control_params *)control->d_control,
        *(dev_lists + BONDS),
@@ -269,7 +269,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
 
-    Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
+    k_Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
        ( system->d_atoms,
         system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp,
         (control_params *)control->d_control,
@@ -314,41 +314,38 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cudaThreadSynchronize ();
     cudaCheckError ();
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
        (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     //Reduction for ext_pres
     rvec_spad = (rvec *) (spad + 6*system->N);
-    Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>>
+    Cuda_reduction_rvec<<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>>
        (rvec_spad, rvec_spad + system->N, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
-    Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>>
+    Cuda_reduction_rvec<<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>>
        (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     real t_1, t_2;
-    t_1 = Get_Time ();
+    t_1 = Get_Time( );
     //Sum up the f vector for each atom and collect the CdDelta from all the bonds
-    Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
-        ( system->d_atoms,
-          (control_params *)control->d_control,
-          *dev_workspace,
-          *(dev_lists + BONDS),
-          system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    t_2 = Get_Timing_Info (t_1);
+    k_Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (control_params *)control->d_control,
+         *dev_workspace, *(dev_lists + BONDS), system->N );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    t_2 = Get_Timing_Info( t_1 );
 
 #ifdef __DEBUG_CUDA__
     t_elapsed = Get_Timing_Info( t_start );
-    fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2);
-    fprintf (stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed );
-    fprintf (stderr, "Three_Body_Interactions Done... \n");
+    fprintf( stderr, "Three_Body_Interactions post process Timing %lf \n", t_2 );
+    fprintf( stderr, "Three_Body_Interactions ... Timing %lf \n", t_elapsed );
+    fprintf( stderr, "Three_Body_Interactions Done... \n" );
 #endif
 
     //Step 5.
@@ -356,21 +353,16 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     t_start = Get_Time( );
 #endif
 
-    cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
-    Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
-        //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
-        ( system->d_atoms,
-          system->reaxprm.d_gp,
-          system->reaxprm.d_fbp,
-          (control_params *)control->d_control,
-          *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
-          (simulation_box *)system->d_box,
-          (simulation_data *)data->d_simulation_data,
-          *dev_workspace,
-          system->N, system->reaxprm.num_atom_types,
-          spad, spad + 2*system->N, (rvec *) (spad + 4*system->N));
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cuda_memset( spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
+    //k_Four_Body_Interactions<<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
+    k_Four_Body_Interactions<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_fbp,
+          (control_params *)control->d_control, *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
+          (simulation_box *)system->d_box, (simulation_data *)data->d_simulation_data,
+          *dev_workspace, system->N, system->reaxprm.num_atom_types,
+          spad, spad + 2*system->N, (rvec *) (spad + 4*system->N) );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
     //Reduction for E_Tor
     Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
@@ -407,13 +399,11 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cudaCheckError ();
 
     //Post process here
-    Four_Body_Postprocess <<< BLOCKS, BLOCK_SIZE >>>
-        ( system->d_atoms,
-          *dev_workspace,
-          *(dev_lists + BONDS),
+    k_Four_Body_Postprocess<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, *dev_workspace, *(dev_lists + BONDS),
          system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
 
 #ifdef __DEBUG_CUDA__
     t_elapsed = Get_Timing_Info( t_start );
@@ -429,7 +419,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH );
 
     /*
-       Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
+       k_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
        ( system->d_atoms,
        system->reaxprm.d_sbp,
        system->reaxprm.d_hbp,
@@ -450,7 +440,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) +
        (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1);
 
-    Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE ) >>>
+    k_Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE ) >>>
        ( system->d_atoms,
         system->reaxprm.d_sbp,
         system->reaxprm.d_hbp,
@@ -498,7 +488,7 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     t_1 = Get_Time ();
 #endif
 
-    Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
+    k_Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
        ( system->d_atoms,
         system->reaxprm.d_sbp,
         *dev_workspace,
@@ -516,8 +506,8 @@ void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     t_1 = Get_Time ();
 #endif
 
-    //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
-    Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+    //k_Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+    k_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
        ( system->d_atoms,
         system->reaxprm.d_sbp,
         *dev_workspace,
diff --git a/PuReMD-GPU/src/cuda_forces.h b/PuReMD-GPU/src/cuda_forces.h
index 3b26116fad2c15a3dd57cc68caad62c402d95ab1..b017e63ebeee45c03d8926a79f1a9a0dd7a4771d 100644
--- a/PuReMD-GPU/src/cuda_forces.h
+++ b/PuReMD-GPU/src/cuda_forces.h
@@ -24,6 +24,10 @@
 
 #include "mytypes.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 GLOBAL void k_Estimate_Sparse_Matrix_Entries ( reax_atom *, control_params *,
        simulation_data *, simulation_box *, list, int, int * );
 
@@ -36,5 +40,9 @@ void Cuda_Threebody_List( reax_system *, static_storage *, list *, int );
 
 int validate_device (reax_system *, simulation_data *, static_storage *, list **);
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_four_body_interactions.cu b/PuReMD-GPU/src/cuda_four_body_interactions.cu
index 8d8100608b7c067ba0ade906d744277f2aaead32..60d9973482ddd61a34d874301f1ed360443625b9 100644
--- a/PuReMD-GPU/src/cuda_four_body_interactions.cu
+++ b/PuReMD-GPU/src/cuda_four_body_interactions.cu
@@ -160,7 +160,7 @@ DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
 }
 
 
-GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
+GLOBAL void k_Four_Body_Interactions ( reax_atom *atoms,
        global_parameters g_params,
        four_body_header *d_fbp,
        control_params *control,
@@ -804,7 +804,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
 }
 
 
-GLOBAL void Four_Body_Postprocess ( reax_atom *atoms,
+GLOBAL void k_Four_Body_Postprocess( reax_atom *atoms,
        static_storage p_workspace,
        list p_bonds, int N )
 {
diff --git a/PuReMD-GPU/src/cuda_four_body_interactions.h b/PuReMD-GPU/src/cuda_four_body_interactions.h
index 5e6cf35176e90c3e9a088162a33b7a776a7fb1f7..088e24f4f15a1e83405b83427ba56d1e5d4e67ba 100644
--- a/PuReMD-GPU/src/cuda_four_body_interactions.h
+++ b/PuReMD-GPU/src/cuda_four_body_interactions.h
@@ -23,10 +23,20 @@
 
 #include "mytypes.h"
 
-GLOBAL void Four_Body_Interactions ( reax_atom *, global_parameters ,
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GLOBAL void k_Four_Body_Interactions( reax_atom *, global_parameters ,
        four_body_header *, control_params *, list , list , simulation_box *,
-        simulation_data *, static_storage , int , int , real *, real *, rvec *);
+        simulation_data *, static_storage , int , int , real *, real *, rvec * );
+
+GLOBAL void k_Four_Body_Postprocess( reax_atom *, static_storage, list , int );
+
+#ifdef __cplusplus
+}
+#endif
 
-GLOBAL void Four_Body_Postprocess (reax_atom *, static_storage, list , int );
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_grid.h b/PuReMD-GPU/src/cuda_grid.h
index ca2edf1718f57625b1cce68b4ab58180f11efbd4..28a20797a56bda9682354f967634eeb6301e44ae 100644
--- a/PuReMD-GPU/src/cuda_grid.h
+++ b/PuReMD-GPU/src/cuda_grid.h
@@ -23,7 +23,17 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Cuda_Bin_Atoms( reax_system*, static_storage* );
 void Cuda_Bin_Atoms_Sync (reax_system *);
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h
index 292d561311a455287f9c107e4cef61a6ea61fa45..e306c673f8aec585bbda7b92250e2f0ec91eb2e3 100644
--- a/PuReMD-GPU/src/cuda_helpers.h
+++ b/PuReMD-GPU/src/cuda_helpers.h
@@ -25,7 +25,11 @@
 #include "mytypes.h"
 
 
-DEVICE inline int cuda_strcmp(char *a, char *b, int len)
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline DEVICE int cuda_strcmp(char *a, char *b, int len)
 {
     char *src, *dst;
 
@@ -52,7 +56,7 @@ DEVICE inline int cuda_strcmp(char *a, char *b, int len)
 }
 
 
-DEVICE inline real myAtomicAdd(double* address, double val)
+static inline DEVICE double myAtomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull =
        (unsigned long long int*)address;
@@ -61,14 +65,14 @@ DEVICE inline real myAtomicAdd(double* address, double val)
     {
         assumed = old;
         old = atomicCAS(address_as_ull, assumed,
-                __double_as_longlong(val + __longlong_as_double(assumed)));
+            __double_as_longlong(val + __longlong_as_double(assumed)));
     }
     while (assumed != old);
 
     return __longlong_as_double(old);
 }
 
 
-DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
+static inline DEVICE void atomic_rvecAdd( rvec ret, rvec v )
 {
     MYATOMICADD( (double*)&ret[0], (double)v[0] );
     MYATOMICADD( (double*)&ret[1], (double)v[1] );
@@ -76,12 +80,16 @@ DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
 }
 
 
-DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
+static inline DEVICE void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
 {
     MYATOMICADD( (double*)&ret[0], (double)(c * v[0]) );
     MYATOMICADD( (double*)&ret[1], (double)(c * v[1]) );
     MYATOMICADD( (double*)&ret[2], (double)(c * v[2]) );
 }
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_init.h b/PuReMD-GPU/src/cuda_init.h
index 233761691fe56bc8b1290c8972c4b413cc876f54..cd9c568130730d298fbaf781f4f6b18b0f02f65a 100644
--- a/PuReMD-GPU/src/cuda_init.h
+++ b/PuReMD-GPU/src/cuda_init.h
@@ -18,22 +18,31 @@ <http://www.gnu.org/licenses/>.
 ----------------------------------------------------------------------*/
 
-
 #ifndef __CUDA_INIT_H__
 #define __CUDA_INIT_H__
 
 #include "mytypes.h"
 
-void Cuda_Init_System ( reax_system* );
-void Cuda_Init_Simulation_Data (simulation_data *);
-void Cuda_Init_Workspace_System ( reax_system *, static_storage *);
-void Cuda_Init_Workspace ( reax_system *, control_params *, static_storage *);
-void Cuda_Init_Workspace_Device ( static_storage *);
-void Cuda_Init_Control (control_params *);
-void Cuda_Init_Grid (grid *, grid *);
-void Cuda_Init_Sparse_Matrix (sparse_matrix *, int, int);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Cuda_Init_System( reax_system* );
+void Cuda_Init_Simulation_Data( simulation_data * );
+void Cuda_Init_Workspace_System( reax_system *, static_storage * );
+void Cuda_Init_Workspace( reax_system *, control_params *, static_storage * );
+void Cuda_Init_Workspace_Device( static_storage * );
+void Cuda_Init_Control( control_params * );
+void Cuda_Init_Grid( grid *, grid * );
+
+void Cuda_Init_Sparse_Matrix( sparse_matrix *, int, int );
+
+void Cuda_Init_Scratch( );
+
+#ifdef __cplusplus
+}
+#endif
 
-void Cuda_Init_Scratch ();
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_init_md.cu b/PuReMD-GPU/src/cuda_init_md.cu
index 576c61ddaff2ad4fbbbdda891a6c9ba5694018c4..1a205506e4c5ff767e02398a3859f838818c1e1a 100644
--- a/PuReMD-GPU/src/cuda_init_md.cu
+++ b/PuReMD-GPU/src/cuda_init_md.cu
@@ -334,7 +334,7 @@ void Cuda_Init_Lists( reax_system *system, control_params *control,
     //First Bin atoms and they sync the host and the device for the grid.
     //This will copy the atoms from host to device.
     Cuda_Bin_Atoms( system, workspace );
-    Sync_Host_Device( &system->g, &system->d_g, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Grid( &system->g, &system->d_g, cudaMemcpyHostToDevice );
 
     k_Estimate_NumNeighbors<<<blockspergrid, threadsperblock >>>
        (system->d_atoms, system->d_g, system->d_box,
@@ -544,16 +544,16 @@ void Cuda_Initialize( reax_system *system, control_params *control,
 
     //System
     Cuda_Init_System( system );
-    Sync_Host_Device( system, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice );
     Cuda_Init_System( system, control, data );
 
     //Simulation Data
     copy_host_device( system->atoms, system->d_atoms,
            REAX_ATOM_SIZE * system->N , cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
     Cuda_Init_Simulation_Data( data );
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
     Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
-    Sync_Host_Device( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
 
     //static storage
     Cuda_Init_Workspace_System( system, dev_workspace );
diff --git a/PuReMD-GPU/src/cuda_init_md.h b/PuReMD-GPU/src/cuda_init_md.h
index 80d5aa2fb44b56b89a99f45715e672bbd4f7a25b..0a6544b017d31ca0a82651840682c3aae49d1b11 100644
--- a/PuReMD-GPU/src/cuda_init_md.h
+++ b/PuReMD-GPU/src/cuda_init_md.h
@@ -23,8 +23,18 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
-        static_storage*, list**, output_controls*, evolve_function* );
+        static_storage*, list**, output_controls*, evolve_function* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PuReMD-GPU/src/cuda_integrate.cu b/PuReMD-GPU/src/cuda_integrate.cu
index 547f4f4a1089b26f3d5d7d3b2e41fedc0b05335a..cba0b79c39b4f9b66e5b506d11dcffb81adc488d 100644
--- a/PuReMD-GPU/src/cuda_integrate.cu
+++ b/PuReMD-GPU/src/cuda_integrate.cu
@@ -299,7 +299,7 @@ void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     cudaThreadSynchronize ();
     cudaCheckError ();
 
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
        (results, results + BLOCKS_POW_2, BLOCKS_POW_2);
     cudaThreadSynchronize ();
     cudaCheckError ();
diff --git a/PuReMD-GPU/src/cuda_integrate.h b/PuReMD-GPU/src/cuda_integrate.h
index 3413f2d03593c597d48d2092f7737b380475e548..959b6684fef618b86252a5606b6feb4a6d093f15 100644
--- a/PuReMD-GPU/src/cuda_integrate.h
+++ b/PuReMD-GPU/src/cuda_integrate.h
@@ -23,6 +23,11 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Cuda_Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
        static_storage*, list**, output_controls* );
 void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
@@ -32,5 +37,10 @@ void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* ,
        control_params* , simulation_data *, static_storage *,
        list **, output_controls * );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_lin_alg.h b/PuReMD-GPU/src/cuda_lin_alg.h
index 0ad017b10c59c650a1bfdbbda0edbedcbbab3c8f..6b464152280f3590c1c70761c1b3ef206779cf5b 100644
--- a/PuReMD-GPU/src/cuda_lin_alg.h
+++ b/PuReMD-GPU/src/cuda_lin_alg.h
@@ -25,9 +25,19 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 GLOBAL void Cuda_Matvec (sparse_matrix , real *, real *, int );
 GLOBAL void Cuda_Matvec_csr (sparse_matrix , real *, real *, int );
 int Cuda_GMRES( static_storage *, real *b, real tol, real *x );
 int Cublas_GMRES( reax_system *, static_storage *, real *b, real tol, real *x );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_list.h b/PuReMD-GPU/src/cuda_list.h
index 4ebc60079bcbd5c835fe49d27ef4d23b65483d8b..cafc85743b9eb6b31da5b35b063361ef97fddf54 100644
--- a/PuReMD-GPU/src/cuda_list.h
+++ b/PuReMD-GPU/src/cuda_list.h
@@ -23,7 +23,17 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 char Cuda_Make_List( int, int, int, list* );
 void Cuda_Delete_List( list* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_lookup.cu b/PuReMD-GPU/src/cuda_lookup.cu
index 7605d4099a62ad1359d0783a4f8d11f26dd12dc6..9804ad8b6afee5b0c7e04ea37f46d9c206c5ba68 100644
--- a/PuReMD-GPU/src/cuda_lookup.cu
+++ b/PuReMD-GPU/src/cuda_lookup.cu
@@ -405,7 +405,7 @@ GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real
     int r = blockIdx.x * blockDim.x + threadIdx.x;
     if ( r == 0 || r > count ) return;
 
-    LR_vdW_Coulomb( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
+    d_LR_vdW_Coulomb( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
 
     h[r] = d_LR[ index_lr(i, j, num_atom_types) ].dx;
     fh[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].H;
diff --git a/PuReMD-GPU/src/cuda_lookup.h b/PuReMD-GPU/src/cuda_lookup.h
index 041df8777474005745063160abd934b4d2bd3bf1..a0e05e0c092f1634a1d21fd902f198f714ac2391 100644
--- a/PuReMD-GPU/src/cuda_lookup.h
+++ b/PuReMD-GPU/src/cuda_lookup.h
@@ -23,8 +23,18 @@
 
 #include "mytypes.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Cuda_Make_LR_Lookup_Table( reax_system*, control_params* );
 void copy_LR_table_to_device ( reax_system*, control_params* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_neighbors.cu b/PuReMD-GPU/src/cuda_neighbors.cu
index 3f447b48d760d8ebf9feb1b75a2ee914ffd1f9ed..876b6b9913e4d825e0cc8be5a2fc1d092c56d9f8 100644
--- a/PuReMD-GPU/src/cuda_neighbors.cu
+++ b/PuReMD-GPU/src/cuda_neighbors.cu
@@ -41,7 +41,7 @@ extern inline DEVICE int index_grid (int blocksize)
 }
 
 
-DEVICE int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
+DEVICE int d_Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
        real cutoff, far_neighbor_data *data )
 {
     real norm_sqr, d, tmp;
@@ -123,20 +123,20 @@ GLOBAL void k_Estimate_NumNeighbors( reax_atom *sys_atoms,
                    //CHANGE ORIGINAL
                    /*
                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
                                    control->vlist_cut, &nbr_data)){
                            ++num_far;
                        }
                    }
                    */
                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
                                    control->vlist_cut, &nbr_data)){
                            ++num_far;
                        }
                    }
                    else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box,
                                    control->vlist_cut, &nbr_data)){
                            ++num_far;
                        }
@@ -205,13 +205,13 @@ GLOBAL void k_New_Estimate_NumNeighbors( reax_atom *sys_atoms,
                {
                    atom2 = nbr_atoms[m];
                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box,
+                        if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box,
                                    control->vlist_cut, &temp)){
                            num_far++;
                        }
                    }
                    else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box,
                                    control->vlist_cut, &temp)){
                            num_far ++;
                        }
@@ -273,7 +273,7 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
                    //CHANGE ORIGINAL
                    /*
                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
                                    control->vlist_cut, &temp)){
                            nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
@@ -291,7 +291,7 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
                    }
                    */
                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box,
                                    control->vlist_cut, &temp)){
                            nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                            nbr_data->nbr = atom2;
@@ -307,7 +307,7 @@ GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
                        }
                    }
                    else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box,
                                    control->vlist_cut, &temp)){
                            nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                            nbr_data->nbr = atom2;
@@ -394,7 +394,7 @@ GLOBAL void k_New_Generate_Neighbor_Lists( reax_atom *sys_atoms,
                {
                    atom2 = nbr_atoms[m];
                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box,
+                        if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box,
                                    control->vlist_cut, &temp)){
                            //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
                            nbr_data = my_start;
@@ -412,7 +412,7 @@ GLOBAL void k_New_Generate_Neighbor_Lists( reax_atom *sys_atoms,
                        }
                    }
                    else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box,
+                        if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box,
                                    control->vlist_cut, &temp)){
                            //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
nbr_data = my_start; @@ -532,7 +532,7 @@ GLOBAL void Test_Generate_Neighbor_Lists( reax_atom *sys_atoms, if (m < max) { atom2 = nbr_atoms[m]; if (atom1 > atom2) { - if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, + if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, control->vlist_cut, &temp)) { tnbr [threadIdx.x] = 1; @@ -540,7 +540,7 @@ GLOBAL void Test_Generate_Neighbor_Lists( reax_atom *sys_atoms, } } else if (atom1 < atom2) { - if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, + if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, control->vlist_cut, &temp)){ tnbr [threadIdx.x] = 1; nbrgen = TRUE; diff --git a/PuReMD-GPU/src/cuda_neighbors.h b/PuReMD-GPU/src/cuda_neighbors.h index 870ddae9717fe59388e89886651e3edb6ce1a915..13656f62f12b53509fad82f535e974e45cc45805 100644 --- a/PuReMD-GPU/src/cuda_neighbors.h +++ b/PuReMD-GPU/src/cuda_neighbors.h @@ -24,13 +24,21 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + GLOBAL void k_Estimate_NumNeighbors( reax_atom *, grid, simulation_box *, control_params *, int * ); void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, int); -DEVICE int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* ); +DEVICE int d_Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* ); + +#ifdef __cplusplus +} +#endif #endif diff --git a/PuReMD-GPU/src/cuda_post_evolve.cu b/PuReMD-GPU/src/cuda_post_evolve.cu index 2190220b334604e74793e148544ea61e01d7551d..f5dbad825f2ac9f6e120e6cf7147258c89868c71 100644 --- a/PuReMD-GPU/src/cuda_post_evolve.cu +++ b/PuReMD-GPU/src/cuda_post_evolve.cu @@ -33,7 +33,7 @@ void Cuda_Setup_Evolve( reax_system* system, control_params* control, { //fprintf (stderr, "Begin ... \n"); //to Sync step to the device. 
- //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice ); + //Sync_Host_Device_Data( &data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice ); copy_host_device( &data->step, &((simulation_data *)data->d_simulation_data)->step, INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); @@ -49,7 +49,7 @@ void Cuda_Setup_Output( reax_system* system, simulation_data* data ) void Cuda_Sync_Temp( control_params* control ) { - Sync_Host_Device( control, (control_params*)control->d_control, cudaMemcpyHostToDevice ); + Sync_Host_Device_Params( control, (control_params*)control->d_control, cudaMemcpyHostToDevice ); } diff --git a/PuReMD-GPU/src/cuda_post_evolve.h b/PuReMD-GPU/src/cuda_post_evolve.h index 02c9eb63c37b69b3b1b8127e8a4df9f656150e8a..1d8fdc270edd28e0f387d53d9b9837c3a4879542 100644 --- a/PuReMD-GPU/src/cuda_post_evolve.h +++ b/PuReMD-GPU/src/cuda_post_evolve.h @@ -23,6 +23,11 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Cuda_Setup_Evolve( reax_system *, control_params *, simulation_data *, static_storage *, list **, output_controls * ); @@ -35,5 +40,9 @@ void Cuda_Post_Evolve( reax_system *, control_params *, simulation_data *, static_storage *, list **, output_controls * ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/cuda_reduction.cu b/PuReMD-GPU/src/cuda_reduction.cu index 52cb198a71bcdcc637fcd6c69a8fffd8af34ed86..e22f9ad8474830ebdbe11089cdfcc986ff8dd8f3 100644 --- a/PuReMD-GPU/src/cuda_reduction.cu +++ b/PuReMD-GPU/src/cuda_reduction.cu @@ -149,7 +149,7 @@ GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results } -GLOBAL void Cuda_reduction(const int *input, int *per_block_results, const size_t n) +GLOBAL void Cuda_reduction_int(const int *input, int *per_block_results, const size_t n) { extern __shared__ int sh_input[]; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/PuReMD-GPU/src/cuda_reduction.h b/PuReMD-GPU/src/cuda_reduction.h index fefbe4c2de5e7674c2d53c73c2aba698b7c93f2a..5b9baf1df69f2a4ab2bdab0d3bff26e390634951 100644 --- a/PuReMD-GPU/src/cuda_reduction.h +++ b/PuReMD-GPU/src/cuda_reduction.h @@ -18,22 +18,32 @@ <http://www.gnu.org/licenses/>. 
----------------------------------------------------------------------*/ -#ifndef __REDUCTION_H__ -#define __REDUCTION_H__ +#ifndef __CUDA_REDUCTION_H__ +#define __CUDA_REDUCTION_H__ #include "mytypes.h" #define INITIAL 0 #define FINAL 1 -GLOBAL void Cuda_reduction (const real *input, real *per_block_results, const size_t n); -GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass); -GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n); -GLOBAL void Cuda_reduction (const int *input, int *per_block_results, const size_t n); -GLOBAL void Cuda_reduction_rvec (rvec *, rvec *, size_t n); + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_reduction( const real *input, real *per_block_results, const size_t n ); +GLOBAL void Cuda_Norm( const real *input, real *per_block_results, const size_t n, int pass ); +GLOBAL void Cuda_Dot( const real *a, const real *b, real *per_block_results, const size_t n ); +GLOBAL void Cuda_reduction_int( const int *input, int *per_block_results, const size_t n ); +GLOBAL void Cuda_reduction_rvec( rvec *, rvec *, size_t n ); GLOBAL void Cuda_Vector_Sum( real* , real , real* , real , real* , int ) ; GLOBAL void Cuda_Vector_Scale( real* , real , real* , int ) ; GLOBAL void Cuda_Vector_Add( real* , real , real* , int ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PuReMD-GPU/src/cuda_reset_utils.cu b/PuReMD-GPU/src/cuda_reset_utils.cu index be582117d1d18a8832c2a2d793f4578d8dbcf88d..d18d9f749a9a57a269f731aecae7754bdb8925a4 100644 --- a/PuReMD-GPU/src/cuda_reset_utils.cu +++ b/PuReMD-GPU/src/cuda_reset_utils.cu @@ -143,7 +143,7 @@ void Cuda_Reset( reax_system *system, control_params *control, //Reset_Simulation_Data( data ); Cuda_Sync_Simulation_Data ( data ); - //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice); + //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice ); if( control->ensemble == NPT || control->ensemble == sNPT || control->ensemble == iNPT ) diff --git a/PuReMD-GPU/src/cuda_reset_utils.h b/PuReMD-GPU/src/cuda_reset_utils.h index 1e7b3e6606e681293143cdf10bef9da332a4ea6d..bf730b935cef4857eb8e0aa97d614ef122bfca0e 100644 --- a/PuReMD-GPU/src/cuda_reset_utils.h +++ b/PuReMD-GPU/src/cuda_reset_utils.h @@ -24,6 +24,10 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + void Cuda_Reset_Grid( grid* ); void Cuda_Reset_Workspace (reax_system *, static_storage *); @@ -33,6 +37,9 @@ void Cuda_Reset( reax_system*, control_params*, simulation_data*, void Cuda_Reset_Atoms (reax_system *); - +#ifdef __cplusplus +} #endif + +#endif diff --git a/PuReMD-GPU/src/cuda_single_body_interactions.h b/PuReMD-GPU/src/cuda_single_body_interactions.h index ce181c9d7c3814a2281f000032166c6e51b26db8..3ecd4b9a68b174624ac1dcf56aae4cec360750be 100644 --- a/PuReMD-GPU/src/cuda_single_body_interactions.h +++ b/PuReMD-GPU/src/cuda_single_body_interactions.h @@ -24,6 +24,10 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *, static_storage , simulation_data *, @@ -46,6 +50,10 @@ GLOBAL void test_LonePair_Postprocess ( reax_atom *, global_parameters , static_storage , simulation_data *, list , int , int ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/cuda_system_props.cu b/PuReMD-GPU/src/cuda_system_props.cu index 
4f19c90d37f934c50523aabadcf967170224a4c0..7ff5c11fe6d644e6878cd5e25e67b82de39ad0f5 100644 --- a/PuReMD-GPU/src/cuda_system_props.cu +++ b/PuReMD-GPU/src/cuda_system_props.cu @@ -37,7 +37,7 @@ GLOBAL void k_Kinetic_Energy_Reduction(simulation_data *, real *, int); void prep_dev_system (reax_system *system) { //copy the system atoms to the device - Sync_Host_Device ( system, cudaMemcpyHostToDevice ); + Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice ); } @@ -188,18 +188,18 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH ); local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1)); - k_compute_center_mass<<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> + k_compute_center_mass_sbp<<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> (system->reaxprm.d_sbp, system->d_atoms, partial_results, data->xcm[0], data->xcm[1], data->xcm[2], system->N); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize( ); + cudaCheckError( ); k_compute_center_mass<<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2); - cudaThreadSynchronize (); - cudaCheckError (); + cudaThreadSynchronize( ); + cudaCheckError( ); - copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__); + copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__ ); #ifdef __BUILD_DEBUG__ if (check_zero (local_results[0],xx) || @@ -253,16 +253,19 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1]; if( fabs(det) > ALMOST_ZERO ) + { rtensor_Scale( inv, 1./det, inv ); + } else + { rtensor_MakeZero( inv ); + } /* Compute the angular velocity about the centre of mass */ rtensor_MatVec( data->avcm, inv, data->amcm ); data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm ); - //free the resources - free (local_results); + free( local_results ); #if defined(DEBUG) fprintf( stderr, "xcm: %24.15e %24.15e %24.15e\n", @@ -286,7 +289,7 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, } -void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data) +void Cuda_Compute_Kinetic_Energy( reax_system *system, simulation_data *data ) { real *results = (real *) scratch; cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH); @@ -458,5 +461,3 @@ data->therm.T = ALMOST_ZERO; } } */ - - diff --git a/PuReMD-GPU/src/cuda_system_props.h b/PuReMD-GPU/src/cuda_system_props.h index bc123ddd960ebc450070651bd728b36bcb8dda38..026999e9e7bbc6ecc70b0945e7a30bd853a6cbe0 100644 --- a/PuReMD-GPU/src/cuda_system_props.h +++ b/PuReMD-GPU/src/cuda_system_props.h @@ -24,11 +24,19 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + void prep_dev_system (reax_system *system); void Cuda_Compute_Total_Mass( reax_system*, simulation_data* ); void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.cu b/PuReMD-GPU/src/cuda_three_body_interactions.cu index 384d952b8f4358936016b982b7d076a5bc62e2d4..038b88402b49e516ad4594de520a09464b4c7c8e 100644 --- a/PuReMD-GPU/src/cuda_three_body_interactions.cu 
+++ b/PuReMD-GPU/src/cuda_three_body_interactions.cu @@ -30,7 +30,7 @@ /* calculates the theta angle between i-j-k */ -DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, +DEVICE void d_Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, real *theta, real *cos_theta ) { (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk ); @@ -42,7 +42,7 @@ DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, /* calculates the derivative of the cosine of the angle between i-j-k */ -DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, +DEVICE void d_Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, rvec* dcos_theta_di, rvec* dcos_theta_dj, rvec* dcos_theta_dk ) { @@ -73,7 +73,7 @@ DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_ /* this is a 3-body interaction in which the main role is played by j which sits in the middle of the other two. */ -GLOBAL void Three_Body_Interactions( reax_atom *atoms, +GLOBAL void k_Three_Body_Interactions( reax_atom *atoms, single_body_parameters *sbp, three_body_header *d_thbp, global_parameters g_params, @@ -253,11 +253,11 @@ where i < k */ if (BOA_jk <= 0) continue; //CHANGE ORIGINAL - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, + d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, pbond_jk->dvec, pbond_jk->d, &theta, &cos_theta ); - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, + d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, pbond_jk->dvec, pbond_jk->d, &(p_ijk->dcos_di), &(p_ijk->dcos_dj), &(p_ijk->dcos_dk) ); @@ -642,7 +642,7 @@ where i < k */ } -GLOBAL void Three_Body_Interactions_results ( reax_atom *atoms, control_params *control, +GLOBAL void k_Three_Body_Interactions_results ( reax_atom *atoms, control_params *control, static_storage p_workspace, list p_bonds, int N ) { @@ -671,7 +671,7 @@ GLOBAL void Three_Body_Interactions_results ( reax_atom *atoms, control_para /* this is a 3-body interaction in which the main role is played by j which sits in the middle of the other two. 
*/ -GLOBAL void Three_Body_Estimate ( reax_atom *atoms, +GLOBAL void k_Three_Body_Estimate ( reax_atom *atoms, control_params *control, list p_bonds, int N, int *count) @@ -747,7 +747,7 @@ GLOBAL void Three_Body_Estimate ( reax_atom *atoms, } -GLOBAL void Hydrogen_Bonds(reax_atom *atoms, +GLOBAL void k_Hydrogen_Bonds(reax_atom *atoms, single_body_parameters *sbp, hbond_parameters *d_hbp, control_params *control, @@ -874,10 +874,10 @@ GLOBAL void Hydrogen_Bonds(reax_atom *atoms, hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); ++num_hb_intrs; - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &theta, &cos_theta ); // the derivative of cos(theta) - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk ); @@ -1105,7 +1105,7 @@ DEVICE void warpReduce(volatile real* sdata, int tid) -GLOBAL void Hydrogen_Bonds_HB(reax_atom *atoms, +GLOBAL void k_Hydrogen_Bonds_HB(reax_atom *atoms, single_body_parameters *sbp, hbond_parameters *d_hbp, control_params *control, @@ -1254,10 +1254,10 @@ GLOBAL void Hydrogen_Bonds_HB(reax_atom *atoms, hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]); ++num_hb_intrs; - Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &theta, &cos_theta ); // the derivative of cos(theta) - Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, + d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk, &dcos_theta_di, &dcos_theta_dj, &dcos_theta_dk ); @@ -1482,7 +1482,7 @@ GLOBAL void Hydrogen_Bonds_HB(reax_atom *atoms, } -GLOBAL void Hydrogen_Bonds_Postprocess(reax_atom *atoms, +GLOBAL void k_Hydrogen_Bonds_Postprocess(reax_atom *atoms, single_body_parameters *sbp, static_storage p_workspace, list p_bonds, list p_hbonds, list p_far_nbrs, int N, @@ -1547,7 +1547,7 @@ GLOBAL void Hydrogen_Bonds_Postprocess(reax_atom *atoms, } -GLOBAL void Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, +GLOBAL void k_Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, single_body_parameters *sbp, static_storage p_workspace, list p_bonds, list p_hbonds, list p_far_nbrs, int N ) @@ -1593,7 +1593,7 @@ GLOBAL void Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, } -GLOBAL void Hydrogen_Bonds_HNbrs(reax_atom *atoms, +GLOBAL void k_Hydrogen_Bonds_HNbrs(reax_atom *atoms, single_body_parameters *sbp, static_storage p_workspace, list p_bonds, list p_hbonds, list p_far_nbrs, int N ) diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.h b/PuReMD-GPU/src/cuda_three_body_interactions.h index 20e3baae482f481c9bc2cd11a0d7f8c4f2b3d0ae..4a87fcfe42852c50ac58a1ef52e8353cf24971a2 100644 --- a/PuReMD-GPU/src/cuda_three_body_interactions.h +++ b/PuReMD-GPU/src/cuda_three_body_interactions.h @@ -23,49 +23,49 @@ #include "mytypes.h" -DEVICE void Calculate_Theta( rvec, real, rvec, real, real*, real* ); -DEVICE void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* ); +#ifdef __cplusplus +extern "C" { +#endif + +DEVICE void d_Calculate_Theta( rvec, real, rvec, real, real*, real* ); + +DEVICE void d_Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* ); -GLOBAL void Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *, - global_parameters , control_params *, simulation_data *, - static_storage , +GLOBAL void k_Three_Body_Interactions( reax_atom *, 
single_body_parameters *, three_body_header *, + global_parameters , control_params *, simulation_data *, static_storage , list , list , int , int , real *, real *, real *, rvec *); -GLOBAL void Three_Body_Interactions_results ( reax_atom *, - control_params *, - static_storage , - list , int ); +GLOBAL void k_Three_Body_Interactions_results( reax_atom *, + control_params *, static_storage , list , int ); -GLOBAL void Three_Body_Estimate ( reax_atom *atoms, - control_params *control, - list p_bonds, int N, - int *count); +GLOBAL void k_Three_Body_Estimate( reax_atom *atoms, + control_params *control, list p_bonds, int N, int *count); -GLOBAL void Hydrogen_Bonds ( reax_atom *, +GLOBAL void k_Hydrogen_Bonds( reax_atom *, single_body_parameters *, hbond_parameters *, control_params *, simulation_data *, static_storage , list , list , int , int, real *, rvec *, rvec *); -GLOBAL void Hydrogen_Bonds_HB ( reax_atom *, +GLOBAL void k_Hydrogen_Bonds_HB( reax_atom *, single_body_parameters *, hbond_parameters *, control_params *, simulation_data *, static_storage , list , list , int , int, real *, rvec *, rvec *); -GLOBAL void Hydrogen_Bonds_Postprocess ( reax_atom *, +GLOBAL void k_Hydrogen_Bonds_Postprocess( reax_atom *, single_body_parameters *, static_storage , list, list , list , int, real * ); -GLOBAL void Hydrogen_Bonds_Far_Nbrs ( reax_atom *, - single_body_parameters *, - static_storage , list, - list , list , int ); +GLOBAL void k_Hydrogen_Bonds_Far_Nbrs( reax_atom *, + single_body_parameters *, static_storage , list, list , list , int ); -GLOBAL void Hydrogen_Bonds_HNbrs ( reax_atom *, - single_body_parameters *, - static_storage , list, - list , list , int ); +GLOBAL void k_Hydrogen_Bonds_HNbrs( reax_atom *, single_body_parameters *, + static_storage , list, list , list , int ); + +#ifdef __cplusplus +} +#endif #endif diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.cu b/PuReMD-GPU/src/cuda_two_body_interactions.cu index 386a3651b42a5367262a0897fbf7dad535d018b9..d5f6abd64b16cc4582a404566560768eca49b866 100644 --- a/PuReMD-GPU/src/cuda_two_body_interactions.cu +++ b/PuReMD-GPU/src/cuda_two_body_interactions.cu @@ -21,6 +21,7 @@ #include "cuda_two_body_interactions.h" #include "bond_orders.h" +#include "index_utils.h" #include "list.h" #include "lookup.h" #include "vector.h" @@ -191,122 +192,6 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, } -DEVICE void LR_vdW_Coulomb(global_parameters g_params, two_body_parameters *tbp, - control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types ) -{ - real p_vdW1 = g_params.l[28]; - real p_vdW1i = 1.0 / p_vdW1; - real powr_vdW1, powgi_vdW1; - real tmp, fn13, exp1, exp2; - real Tap, dTap, dfn13; - real dr3gamij_1, dr3gamij_3; - real e_core, de_core; - two_body_parameters *twbp; - - twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]); - e_core = 0; - de_core = 0; - - /* calculate taper and its derivative */ - Tap = control->Tap7 * r_ij + control->Tap6; - Tap = Tap * r_ij + control->Tap5; - Tap = Tap * r_ij + control->Tap4; - Tap = Tap * r_ij + control->Tap3; - Tap = Tap * r_ij + control->Tap2; - Tap = Tap * r_ij + control->Tap1; - Tap = Tap * r_ij + control->Tap0; - - dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6; - dTap = dTap * r_ij + 5 * control->Tap5; - dTap = dTap * r_ij + 4 * control->Tap4; - dTap = dTap * r_ij + 3 * control->Tap3; - dTap = dTap * r_ij + 2 * control->Tap2; - dTap += control->Tap1 / r_ij; - - - /* vdWaals calculations */ - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = 
POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ - Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n", - Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), - powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */ - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); - - lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - - /*vdWaals Calculations*/ - if (g_params.vdw_type == 1 || g_params.vdw_type == 3) - { - // shielding - powr_vdW1 = POW(r_ij, p_vdW1); - powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); - - fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); - exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - - dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * - POW(r_ij, p_vdW1 - 2.0); - - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; - } - else // no shielding - { - exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); - - lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); - - lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - - Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); - } - - if (g_params.vdw_type == 2 || g_params.vdw_type == 3) - { - // innner wall - e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore))); - lr->e_vdW += Tap * e_core; - - de_core = -(twbp->acore / twbp->rcore) * e_core; - lr->CEvd += dTap * e_core + Tap * de_core; - } - - /* Coulomb calculations */ - dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); - dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); - - tmp = Tap / dr3gamij_3; - lr->H = EV_to_KCALpMOL * tmp; - lr->e_ele = C_ele * tmp; - /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\ - Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n", - i, system->atoms[i].type, j, system->atoms[j].type, - twbp->gamma, Tap, dr3gamij_3, - system->atoms[i].q, system->atoms[j].q ); */ - - lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; - /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n", - i+1, j+1, r_ij, e_vdW, CEvd * r_ij, - system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */ - - /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n", - i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */ -} - - - /* GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.h b/PuReMD-GPU/src/cuda_two_body_interactions.h index ac21c58430a1ca7879563b7687eab57f88f2811b..fe3e273775f67e17f88d742a52707f72bfbac56c 100644 --- a/PuReMD-GPU/src/cuda_two_body_interactions.h +++ b/PuReMD-GPU/src/cuda_two_body_interactions.h @@ -23,28 +23,149 @@ #include "mytypes.h" +#include "index_utils.h" -GLOBAL void Cuda_Bond_Energy ( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *, - simulation_data *, static_storage , list , int , int, real *); + +#ifdef __cplusplus +extern "C" { +#endif + +GLOBAL void Cuda_Bond_Energy( reax_atom *, global_parameters , single_body_parameters *, 
two_body_parameters *, + simulation_data *, static_storage , list , int , int, real * ); GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *, two_body_parameters *, global_parameters , control_params *, simulation_data *, list , real *, real *, rvec *, int , int ); -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy ( reax_atom *, control_params *, simulation_data *, +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *, control_params *, simulation_data *, list , real *, real *, rvec *, LR_lookup_table *, int , int , int ) ; -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1 ( reax_atom *, control_params *, simulation_data *, +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *, control_params *, simulation_data *, list , real *, real *, rvec *, LR_lookup_table *, int , int , int ) ; -GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2 ( reax_atom *, control_params *, simulation_data *, +GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *, control_params *, simulation_data *, list , real *, real *, rvec *, LR_lookup_table *, int , int , int ) ; -DEVICE void LR_vdW_Coulomb( global_parameters, two_body_parameters *, - control_params *, int , int , real , LR_data * , int); +static DEVICE void d_LR_vdW_Coulomb( global_parameters g_params, two_body_parameters *tbp, + control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types ) +{ + real p_vdW1 = g_params.l[28]; + real p_vdW1i = 1.0 / p_vdW1; + real powr_vdW1, powgi_vdW1; + real tmp, fn13, exp1, exp2; + real Tap, dTap, dfn13; + real dr3gamij_1, dr3gamij_3; + real e_core, de_core; + two_body_parameters *twbp; + + twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]); + e_core = 0; + de_core = 0; + + /* calculate taper and its derivative */ + Tap = control->Tap7 * r_ij + control->Tap6; + Tap = Tap * r_ij + control->Tap5; + Tap = Tap * r_ij + control->Tap4; + Tap = Tap * r_ij + control->Tap3; + Tap = Tap * r_ij + control->Tap2; + Tap = Tap * r_ij + control->Tap1; + Tap = Tap * r_ij + control->Tap0; + + dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6; + dTap = dTap * r_ij + 5 * control->Tap5; + dTap = dTap * r_ij + 4 * control->Tap4; + dTap = dTap * r_ij + 3 * control->Tap3; + dTap = dTap * r_ij + 2 * control->Tap2; + dTap += control->Tap1 / r_ij; + + + /* vdWaals calculations */ + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\ + Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n", + Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), + powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */ + + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0); + + lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; + + /*vdWaals Calculations*/ + if (g_params.vdw_type == 1 || g_params.vdw_type == 3) + { + // shielding + powr_vdW1 = POW(r_ij, p_vdW1); + powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1); + + fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i ); + exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) ); + + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + + dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * + POW(r_ij, p_vdW1 
- 2.0); + + lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13; + } + else // no shielding + { + exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) ); + + lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2); + + lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - + Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); + } + + if (g_params.vdw_type == 2 || g_params.vdw_type == 3) + { + // innner wall + e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore))); + lr->e_vdW += Tap * e_core; + + de_core = -(twbp->acore / twbp->rcore) * e_core; + lr->CEvd += dTap * e_core + Tap * de_core; + } + + /* Coulomb calculations */ + dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma ); + dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); + + tmp = Tap / dr3gamij_3; + lr->H = EV_to_KCALpMOL * tmp; + lr->e_ele = C_ele * tmp; + /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\ + Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n", + i, system->atoms[i].type, j, system->atoms[j].type, + twbp->gamma, Tap, dr3gamij_3, + system->atoms[i].q, system->atoms[j].q ); */ + + lr->CEclmb = C_ele * ( dTap - Tap * r_ij / dr3gamij_1 ) / dr3gamij_3; + /* fprintf( stdout, "%d %d\t%g\t%g %g\t%g %g\t%g %g\n", + i+1, j+1, r_ij, e_vdW, CEvd * r_ij, + system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */ + + /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n", + i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */ +} + +#ifdef __cplusplus +} +#endif #endif diff --git a/PuReMD-GPU/src/cuda_utils.h b/PuReMD-GPU/src/cuda_utils.h index 8150fed70a742c5419b1fef361f06d52f989329e..c8976d081bbead0048faa7dd12238c40f2eda5d7 100644 --- a/PuReMD-GPU/src/cuda_utils.h +++ b/PuReMD-GPU/src/cuda_utils.h @@ -29,6 +29,10 @@ #define IDX2C(i,j,ld) (((j)*(ld))+(i)) +#ifdef __cplusplus +extern "C" { +#endif + static __inline__ void modify( cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta ) { cublasSscal( handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm ); @@ -47,9 +51,9 @@ void compute_nearest_pow_2( int blocks, int *result ); void print_device_mem_usage( ); #define cusparseCheckError(cusparseStatus) __cusparseCheckError (cusparseStatus, __FILE__, __LINE__) -inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line ) +static inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line ) { - if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) + if ( cusparseStatus != CUSPARSE_STATUS_SUCCESS ) { fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cusparseStatus); exit (-1); @@ -59,9 +63,9 @@ inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *f #define cublasCheckError(cublasStatus) __cublasCheckError (cublasStatus, __FILE__, __LINE__) -inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line ) +static inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line ) { - if( cublasStatus != CUBLAS_STATUS_SUCCESS ) + if ( cublasStatus != CUBLAS_STATUS_SUCCESS ) { fprintf( stderr, "failed .. 
%s:%d -- error code %d \n", __FILE__, __LINE__, cublasStatus ); exit( -1 ); @@ -69,8 +73,9 @@ inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, c return; } + #define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) -inline void __cudaCheckError( const char *file, const int line ) +static inline void __cudaCheckError( const char *file, const int line ) { cudaError err = cudaGetLastError( ); if ( cudaSuccess != err ) @@ -91,5 +96,9 @@ inline void __cudaCheckError( const char *file, const int line ) return; } +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/grid.h b/PuReMD-GPU/src/grid.h index 273f56c5deedc14eb579a9156c7e2c6138635b75..8f26f018e947bc2ebe1789736d8203dc435addfd 100644 --- a/PuReMD-GPU/src/grid.h +++ b/PuReMD-GPU/src/grid.h @@ -23,6 +23,11 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Setup_Grid( reax_system* ); void Update_Grid( reax_system* ); @@ -35,4 +40,9 @@ void Bin_Atoms( reax_system*, static_storage* ); void Reset_Marks( grid*, ivec*, int ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PuReMD-GPU/src/init_md.c b/PuReMD-GPU/src/init_md.c index 6e9ffe974a700de207c126f9948d6aca0f18621d..2a2ce1270e2c694722e489b9a3f38f8dd48177a1 100644 --- a/PuReMD-GPU/src/init_md.c +++ b/PuReMD-GPU/src/init_md.c @@ -351,7 +351,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data * int index = 0; int count = 0; int jicount = 0; - int end_index, gpu_index, gpu_end, k; + int i, j, end_index, gpu_index, gpu_end, k; far_neighbor_data gpu, cpu; /* @@ -370,12 +370,12 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data * */ - for (int i = 0; i < N; i++){ + for (i = 0; i < N; i++){ index = Start_Index (i, slist); //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]); - for (int j = start[i]; j < end[i]; j++){ + for (j = start[i]; j < end[i]; j++){ gpu = data[j]; if (i < data[j].nbr) continue; diff --git a/PuReMD-GPU/src/init_md.h b/PuReMD-GPU/src/init_md.h index 5eea3d8c554e44bdce185c50891c48f67e69f9c8..8c23806594a8f2b107ddb884efbf68e7b5fe27ff 100644 --- a/PuReMD-GPU/src/init_md.h +++ b/PuReMD-GPU/src/init_md.h @@ -24,6 +24,10 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + void Initialize( reax_system*, control_params*, simulation_data*, static_storage*, list**, output_controls*, evolve_function* ); @@ -32,5 +36,9 @@ void Generate_Initial_Velocities(reax_system *, real ); void Init_Out_Controls(reax_system *, control_params *, static_storage *, output_controls *); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/lin_alg.h b/PuReMD-GPU/src/lin_alg.h index 3fe9609cf373a113a17b40e1e5e6762276277399..a515a959494a6eca40fe9f338d2a08118ff3e39a 100644 --- a/PuReMD-GPU/src/lin_alg.h +++ b/PuReMD-GPU/src/lin_alg.h @@ -25,6 +25,7 @@ #include "mytypes.h" + int GMRES( static_storage*, sparse_matrix*, real*, real, real*, FILE* , reax_system* ); @@ -43,4 +44,5 @@ int CG( static_storage*, sparse_matrix*, int uyduruk_GMRES( static_storage*, sparse_matrix*, real*, real, real*, int, FILE*, reax_system* ); + #endif diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h index a4d0aac132005967ca3c41d92264dd8d4d18e761..b90c41419271ca6b859be08ea4005fbe9107c029 100644 --- a/PuReMD-GPU/src/list.h +++ b/PuReMD-GPU/src/list.h @@ -28,31 +28,31 @@ char Make_List( int, int, int, list* ); void Delete_List( list* ); -inline HOST_DEVICE int Num_Entries(int i, list* l) +static inline HOST_DEVICE 
int Num_Entries(int i, list* l) { return l->end_index[i] - l->index[i]; } -inline HOST_DEVICE int Start_Index(int i, list *l ) +static inline HOST_DEVICE int Start_Index(int i, list *l ) { return l->index[i]; } -inline HOST_DEVICE int End_Index( int i, list *l ) +static inline HOST_DEVICE int End_Index( int i, list *l ) { return l->end_index[i]; } -inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l) +static inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l) { l->index[i] = val; } -inline HOST_DEVICE void Set_End_Index(int i, int val, list *l) +static inline HOST_DEVICE void Set_End_Index(int i, int val, list *l) { l->end_index[i] = val; } diff --git a/PuReMD-GPU/src/lookup.h b/PuReMD-GPU/src/lookup.h index 0c175e8a6c1e96a5eb87ab29d43dda6fa7d38d7b..7dac3e4f41764caa53cf0e2313f8c566f4e0e634 100644 --- a/PuReMD-GPU/src/lookup.h +++ b/PuReMD-GPU/src/lookup.h @@ -23,10 +23,20 @@ #include "mytypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + void Make_Lookup_Table( real, real, int, lookup_function, lookup_table* ); int Lookup_Index_Of( real, lookup_table* ); real Lookup( real, lookup_table* ); void Make_LR_Lookup_Table( reax_system*, control_params* ); +#ifdef __cplusplus +} +#endif + + #endif diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h index 25902c040df2ab8ff60dcda09298915e9db7f0a2..273f95976159c55461a84668c4a5f2494f1d9522 100644 --- a/PuReMD-GPU/src/mytypes.h +++ b/PuReMD-GPU/src/mytypes.h @@ -28,7 +28,10 @@ #define GLOBAL __global__ #define HOST_DEVICE __host__ __device__ + #include <cuda_runtime.h> #include <cuda.h> + #include <cuda_runtime_api.h> + #include <cublas_v2.h> #include <cusparse_v2.h> #if __CUDA_ARCH__ < 600 @@ -1145,14 +1148,5 @@ extern void *scratch; extern int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE; extern int MATVEC_BLOCKS; -#ifdef __CUDACC__ -extern cublasStatus_t cublasStatus; -extern cublasHandle_t cublasHandle; - -extern cusparseHandle_t cusparseHandle; -extern cusparseStatus_t cusparseStatus; -extern cusparseMatDescr_t matdescriptor; -#endif - #endif diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h index 4b698b78a9d74464c9af7ab333f642bf07a0667b..b19bc58e3dcef04a324b108be718bfbff3e5c06c 100644 --- a/PuReMD-GPU/src/random.h +++ b/PuReMD-GPU/src/random.h @@ -28,7 +28,7 @@ large periodicity for generation of pseudo random number. function Random returns this random number appropriately scaled so that 0 <= Random(range) < range */ -HOST_DEVICE inline double Random(double range) +static inline HOST_DEVICE double Random(double range) { return (random() * range) / 2147483647L; } @@ -37,7 +37,7 @@ HOST_DEVICE inline double Random(double range) /* This function seeds the system pseudo random number generator with current time. 
Use this function once in the begining to initialize the system */ -HOST_DEVICE inline void Randomize( ) +static inline HOST_DEVICE void Randomize( ) { srandom( time(NULL) ); } @@ -45,7 +45,7 @@ HOST_DEVICE inline void Randomize( ) /* GRandom return random number with gaussian distribution with mean and standard deviation "sigma" */ -HOST_DEVICE inline double GRandom(double mean, double sigma) +static inline HOST_DEVICE double GRandom(double mean, double sigma) { double v1 = Random(2.0) - 1.0; double v2 = Random(2.0) - 1.0; @@ -61,4 +61,5 @@ HOST_DEVICE inline double GRandom(double mean, double sigma) return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq); } + #endif diff --git a/PuReMD-GPU/src/reset_utils.h b/PuReMD-GPU/src/reset_utils.h index 796a1508afdef4cf9b9297e37879426a03faeefd..190bd7f5632f3b0a2b3291edf82e03aa4922793e 100644 --- a/PuReMD-GPU/src/reset_utils.h +++ b/PuReMD-GPU/src/reset_utils.h @@ -24,6 +24,10 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + void Reset_Atoms( reax_system* ); void Reset_Pressures( simulation_data* ); @@ -48,5 +52,9 @@ void Reset_Grid( grid* ); void Reset_Marks( grid*, ivec*, int ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c index 491b0d93f22aab1cdda29311643434f42694398a..0126b86b776dce8fd30aea0c228731b95104b216 100644 --- a/PuReMD-GPU/src/system_props.c +++ b/PuReMD-GPU/src/system_props.c @@ -24,7 +24,7 @@ #include "vector.h" -real Get_Time( ) +HOST real Get_Time( ) { struct timeval tim; @@ -33,7 +33,7 @@ real Get_Time( ) } -real Get_Timing_Info( real t_start ) +HOST real Get_Timing_Info( real t_start ) { struct timeval tim; real t_end; diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h index e2f7cb1aeb51696bf332d50c6f34fa4af84ed561..874132451d02b2d62d87c82065874f04a35b2d37 100644 --- a/PuReMD-GPU/src/system_props.h +++ b/PuReMD-GPU/src/system_props.h @@ -24,6 +24,10 @@ #include "mytypes.h" +#ifdef __cplusplus +extern "C" { +#endif + real Get_Time( ); real Get_Timing_Info( real ); @@ -40,5 +44,9 @@ void Compute_Pressure( reax_system*, simulation_data*, static_storage* ); void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/PuReMD-GPU/src/testmd.c b/PuReMD-GPU/src/testmd.c index d33a0785a1a26322f5e999ae4dc4cf6953bc70f8..afe0cd4a5cf7e7282d7e2409f7f0edc66c34296a 100644 --- a/PuReMD-GPU/src/testmd.c +++ b/PuReMD-GPU/src/testmd.c @@ -213,17 +213,17 @@ int main( int argc, char* argv[] ) &out_control, &Evolve ); #endif - t_start = Get_Time (); + t_start = Get_Time( ); Cuda_Initialize( &system, &control, &data, &workspace, &lists, &out_control, &Cuda_Evolve ); t_elapsed = Get_Timing_Info( t_start ); #ifdef __DEBUG_CUDA__ - fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed ); + fprintf( stderr, " Cuda Initialize timing ---> %f \n", t_elapsed ); #endif #ifdef __CUDA_MEM__ - print_device_mem_usage (); + print_device_mem_usage( ); #endif #ifdef __BUILD_DEBUG__ @@ -233,11 +233,11 @@ int main( int argc, char* argv[] ) Cuda_Reset( &system, &control, &data, &workspace, &lists ); #ifdef __BUILD_DEBUG__ - Generate_Neighbor_Lists ( &system, &control, &data, &workspace, + Generate_Neighbor_Lists( &system, &control, &data, &workspace, &lists, &out_control ); #endif - Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, FALSE); + Cuda_Generate_Neighbor_Lists( &system, &workspace, &control, FALSE ); #ifdef __BUILD_DEBUG__ 
Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control); @@ -297,8 +297,8 @@ int main( int argc, char* argv[] ) //fprintf (stderr, "Post Evolve done \n"); #ifndef __BUILD_DEBUG__ - Prep_Device_For_Output ( &system, &data ); - Output_Results(&system, &control, &data, &workspace, &lists, &out_control); + Cuda_Setup_Output( &system, &data ); + Output_Results( &system, &control, &data, &workspace, &lists, &out_control ); //Analysis( &system, &control, &data, &workspace, &lists, &out_control ); steps = data.step - data.prev_steps; diff --git a/PuReMD-GPU/src/traj.c b/PuReMD-GPU/src/traj.c index 1a8a150ed571a069df8ceb16928a5506eb458f61..2844c370ee79702ed0c75d090afe545149aae185 100644 --- a/PuReMD-GPU/src/traj.c +++ b/PuReMD-GPU/src/traj.c @@ -213,7 +213,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control, #ifdef __PRINT_CPU_RESULTS__ //fprintf (stderr, "Synching bonds from device for printing ....\n"); - Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); + Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND ); #endif for( i = 0; i < system->N; ++i ) @@ -245,10 +245,10 @@ int Append_Custom_Frame( reax_system *system, control_params *control, #ifdef __PRINT_CPU_RESULTS__ //fprintf (stderr, "Synching three bodies from deivce for printing ... \n"); - Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY ); + Sync_Host_Device_List( thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY ); if ( !write_bonds) { //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n"); - Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND ); + Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND ); } #endif diff --git a/PuReMD-GPU/src/two_body_interactions.c b/PuReMD-GPU/src/two_body_interactions.c index d7a491789b05e9c1a855b31b60ba79dab573e271..2e7a6daf9039ea26c22b2fcfda5913e46255ad75 100644 --- a/PuReMD-GPU/src/two_body_interactions.c +++ b/PuReMD-GPU/src/two_body_interactions.c @@ -26,8 +26,6 @@ #include "vector.h" #include "index_utils.h" -#include "cuda_helpers.h" - void Bond_Energy( reax_system *system, control_params *control, simulation_data *data, static_storage *workspace, diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h index ee1375ee0816bcefbf1f335bfdaeccf8fcfed3f3..e1111e514928e79fc79197a0f2486d5eefb1cfa3 100644 --- a/PuReMD-GPU/src/vector.h +++ b/PuReMD-GPU/src/vector.h @@ -26,6 +26,10 @@ #include "random.h" +#ifdef __cplusplus +extern "C" { +#endif + int Vector_isZero( real*, int ); void Vector_MakeZero( real*, int ); void Vector_Copy( real*, real*, int ); @@ -67,7 +71,7 @@ void ivec_MakeZero( ivec ); void ivec_rScale( ivec, real, rvec ); -HOST_DEVICE inline real Dot( real* v1, real* v2, int k ) +static inline HOST_DEVICE real Dot( real* v1, real* v2, int k ) { real ret = 0; @@ -81,13 +85,13 @@ HOST_DEVICE inline real Dot( real* v1, real* v2, int k ) ///////////////////////////// //rvec functions ///////////////////////////// -HOST_DEVICE inline void rvec_MakeZero( rvec v ) +static inline HOST_DEVICE void rvec_MakeZero( rvec v ) { v[0] = v[1] = v[2] = ZERO; } -HOST_DEVICE inline void rvec_Add( rvec ret, rvec v ) +static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v ) { ret[0] += v[0]; ret[1] += v[1]; @@ -95,13 +99,13 @@ HOST_DEVICE inline void rvec_Add( rvec ret, rvec v ) } -HOST_DEVICE inline void rvec_Copy( rvec dest, rvec src ) +static inline HOST_DEVICE void rvec_Copy( rvec dest, rvec src ) { dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } -HOST_DEVICE inline void 
rvec_Cross( rvec ret, rvec v1, rvec v2 ) +static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 ) { ret[0] = v1[1] * v2[2] - v1[2] * v2[1]; ret[1] = v1[2] * v2[0] - v1[0] * v2[2]; @@ -109,13 +113,13 @@ HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 ) } -HOST_DEVICE inline void rvec_ScaledAdd( rvec ret, real c, rvec v ) +static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, real c, rvec v ) { ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2]; } -HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 ) +static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 ) { ret[0] = c1 * v1[0] + c2 * v2[0]; ret[1] = c1 * v1[1] + c2 * v2[1]; @@ -123,7 +127,7 @@ HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rv } -HOST_DEVICE inline void rvec_Random( rvec v ) +static inline HOST_DEVICE void rvec_Random( rvec v ) { v[0] = Random(2.0) - 1.0; v[1] = Random(2.0) - 1.0; @@ -131,25 +135,25 @@ HOST_DEVICE inline void rvec_Random( rvec v ) } -HOST_DEVICE inline real rvec_Norm_Sqr( rvec v ) +static inline HOST_DEVICE real rvec_Norm_Sqr( rvec v ) { return SQR(v[0]) + SQR(v[1]) + SQR(v[2]); } -HOST_DEVICE inline void rvec_Scale( rvec ret, real c, rvec v ) +static inline HOST_DEVICE void rvec_Scale( rvec ret, real c, rvec v ) { ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2]; } -HOST_DEVICE inline real rvec_Dot( rvec v1, rvec v2 ) +static inline HOST_DEVICE real rvec_Dot( rvec v1, rvec v2 ) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } -HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) +static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) { r[0] = v1[0] * v2[0]; r[1] = v1[1] * v2[1]; @@ -157,7 +161,7 @@ HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 ) } -HOST_DEVICE inline real rvec_Norm( rvec v ) +static inline HOST_DEVICE real rvec_Norm( rvec v ) { return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) ); } @@ -166,13 +170,13 @@ HOST_DEVICE inline real rvec_Norm( rvec v ) ///////////////// //ivec functions ///////////////// -HOST_DEVICE inline void ivec_Copy( ivec dest , ivec src ) +static inline HOST_DEVICE void ivec_Copy( ivec dest , ivec src ) { dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; } -HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src ) +static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src ) { dest[0] = C * src[0]; dest[1] = C * src[1]; @@ -180,7 +184,7 @@ HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src ) } -HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 ) +static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 ) { dest[0] = v1[0] + v2[0]; dest[1] = v1[1] + v2[1]; @@ -191,24 +195,29 @@ HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 ) ///////////////// //vector functions ///////////////// -HOST_DEVICE inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) +static inline HOST_DEVICE void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) { for (k--; k >= 0; k--) dest[k] = c * v[k] + d * y[k]; } -HOST_DEVICE inline void Vector_Scale( real* dest, real c, real* v, int k ) +static inline HOST_DEVICE void Vector_Scale( real* dest, real c, real* v, int k ) { for (k--; k >= 0; k--) dest[k] = c * v[k]; } -HOST_DEVICE inline void Vector_Add( real* dest, real c, real* v, int k ) +static inline HOST_DEVICE void Vector_Add( real* dest, real c, real* v, int k ) { for (k--; k 
>= 0; k--) dest[k] += c * v[k]; } +#ifdef __cplusplus +} +#endif + + #endif
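The bulk of the header edits in this patch wrap each set of declarations in extern "C" guards so that the prototypes resolve to the same, unmangled symbols whether they are included from the plain C sources or from the nvcc-compiled .cu files. A minimal sketch of the pattern follows; the header and function name here are hypothetical, used only to illustrate the guard, not taken from the patch:

    /* example.h -- illustrative sketch of the guard pattern applied above */
    #ifndef __EXAMPLE_H__
    #define __EXAMPLE_H__

    #include "mytypes.h"

    #ifdef __cplusplus
    extern "C" {
    #endif

    /* C linkage: the .c and .cu translation units both see one unmangled symbol */
    void Example_Routine( reax_system*, control_params* );

    #ifdef __cplusplus
    }
    #endif

    #endif

The guard is invisible to the C compiler (the __cplusplus branch is never taken) and only changes how the C++/CUDA side declares and links against these functions.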
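A side effect of giving these declarations C linkage is that function overloading is no longer available, which is presumably why the integer overload of Cuda_reduction is renamed to Cuda_reduction_int, and why device helpers that shared a name with host-side routines (Are_Far_Neighbors, Calculate_Theta, Calculate_dCos_Theta, and LR_vdW_Coulomb, the latter now defined in the header as a static DEVICE function) gain a d_ prefix while kernels gain a k_ prefix. The reduction pair from cuda_reduction.h shows the distinction:

    /* with extern "C" in effect these must carry distinct names;
       a single overloaded Cuda_reduction would not compile or link */
    GLOBAL void Cuda_reduction( const real *input, real *per_block_results, const size_t n );
    GLOBAL void Cuda_reduction_int( const int *input, int *per_block_results, const size_t n );

For a similar reason the header-defined helpers (Dot, the rvec_* and ivec_* routines, Random, GRandom, and the list index accessors) are switched from plain inline to static inline HOST_DEVICE, so each translation unit keeps its own internal copy of these small functions instead of emitting competing external definitions when the headers are pulled into both C and CUDA objects.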