From b73145d43b3d86222a3e9ed84bf1c195d8177057 Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Tue, 11 Jul 2017 13:29:09 -0400 Subject: [PATCH] PG-PuReMD: fix nvcc build issue where an incorrect C++ host compiler was used. Fix a race condition in the three-body list computation. Remove debug code. Other misc. refactoring. --- PG-PuReMD/aclocal.m4 | 1 + PG-PuReMD/configure.ac | 25 ++++-- PG-PuReMD/src/cuda_allocate.cu | 69 +++++++++++---- PG-PuReMD/src/cuda_forces.cu | 80 ++++++++++++------ PG-PuReMD/src/cuda_forces.h | 2 + PG-PuReMD/src/cuda_hydrogen_bonds.cu | 32 +++++-- PG-PuReMD/src/cuda_lin_alg.cu | 7 +- PG-PuReMD/src/cuda_neighbors.cu | 41 ++++----- PG-PuReMD/src/cuda_reduction.cu | 16 ++-- PG-PuReMD/src/cuda_valence_angles.cu | 122 ++++++++++----------------- PG-PuReMD/src/forces.c | 9 +- PG-PuReMD/src/init_md.c | 23 +---- PG-PuReMD/src/integrate.c | 12 --- PG-PuReMD/src/io_tools.c | 2 +- PG-PuReMD/src/io_tools.h | 3 + PG-PuReMD/src/parallelreax.c | 12 ++- 16 files changed, 246 insertions(+), 210 deletions(-) diff --git a/PG-PuReMD/aclocal.m4 b/PG-PuReMD/aclocal.m4 index 1e5be5b9..06c98b55 100644 --- a/PG-PuReMD/aclocal.m4 +++ b/PG-PuReMD/aclocal.m4 @@ -1151,4 +1151,5 @@ AC_SUBST([am__untar]) ]) # _AM_PROG_TAR m4_include([../m4/acx_mpi.m4]) +m4_include([../m4/ax_compiler_vendor.m4]) m4_include([../m4/ax_cuda.m4]) diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac index d4cb0d4d..6fc8c567 100644 --- a/PG-PuReMD/configure.ac +++ b/PG-PuReMD/configure.ac @@ -13,21 +13,23 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])], [AC_SUBST([AM_DEFAULT_VERB AC_CONFIG_MACRO_DIR([../m4]) -AC_LANG([C]) - -AC_CONFIG_SRCDIR([src/torsion_angles.h]) -AC_CONFIG_HEADERS([src/config.h]) - # Headline formatter AC_DEFUN([CONFIGURE_HEADLINE], [ echo; echo "+++ $1 +++" ]) +AC_LANG([C]) + # Checks for programs. AC_PROG_CC([icc gcc cc]) AC_PROG_CPP +AX_COMPILER_VENDOR + +AC_CONFIG_SRCDIR([src/torsion_angles.h]) +AC_CONFIG_HEADERS([src/config.h]) + # Checks for libraries. AC_SEARCH_LIBS([exp], [m]) AC_SEARCH_LIBS([sqrt], [m]) @@ -111,11 +113,24 @@ fi AC_SUBST(MPI_CFLAGS) AC_SUBST(MPI_LIBS) +AC_LANG([C++]) + +# Checks for programs. +AC_PROG_CXX([icpc g++ c++]) +AC_PROG_CXXCPP + +AX_COMPILER_VENDOR + # Check for CUDA support.
if test "x$BUILD_GPU" = "xyes"; then CONFIGURE_HEADLINE([ CUDA support ]) AX_CUDA + NVCCFLAGS= + if test "$ax_cv_cxx_compiler_vendor" = "intel" + then + NVCCFLAGS+=" -ccbin=icpc" + fi if test "BUILD_DEBUG" = "true" then NVCCFLAGS+=" -g -G" diff --git a/PG-PuReMD/src/cuda_allocate.cu b/PG-PuReMD/src/cuda_allocate.cu index dcf99114..8d2404f4 100644 --- a/PG-PuReMD/src/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda_allocate.cu @@ -66,13 +66,19 @@ void dev_alloc_grid( reax_system *system ) ivec_Copy( device->ghost_hbond_span, host->ghost_hbond_span ); ivec_Copy( device->ghost_bond_span, host->ghost_bond_span ); - cuda_malloc( (void **) &device->str, sizeof(int) * total, TRUE, "grid:str" ); - cuda_malloc( (void **) &device->end, sizeof(int) * total, TRUE, "grid:end" ); - cuda_malloc( (void **) &device->cutoff, sizeof(real) * total, TRUE, "grid:cutoff" ); - - cuda_malloc( (void **) &device->nbrs_x, sizeof(ivec) * total * host->max_nbrs, TRUE, "grid:nbrs_x" ); - cuda_malloc( (void **) &device->nbrs_cp, sizeof(rvec) * total * host->max_nbrs, TRUE, "grid:nbrs_cp" ); - cuda_malloc( (void **) &device->rel_box, sizeof(ivec) * total, TRUE, "grid:rel_box" ); + cuda_malloc( (void **) &device->str, sizeof(int) * total, TRUE, + "dev_alloc_grid::grid->str" ); + cuda_malloc( (void **) &device->end, sizeof(int) * total, TRUE, + "dev_alloc_grid::grid->end" ); + cuda_malloc( (void **) &device->cutoff, sizeof(real) * total, TRUE, + "dev_alloc_grid::grid->cutoff" ); + + cuda_malloc( (void **) &device->nbrs_x, sizeof(ivec) * total * host->max_nbrs, + TRUE, "dev_alloc_grid::grid->nbrs_x" ); + cuda_malloc( (void **) &device->nbrs_cp, sizeof(rvec) * total * host->max_nbrs, + TRUE, "dev_alloc_grid::grid->nbrs_cp" ); + cuda_malloc( (void **) &device->rel_box, sizeof(ivec) * total, + TRUE, "dev_alloc_grid::grid->rel_box" ); /* int block_size = 512; @@ -129,8 +135,10 @@ void dev_dealloc_grid_cell_atoms( reax_system *system ) for (int i = 0; i < total; i++) { copy_host_device( &local_cell, &device->cells[i], - sizeof(grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc" ); - cuda_free( local_cell.atoms, "grid_cell:atoms" ); + sizeof(grid_cell), cudaMemcpyDeviceToHost, + "dev_dealloc_grid_cell_atoms::grid" ); + cuda_free( local_cell.atoms, + "dev_dealloc_grid_cell_atoms::grid_cell.atoms" ); } } @@ -239,31 +247,63 @@ void dev_alloc_system( reax_system *system ) void dev_realloc_system( reax_system *system, int local_cap, int total_cap, char *msg ) { + int *temp; + + temp = (int *) scratch; + /* free the existing storage for atoms, leave other info allocated */ cuda_free( system->d_my_atoms, "system::d_my_atoms" ); cuda_malloc( (void **) &system->d_my_atoms, sizeof(reax_atom) * total_cap, TRUE, "system::d_my_atoms" ); + //TODO: record old total_cap before increase, use here + copy_device( temp, system->d_far_nbrs, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); cuda_free( system->d_far_nbrs, "system::d_far_nbrs" ); cuda_malloc( (void **) &system->d_far_nbrs, system->total_cap * sizeof(int), TRUE, "system::d_far_nbrs" ); + copy_device( system->d_far_nbrs, temp, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); + + copy_device( temp, system->d_max_far_nbrs, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); cuda_free( system->d_max_far_nbrs, "system::d_max_far_nbrs" ); cuda_malloc( (void **) &system->d_max_far_nbrs, system->total_cap * sizeof(int), TRUE, "system::d_max_far_nbrs" ); + copy_device( system->d_max_far_nbrs, temp, system->total_cap * sizeof(int), + 
"dev_realloc_system::temp" ); + copy_device( temp, system->d_bonds, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); cuda_free( system->d_bonds, "system::d_bonds" ); cuda_malloc( (void **) &system->d_bonds, system->total_cap * sizeof(int), TRUE, "system::d_bonds" ); + copy_device( system->d_bonds, temp, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); + + copy_device( temp, system->d_max_bonds, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); cuda_free( system->d_max_bonds, "system::d_max_bonds" ); cuda_malloc( (void **) &system->d_max_bonds, system->total_cap * sizeof(int), TRUE, "system::d_max_bonds" ); + copy_device( system->d_max_bonds, temp, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); + copy_device( temp, system->d_hbonds, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); cuda_free( system->d_hbonds, "system::d_hbonds" ); cuda_malloc( (void **) &system->d_hbonds, system->total_cap * sizeof(int), TRUE, "system::d_hbonds" ); + copy_device( system->d_hbonds, temp, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); + + copy_device( temp, system->d_max_hbonds, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); cuda_free( system->d_max_hbonds, "system::d_max_hbonds" ); cuda_malloc( (void **) &system->d_max_hbonds, system->total_cap * sizeof(int), TRUE, "system::d_max_hbonds" ); + copy_device( system->d_max_hbonds, temp, system->total_cap * sizeof(int), + "dev_realloc_system::temp" ); } @@ -626,7 +666,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, realloc->far_nbrs = FALSE; } } - fprintf( stderr, " [CUDA_REALLOCATE::far nbrs]\n" ); /* charge coef matrix */ //if( nflag || realloc->Htop >= system->max_sparse_entries * DANGER_ZONE ) { @@ -673,7 +712,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, //workspace->U = NULL; realloc->Htop = 0; } - fprintf( stderr, " [CUDA_REALLOCATE::charge matrix\n" ); /* hydrogen bonds list */ if ( control->hbond_cut > 0.0 && system->numH > 0 ) @@ -692,7 +730,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, realloc->hbonds = FALSE; } } - fprintf( stderr, " [CUDA_REALLOCATE::hbonds\n" ); /* bonds list */ if ( Nflag == TRUE || realloc->bonds == TRUE ) @@ -707,7 +744,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, Cuda_Init_Bond_Indices( system ); realloc->bonds = FALSE; } - fprintf( stderr, " [CUDA_REALLOCATE::bonds\n" ); /* 3-body list */ if ( Nflag == TRUE || realloc->num_3body > 0 ) @@ -723,7 +759,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, (*dev_lists + BONDS)->num_intrs, system->total_thbodies ); realloc->num_3body = -1; } - fprintf( stderr, " [CUDA_REALLOCATE::thbody\n" ); /* grid */ if ( renbr && realloc->gcell_atoms > -1 ) @@ -752,11 +787,10 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY ); //FIX - 1 - Tested the reallocation logic - //dev_dealloc_grid_cell_atoms (system); - //dev_alloc_grid_cell_atoms (system, realloc->gcell_atoms); + //dev_dealloc_grid_cell_atoms( system ); + //dev_alloc_grid_cell_atoms( system, realloc->gcell_atoms ); realloc->gcell_atoms = -1; } - fprintf( stderr, " [CUDA_REALLOCATE::grid\n" ); /* mpi buffers */ // we have to be at a renbring step - @@ -829,7 +863,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control, Deallocate_MPI_Buffers( mpi_data ); Allocate_MPI_Buffers( mpi_data, system->est_recv, 
system->my_nbrs, msg ); } - fprintf( stderr, " [CUDA_REALLOCATE::MPI buffers\n" ); #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: reallocate done\n", diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu index 67009b7c..a840cbe5 100644 --- a/PG-PuReMD/src/cuda_forces.cu +++ b/PG-PuReMD/src/cuda_forces.cu @@ -342,11 +342,10 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr int step, reax_list **lists, int *thbody ) { int ret; - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; ret = SUCCESS; - cuda_memset( thbody, 0, (*dev_lists + BONDS)->num_intrs * sizeof(int), "scratch::thbody" ); +// cuda_memset( thbody, 0, (*dev_lists + BONDS)->num_intrs * sizeof(int), "scratch::thbody" ); + cuda_memset( thbody, 0, system->total_bonds * sizeof(int), "scratch::thbody" ); Estimate_Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>> ( system->d_my_atoms, (control_params *)control->d_control_params, @@ -354,7 +353,8 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr cudaThreadSynchronize( ); cudaCheckError( ); - Cuda_Reduction_Sum( thbody, system->d_total_thbodies, (*dev_lists + BONDS)->num_intrs ); +// Cuda_Reduction_Sum( thbody, system->d_total_thbodies, (*dev_lists + BONDS)->num_intrs ); + Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds ); copy_host_device( &(system->total_thbodies), system->d_total_thbodies, sizeof(int), cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" ); @@ -362,12 +362,16 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr if ( step == 0 ) { /* create Three-body list */ - Dev_Make_List( (*dev_lists + BONDS)->num_intrs, system->total_thbodies, +// Dev_Make_List( (*dev_lists + BONDS)->num_intrs, system->total_thbodies, +// TYP_THREE_BODY, *dev_lists + THREE_BODIES ); + Dev_Make_List( system->total_bonds, system->total_thbodies, TYP_THREE_BODY, *dev_lists + THREE_BODIES ); } +// if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs || +// (*dev_lists + THREE_BODIES)->n < (*dev_lists + BONDS)->num_intrs ) if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs || - (*dev_lists + THREE_BODIES)->n < (*dev_lists + BONDS)->num_intrs ) + (*dev_lists + THREE_BODIES)->n < system->total_bonds ) { system->total_thbodies = (*dev_lists + THREE_BODIES)->num_intrs * SAFE_ZONE; dev_workspace->realloc.num_3body = system->total_thbodies; @@ -1190,7 +1194,6 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace, ret = FAILURE; } - fprintf( stderr, " [sparse_matrix: %d]\n", ret ); /* 3bodies list: since a more accurate estimate of the num. * of three body interactions requires that bond orders have @@ -1200,6 +1203,39 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace, } +CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n ) +{ + int i; + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i >= n) + { + return; + } + + printf( "%8d: %24.15f, %24.15f, %24.15f\n", + my_atoms[i].orig_id, + f[i][0], + f[i][1], + f[i][2] ); +} + + +void Print_Forces( reax_system *system ) +{ + int blocks; + + blocks = (system->n) / DEF_BLOCK_SIZE + + (((system->n % DEF_BLOCK_SIZE) == 0) ? 
0 : 1); + + k_print_forces <<< blocks, DEF_BLOCK_SIZE >>> + ( system->d_my_atoms, dev_workspace->f, system->n ); + cudaThreadSynchronize( ); + cudaCheckError( ); +} + + CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, reax_list bonds, real *total_bond_order, int N ) { @@ -1345,7 +1381,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, /* validate lists */ ret = Cuda_Validate_Lists( system, workspace, dev_lists, control, data->step, system->numH ); - fprintf( stderr, " [CUDA_VALIDATE_LISTS: %d] STEP %d\n", ret, data->step ); } return ret; @@ -1372,6 +1407,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, real *spad = (real *) scratch; rvec *rvec_spad; + ret = SUCCESS; + if ( compute_bonded_part1 == FALSE ) { /* 1. Bond Order Interactions */ @@ -1414,7 +1451,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaGetLastError( ), t_elapsed ); fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" ); #endif - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:BO]\n" ); /* 2. Bond Energy Interactions */ t_start = Get_Time( ); @@ -1438,7 +1474,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaGetLastError( ), t_elapsed ); fprintf( stderr, "Cuda_Bond_Energy Done... \n" ); #endif - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Bond E]\n" ); /* 3. Atom Energy Interactions */ t_start = Get_Time( ); @@ -1482,7 +1517,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaGetLastError( ), t_elapsed ); fprintf( stderr, "test_LonePair_postprocess Done... \n"); #endif - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Atom E]\n" ); compute_bonded_part1 = TRUE; } @@ -1493,7 +1527,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, thbody = (int *) scratch; ret = Cuda_Estimate_Storage_Three_Body( system, control, data->step, dev_lists, thbody ); - fprintf( stderr, " [three_body: %d]\n", ret ); #if defined(DEBUG) fprintf( stderr, "system->total_thbodies = %d, lists:THREE_BODIES->num_intrs = %d,\n", @@ -1505,7 +1538,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, if ( ret == SUCCESS ) { - Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies ); + Cuda_Init_Three_Body_Indices( thbody, system->total_bonds ); cuda_memset( spad, 0, 6 * sizeof(real) * system->N + sizeof(rvec) * system->N * 2, "scratch" ); Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>> @@ -1514,7 +1547,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, (control_params *)control->d_control_params, *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES), system->n, system->N, system->reax_param.num_atom_types, - spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N)); + spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N)); cudaThreadSynchronize( ); cudaCheckError( ); @@ -1560,7 +1593,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, t_elapsed ); fprintf( stderr, "Three_Body_Interactions Done... \n" ); #endif - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Valence Angles]\n" ); /* 5. Torsion Angles Interactions */ t_start = Get_Time( ); @@ -1614,7 +1646,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, cudaGetLastError( ), t_elapsed ); fprintf( stderr, " Four_Body_ Done... 
\n"); #endif - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Torsion]\n" ); /* 6. Hydrogen Bonds Interactions */ // FIX - 4 - Added additional check here @@ -1627,9 +1658,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1); -// Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>> - Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, - HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>> + Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>> +// Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, +// HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>> ( system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_hbp, system->reax_param.d_gp, (control_params *) control->d_control_params, @@ -1638,7 +1669,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, spad, (rvec *) (spad + 2 * system->n) ); cudaThreadSynchronize( ); cudaCheckError( ); - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Hbonds]\n" ); /* reduction for E_HB */ Cuda_Reduction_Sum( spad, @@ -1666,20 +1696,18 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, *(*dev_lists + BONDS), system->N ); cudaThreadSynchronize( ); cudaCheckError( ); - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Hbonds_PostProcess]\n" ); /* post process step2 */ hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) + (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1); -// Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>> -// ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) ); - Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, - HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>> + Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>> + ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) ); +// Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, +// HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>> ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS), system->N ); cudaThreadSynchronize( ); cudaCheckError( ); - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES:Hbonds_HNbrs]\n" ); t_elapsed = Get_Timing_Info( t_start ); diff --git a/PG-PuReMD/src/cuda_forces.h b/PG-PuReMD/src/cuda_forces.h index a4ae2655..7e635005 100644 --- a/PG-PuReMD/src/cuda_forces.h +++ b/PG-PuReMD/src/cuda_forces.h @@ -35,6 +35,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *, simulation_data *, storage *, reax_list **, output_controls *, mpi_datatypes * ); +void Print_Forces( reax_system * ); + #ifdef __cplusplus } diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda_hydrogen_bonds.cu index 89682a39..95eda081 100644 --- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu +++ b/PG-PuReMD/src/cuda_hydrogen_bonds.cu @@ -657,8 +657,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms, workspace = &p_workspace; hbonds = &p_hbonds; - start = Dev_Start_Index( i, hbonds ); - end = Dev_End_Index( i, hbonds ); + start = Dev_Start_Index( atoms[i].Hindex, hbonds ); + end = Dev_End_Index( atoms[i].Hindex, hbonds ); pj = start + threadIdx.x; #if defined(__SM_35__) rvec_MakeZero( __f ); @@ -682,8 +682,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms, pj += blockDim.x; } + __syncthreads( ); + #if defined(__SM_35__) - for ( int s = 16; s >= 1; s/=2 ) + for ( int s = 
16; s >= 1; s /= 2 ) { __f[0] += shfl( __f[0], s ); __f[1] += shfl( __f[1], s ); @@ -699,22 +701,31 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms, { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 16] ); } + __syncthreads( ); + if ( threadIdx.x < 8 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 8] ); } + __syncthreads( ); + if ( threadIdx.x < 4 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 4] ); } + __syncthreads( ); + if ( threadIdx.x < 2 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 2] ); } + __syncthreads( ); + if ( threadIdx.x < 1 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 1] ); } + __syncthreads( ); if ( threadIdx.x == 0 ) { @@ -757,8 +768,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms, workspace = &( p_workspace ); hbonds = &p_hbonds; i = group_id; - start = Dev_Start_Index( i, hbonds ); - end = Dev_End_Index( i, hbonds ); + start = Dev_Start_Index( atoms[i].Hindex, hbonds ); + end = Dev_End_Index( atoms[i].Hindex, hbonds ); pj = start + lane_id; #if defined(__SM_35__) rvec_MakeZero( __f ); @@ -781,6 +792,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms, pj += __THREADS_PER_ATOM__; } + __syncthreads( ); + #if defined(__SM_35__) for ( s = __THREADS_PER_ATOM__ >> 1; s >= 1; s /= 2 ) { @@ -798,22 +811,31 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms, { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 16] ); } + __syncthreads( ); + if ( lane_id < 8 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 8] ); } + __syncthreads( ); + if ( lane_id < 4 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 4] ); } + __syncthreads( ); + if ( lane_id < 2 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 2] ); } + __syncthreads( ); + if ( lane_id < 1 ) { rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 1] ); } + __syncthreads( ); if ( lane_id == 0 ) { diff --git a/PG-PuReMD/src/cuda_lin_alg.cu b/PG-PuReMD/src/cuda_lin_alg.cu index b7254a39..4f37d577 100644 --- a/PG-PuReMD/src/cuda_lin_alg.cu +++ b/PG-PuReMD/src/cuda_lin_alg.cu @@ -568,7 +568,7 @@ void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n) } -void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size) +void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size ) { int blocks; @@ -590,14 +590,13 @@ void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size) k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, sizeof(rvec2) * MATVEC_BLOCK_SIZE >>> #endif - (*H, a, b, n); - + ( *H, a, b, n ); cudaThreadSynchronize( ); cudaCheckError( ); } -void Cuda_Matvec(sparse_matrix *H, real *a, real *b, int n, int size) +void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size ) { int blocks; diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda_neighbors.cu index 54f6d021..9c4fdee7 100644 --- a/PG-PuReMD/src/cuda_neighbors.cu +++ b/PG-PuReMD/src/cuda_neighbors.cu @@ -76,7 +76,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms, { for ( i = 0; i < 3; i++ ) { - c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); + c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); if ( c[i] >= g.native_end[i] ) { c[i] = g.native_end[i] - 1; @@ -240,7 +240,7 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms, { for ( i = 0; i < 3; i++ ) { - c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]); + c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]); if ( c[i] >= g.native_end[i] ) { c[i] = g.native_end[i] - 1; 
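Note (illustration, not part of the patch): the __syncthreads() calls added in the cuda_hydrogen_bonds.cu hunks above follow the standard rule for shared-memory tree reductions: every halving step must be separated by a block-wide barrier, or one thread can read a partial sum that its partner has not yet written. That missing barrier is the race condition this commit fixes. Below is a minimal, self-contained CUDA sketch of the same barrier discipline; the kernel name k_block_sum and the float element type are simplified stand-ins for the rvec force reductions used in PG-PuReMD.

/* generic block-level sum reduction; assumes blockDim.x is a power of two */
__global__ void k_block_sum( const float *in, float *out, int n )
{
    extern __shared__ float s[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    s[threadIdx.x] = (i < n) ? in[i] : 0.0f;
    /* barrier: all loads must complete before any thread starts summing */
    __syncthreads( );

    for ( int stride = blockDim.x / 2; stride >= 1; stride /= 2 )
    {
        if ( threadIdx.x < stride )
        {
            s[threadIdx.x] += s[threadIdx.x + stride];
        }
        /* barrier between halving steps: omitting this is the race
         * the hunks above repair by inserting __syncthreads( ) */
        __syncthreads( );
    }

    if ( threadIdx.x == 0 )
    {
        out[blockIdx.x] = s[0];
    }
}

/* launch with dynamic shared memory sized to the block, e.g.:
 * k_block_sum <<< blocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(float) >>> ( d_in, d_out, n ); */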
@@ -370,13 +370,14 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms, //got the indices nbr_data = my_start + nbrssofar[my_bucket] + tnbr[threadIdx.x] - 1; nbr_data->nbr = m; + if ( l < m ) { dvec[0] = atom2->x[0] - atom1->x[0]; dvec[1] = atom2->x[1] - atom1->x[1]; dvec[2] = atom2->x[2] - atom1->x[2]; d = rvec_Norm_Sqr( dvec ); - nbr_data->d = SQRT(d); + nbr_data->d = SQRT( d ); rvec_Copy( nbr_data->dvec, dvec ); ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], -1, g.rel_box[index_grid_3d(i, j, k, &g)] ); @@ -387,7 +388,7 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms, dvec[1] = atom1->x[1] - atom2->x[1]; dvec[2] = atom1->x[2] - atom2->x[2]; d = rvec_Norm_Sqr( dvec ); - nbr_data->d = SQRT(d); + nbr_data->d = SQRT( d ); rvec_Copy( nbr_data->dvec, dvec ); /* CHANGE ORIGINAL @@ -437,25 +438,25 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, #endif /* one thread per atom implementation */ -// blocks = (system->N / NBRS_BLOCK_SIZE) + -// ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); -// k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>> -// ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, -// *(*dev_lists + FAR_NBRS), system->n, system->N ); -// cudaThreadSynchronize( ); -// cudaCheckError( ); - - /* multiple threads per atom implementation */ - blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + - (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); - k_mt_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE, - //sizeof(int) * (NBRS_BLOCK_SIZE + NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM) >>> - sizeof(int) * 2 * NBRS_BLOCK_SIZE >>> - ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, - *(*dev_lists + FAR_NBRS), system->n, system->N ); + blocks = (system->N / NBRS_BLOCK_SIZE) + + ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); + k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>> + ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, + *(*dev_lists + FAR_NBRS), system->n, system->N ); cudaThreadSynchronize( ); cudaCheckError( ); + /* multiple threads per atom implementation */ +// blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + +// (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 
0 : 1); +// k_mt_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE, +// //sizeof(int) * (NBRS_BLOCK_SIZE + NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM) >>> +// sizeof(int) * 2 * NBRS_BLOCK_SIZE >>> +// ( system->d_my_atoms, system->my_ext_box, system->d_my_grid, +// *(*dev_lists + FAR_NBRS), system->n, system->N ); +// cudaThreadSynchronize( ); +// cudaCheckError( ); + #if defined(LOG_PERFORMANCE) if( system->my_rank == MASTER_NODE ) { diff --git a/PG-PuReMD/src/cuda_reduction.cu b/PG-PuReMD/src/cuda_reduction.cu index a9661449..02d800ee 100644 --- a/PG-PuReMD/src/cuda_reduction.cu +++ b/PG-PuReMD/src/cuda_reduction.cu @@ -41,7 +41,7 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n ) /* allocate temporary storage */ cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE, - "cub::sum::temp_storage" ); + "Cuda_Reduction_Sum::temp_storage" ); /* run sum-reduction */ cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes, @@ -50,7 +50,7 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n ) cudaCheckError( ); /* deallocate temporary storage */ - cuda_free( d_temp_storage, "cub::sum::temp_storage" ); + cuda_free( d_temp_storage, "Cuda_Reduction_Sum::temp_storage" ); } @@ -71,7 +71,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n ) /* allocate temporary storage */ cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE, - "cub::sum::temp_storage" ); + "Cuda_Reduction_Sum::temp_storage" ); /* run sum-reduction */ cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes, @@ -80,7 +80,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n ) cudaCheckError( ); /* deallocate temporary storage */ - cuda_free( d_temp_storage, "cub::sum::temp_storage" ); + cuda_free( d_temp_storage, "Cuda_Reduction_Sum::temp_storage" ); } @@ -133,7 +133,7 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n ) /* allocate temporary storage */ cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE, - "cub::max::temp_storage" ); + "Cuda_Reduction_Max::temp_storage" ); /* run exclusive prefix sum */ cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes, @@ -142,7 +142,7 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n ) cudaCheckError( ); /* deallocate temporary storage */ - cuda_free( d_temp_storage, "cub::max::temp_storage" ); + cuda_free( d_temp_storage, "Cuda_Reduction_Max::temp_storage" ); } @@ -163,7 +163,7 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n ) /* allocate temporary storage */ cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE, - "cub::devicescan::temp_storage" ); + "Cuda_Scan_Excl_Sum::temp_storage" ); /* run exclusive prefix sum */ cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, @@ -172,7 +172,7 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n ) cudaCheckError( ); /* deallocate temporary storage */ - cuda_free( d_temp_storage, "cub::devicescan::temp_storage" ); + cuda_free( d_temp_storage, "Cuda_Scan_Excl_Sum::temp_storage" ); } diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda_valence_angles.cu index c0936fc0..d778c3b2 100644 --- a/PG-PuReMD/src/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda_valence_angles.cu @@ -152,45 +152,40 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, i = pbond_ij->nbr; r_ij = pbond_ij->d; type_i = my_atoms[i].type; - // fprintf( out_control->eval, "i: %d\n", i ); /* first copy 3-body intrs from previously computed ones where i>k. 
in the second for-loop below, we compute only new 3-body intrs where i < k */ - /* + // The copy loop commented out because strange asynchronous issues started to surface // Each kernel now manually generates everything - - for( pk = start_j; pk < pi; ++pk ) { - - //printf("%d,%d \n", j, pk ); - - // fprintf( out_control->eval, "pk: %d\n", pk ); - start_pk = Dev_Start_Index( pk, thb_intrs ); - end_pk = Dev_End_Index( pk, thb_intrs ); - - for( t = start_pk; t < end_pk; ++t ) - if( thb_intrs->select.three_body_list[t].thb == i ) { - p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] ); - p_kji = &(thb_intrs->select.three_body_list[t]); - - p_ijk->thb = bonds->select.bond_list[pk].nbr; - p_ijk->pthb = pk; - p_ijk->theta = p_kji->theta; - rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); - rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); - rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); - - ++num_thb_intrs; - printf("\n"); - break; - } - } - */ +// for( pk = start_j; pk < pi; ++pk ) +// { +// start_pk = Dev_Start_Index( pk, thb_intrs ); +// end_pk = Dev_End_Index( pk, thb_intrs ); +// +// for( t = start_pk; t < end_pk; ++t ) +// { +// if( thb_intrs->select.three_body_list[t].thb == i ) +// { +// p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] ); +// p_kji = &(thb_intrs->select.three_body_list[t]); +// +// p_ijk->thb = bonds->select.bond_list[pk].nbr; +// p_ijk->pthb = pk; +// p_ijk->theta = p_kji->theta; +// rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); +// rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj ); +// rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di ); +// +// ++num_thb_intrs; +// break; +// } +// } +// } /* and this is the second for loop mentioned above */ //for( pk = pi+1; pk < end_j; ++pk ) { - // Except that now the loop goes all the way from start_j to end_j for( pk = start_j; pk < end_j; ++pk ) { @@ -239,26 +234,10 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, bo_ij->BO * bo_jk->BO > SQR(control->thb_cut) ) { r_jk = pbond_jk->d; - thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] ); - - /* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id ) - fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - system->my_atoms[i].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[k].orig_id, - bo_ij->BO, bo_jk->BO, p_ijk->theta ); - else - fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", - system->my_atoms[k].orig_id, - system->my_atoms[j].orig_id, - system->my_atoms[i].orig_id, - bo_jk->BO, bo_ij->BO, p_ijk->theta ); */ + thbh = &( d_thbh[ index_thbp(type_i, type_j, type_k, num_atom_types) ] ); for ( cnt = 0; cnt < thbh->cnt; ++cnt ) { - // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n", - // i+1, j+1, k+1 ); - if ( FABS(thbh->prm[cnt].p_val1) > 0.001 ) { thbp = &( thbh->prm[cnt] ); @@ -294,7 +273,8 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, { expval12theta = p_val1 * (1.0 - expval2theta); } - else // To avoid linear Me-H-Me angles (6/6/06) + // To avoid linear Me-H-Me angles (6/6/06) + else { expval12theta = p_val1 * -expval2theta; } @@ -315,8 +295,8 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, if ( pk < pi ) { - data_e_ang [j] += e_ang = - f7_ij * f7_jk * f8_Dj * expval12theta; + e_ang = f7_ij * f7_jk * f8_Dj * expval12theta; + data_e_ang[j] += e_ang; } /* END ANGLE ENERGY*/ @@ -362,14 +342,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, /* Similar to above comment regarding if statement */ if ( pk < pi ) { - e_coa = p_coa1 / (1. 
+ exp_coa2) * EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) * EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) * EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * EXP( -p_coa4 * SQR(BOA_jk - 1.5) ); - data_e_coa [j] += e_coa; + data_e_coa[j] += e_coa; } CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa; @@ -385,20 +364,15 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, // we must again check for pk<pi for entire forces part if ( pk < pi ) { - /* - bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); - bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); - workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); - workspace->CdDelta[i] += CEcoa4; - workspace->CdDelta[k] += CEcoa5; - */ bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4)); bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5)); workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3); +// workspace->CdDelta[i] += CEcoa4; +// workspace->CdDelta[k] += CEcoa5; pbond_ij->va_CdDelta += CEcoa4; pbond_jk->va_CdDelta += CEcoa5; - for( t = start_j; t < end_j; ++t ) + for ( t = start_j; t < end_j; ++t ) { pbond_jt = &( bonds->select.bond_list[t] ); bo_jt = &(pbond_jt->bo_data); @@ -406,25 +380,17 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, temp = CUBE( temp_bo_jt ); pBOjt7 = temp * temp * temp_bo_jt; - // fprintf( out_control->eval, "%6d%12.8f\n", - // workspace->reverse_map[bonds->select.bond_list[t].nbr], - // (CEval6 * pBOjt7) ); - bo_jt->Cdbo += (CEval6 * pBOjt7); bo_jt->Cdbopi += CEval5; bo_jt->Cdbopi2 += CEval5; } - if( control->virial == 0 ) + if ( control->virial == 0 ) { - /* - rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di ); - rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); - rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk ); - */ - +// rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di ); rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di ); rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); +// rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk ); rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk ); } else @@ -432,19 +398,19 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, /* terms not related to bond order derivatives are added directly into forces and pressure vector/tensor */ rvec_Scale( force, CEval8, p_ijk->dcos_di ); - //rvec_Add( workspace->f[i], force ); +// rvec_Add( workspace->f[i], force ); rvec_Add( pbond_ij->va_f, force ); rvec_iMultiply( ext_press, pbond_ij->rel_box, force ); - //rvec_Add( data->my_ext_press, ext_press ); - rvec_Add( my_ext_press [j], ext_press ); +// rvec_Add( data->my_ext_press, ext_press ); + rvec_Add( my_ext_press[j], ext_press ); rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj ); rvec_Scale( force, CEval8, p_ijk->dcos_dk ); - //rvec_Add( workspace->f[k], force ); +// rvec_Add( workspace->f[k], force ); rvec_Add( pbond_jk->va_f, force ); rvec_iMultiply( ext_press, pbond_jk->rel_box, force ); - rvec_Add( my_ext_press [j], ext_press ); + rvec_Add( my_ext_press[j], ext_press ); } } @@ -597,7 +563,7 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, { int i, j, k, pi, pk; int start_j, end_j; - int cnt, num_thb_intrs; + int num_thb_intrs; real BOA_ij, BOA_jk; bond_data *pbond_ij, *pbond_jk; bond_order_data *bo_ij, *bo_jk; @@ -629,7 +595,7 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, { i = pbond_ij->nbr; - for( pk = start_j; pk < end_j; ++pk ) + for ( pk = start_j; pk < end_j; ++pk ) { if ( pk == pi ) { diff --git 
a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c index ae60867e..2426e824 100644 --- a/PG-PuReMD/src/forces.c +++ b/PG-PuReMD/src/forces.c @@ -87,6 +87,7 @@ void Init_Force_Functions( control_params *control ) Interaction_Functions[0] = BO; Interaction_Functions[1] = Bonds; //Dummy_Interaction; Interaction_Functions[2] = Atom_Energy; //Dummy_Interaction; + Interaction_Functions[2] = Atom_Energy; //Dummy_Interaction; Interaction_Functions[3] = Valence_Angles; //Dummy_Interaction; Interaction_Functions[4] = Torsion_Angles; //Dummy_Interaction; if ( control->hbond_cut > 0.0 ) @@ -1882,7 +1883,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, if ( charge_flag == TRUE ) { retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control ); - fprintf( stderr, " [CUDA_INIT_FORCES: %d] STEP %d\n", retVal, data->step ); // int i; // static reax_list **temp_lists; @@ -1936,7 +1936,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, /********* bonded interactions ************/ retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control ); - fprintf( stderr, " [CUDA_COMPUTE_BONDED_FORCES: %d] STEP %d\n", retVal, data->step ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( MPI_COMM_WORLD ); @@ -1960,7 +1959,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, if ( charge_flag == TRUE ) { Cuda_QEq( system, control, data, workspace, out_control, mpi_data ); - fprintf( stderr, " [CUDA_QEQ: %d] STEP %d\n", retVal, data->step ); } #if defined(LOG_PERFORMANCE) @@ -1980,7 +1978,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, /********* nonbonded interactions ************/ Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control, mpi_data ); - fprintf( stderr, " [CUDA_COMPUTE_NONBONDED_FORCES: %d] STEP %d\n", retVal, data->step ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( MPI_COMM_WORLD ); @@ -1997,7 +1994,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, /*********** total force ***************/ Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data ); - fprintf( stderr, " [CUDA_COMPUTE_TOTAL_FORCE: %d] STEP %d\n", retVal, data->step ); #if defined(LOG_PERFORMANCE) //MPI_Barrier( MPI_COMM_WORLD ); @@ -2011,7 +2007,10 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control, system->my_rank, data->step ); //Print_Total_Force( system, data, workspace ); MPI_Barrier( MPI_COMM_WORLD ); + #endif + +// Print_Forces( system ); } return retVal; diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c index 2d09ec4c..faf15dd3 100644 --- a/PG-PuReMD/src/init_md.c +++ b/PG-PuReMD/src/init_md.c @@ -107,11 +107,12 @@ void Generate_Initial_Velocities( reax_system *system, real T ) int i; real m, scale, norm; - if ( T <= 0.1 ) { for ( i = 0; i < system->n; i++ ) + { rvec_MakeZero( system->my_atoms[i].v ); + } } else { @@ -248,7 +249,6 @@ int Cuda_Init_System( reax_system *system, control_params *control, int nrecv[MAX_NBRS]; Setup_New_Grid( system, control, MPI_COMM_WORLD ); - fprintf( stderr, " [SETUP NEW GRID]\n" ); #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d GRID:\n", system->my_rank ); @@ -256,9 +256,7 @@ int Cuda_Init_System( reax_system *system, control_params *control, #endif Bin_My_Atoms( system, &(workspace->realloc) ); - fprintf( stderr, " [BIN MY ATOMS]\n" ); Reorder_My_Atoms( system, workspace ); - fprintf( stderr, " [REORDER MY ATOMS]\n" ); /* estimate N and total 
capacity */ for ( i = 0; i < MAX_NBRS; ++i ) @@ -270,16 +268,12 @@ int Cuda_Init_System( reax_system *system, control_params *control, system->max_recved = 0; system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv, Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE ); - fprintf( stderr, " [SEND_RECV:ESTIMATE_BOUNDARY_ATOMS]\n" ); system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP ); Bin_Boundary_Atoms( system ); - fprintf( stderr, " [BIN_BOUNDARY_ATOMS]\n" ); /* Sync atoms here to continue the computation */ dev_alloc_system( system ); - fprintf( stderr, " [DEV ALLOC SYSTEM]\n" ); Sync_System( system ); - fprintf( stderr, " [SYNC SYSTEM]\n" ); /* estimate numH and Hcap */ Cuda_Reset_Atoms( system, control ); @@ -294,10 +288,8 @@ int Cuda_Init_System( reax_system *system, control_params *control, #endif Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D ); - fprintf( stderr, " [CUDA COMPUTE TOTAL MASS]\n" ); Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D ); - fprintf( stderr, " [CUDA COMPUTE CENTER OF MASS]\n" ); // if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE ) // { @@ -308,11 +300,9 @@ int Cuda_Init_System( reax_system *system, control_params *control, if ( !control->restart || (control->restart && control->random_vel) ) { Generate_Initial_Velocities( system, control->T_init ); - fprintf( stderr, " [GENERATE INITIAL VELOCITIES]\n" ); } Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); - fprintf( stderr, " [CUDA COMPUTE K.E.]\n" ); return SUCCESS; } @@ -1138,7 +1128,6 @@ void Cuda_Initialize( reax_system *system, control_params *control, system->my_rank ); MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); } - fprintf( stderr, " [INIT MPI DATATYPES]\n" ); /* SYSTEM */ if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE ) @@ -1148,22 +1137,18 @@ void Cuda_Initialize( reax_system *system, control_params *control, system->my_rank ); MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); } - fprintf( stderr, " [CUDA INIT SYSTEM]\n" ); /* GRID */ dev_alloc_grid( system ); Sync_Grid( &system->my_grid, &system->d_my_grid ); - fprintf( stderr, " [DEV ALLOC GRID]\n" ); //validate_grid( system ); /* SIMULATION_DATA */ Cuda_Init_Simulation_Data( system, control, data, msg ); - fprintf( stderr, " [CUDA INIT SIMULATION DATA]\n" ); /* WORKSPACE */ Cuda_Init_Workspace( system, control, workspace, msg ); - fprintf( stderr, " [CUDA INIT WORKSPACE]\n" ); #if defined(DEBUG) fprintf( stderr, "p%d: initialized workspace\n", system->my_rank ); @@ -1173,7 +1158,6 @@ void Cuda_Initialize( reax_system *system, control_params *control, /* CONTROL */ dev_alloc_control( control ); - fprintf( stderr, " [DEV ALLOC CONTROL]\n" ); /* LISTS */ if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) == @@ -1184,7 +1168,6 @@ void Cuda_Initialize( reax_system *system, control_params *control, system->my_rank ); MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); } - fprintf( stderr, " [CUDA INIT LISTS]\n" ); #if defined(DEBUG) fprintf( stderr, "p%d: initialized lists\n", system->my_rank ); @@ -1198,7 +1181,6 @@ void Cuda_Initialize( reax_system *system, control_params *control, system->my_rank ); MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); } - fprintf( stderr, " [INIT OUTPUT FILES]\n" ); #if defined(DEBUG) fprintf( stderr, "p%d: output files opened\n", system->my_rank ); @@ -1214,7 +1196,6 @@ void Cuda_Initialize( reax_system *system, control_params *control, 
system->my_rank ); MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE ); } - fprintf( stderr, " [INIT LOOKUP TABLES]\n" ); #if defined(DEBUG) fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank ); diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c index bfbbe1f7..88b406b5 100644 --- a/PG-PuReMD/src/integrate.c +++ b/PG-PuReMD/src/integrate.c @@ -366,14 +366,12 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con ret = SUCCESS; Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data ); - fprintf( stderr, " [CUDA_REALLOCATE] STEP %d\n", data->step ); if ( verlet_part1_done == FALSE ) { /* velocity verlet, 1st part */ bNVT_update_velocity_part1( system, dt ); verlet_part1_done = TRUE; - fprintf( stderr, " [bNVT_UPDATE_VEL_PART1] STEP %d\n", data->step ); #if defined(DEBUG_FOCUS) fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step ); @@ -383,19 +381,14 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con if ( renbr ) { Update_Grid( system, control, mpi_data->world ); - fprintf( stderr, " [UPDATE_GRID] STEP %d\n", data->step ); } Output_Sync_Atoms( system ); - fprintf( stderr, " [OUTPUT_SYNC_ATOMS] STEP %d\n", data->step ); Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr ); - fprintf( stderr, " [COMM_ATOMS] STEP %d\n", data->step ); Sync_Atoms( system ); - fprintf( stderr, " [SYNC_ATOMS] STEP %d\n", data->step ); /* synch the Grid to the Device here */ Sync_Grid( &system->my_grid, &system->d_my_grid ); - fprintf( stderr, " [SYNC_GRID] STEP %d\n", data->step ); init_blocks( system ); @@ -406,7 +399,6 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con } Cuda_Reset( system, control, data, workspace, lists ); - fprintf( stderr, " [CUDA_RESET] STEP %d\n", data->step ); if ( renbr ) { @@ -419,7 +411,6 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con //TODO: move far_nbrs reallocation checks outside of renbr frequency check ret = Cuda_Estimate_Neighbors( system, data->step ); estimate_nbrs_done = 1; - fprintf( stderr, " [CUDA_ESTIMATE_NEIGHBORS: %d] STEP %d\n", ret, data->step ); } if ( ret == SUCCESS && estimate_nbrs_done == 1 ) @@ -439,14 +430,12 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con { ret = Cuda_Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data ); - fprintf( stderr, " [CUDA_COMPUTE_FORCES: %d] STEP %d\n", ret, data->step ); } if ( ret == SUCCESS ) { /* velocity verlet, 2nd part */ bNVT_update_velocity_part2( system, dt ); - fprintf( stderr, " [bNVT_UPDATE_VEL_PART2] STEP %d\n", data->step ); #if defined(DEBUG_FOCUS) fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step); @@ -455,7 +444,6 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con /* temperature scaler */ Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D ); - fprintf( stderr, " [CUDA_COMPUTE_KINETIC_ENERGY] STEP %d\n", data->step ); lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0); if ( lambda < MIN_dT ) diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c index f3a341db..9c028d42 100644 --- a/PG-PuReMD/src/io_tools.c +++ b/PG-PuReMD/src/io_tools.c @@ -1190,7 +1190,7 @@ void Output_Results( reax_system *system, control_params *control, out_control->energy_update_freq > 0 && data->step % out_control->energy_update_freq == 0 ) { -#if !defined(DEBUG) && 
!defined(DEBUG_FOCUS) +#if defined(DEBUG) && defined(DEBUG_FOCUS) fprintf( out_control->out, "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n", data->step, data->sys_en.e_tot, data->sys_en.e_pot, diff --git a/PG-PuReMD/src/io_tools.h b/PG-PuReMD/src/io_tools.h index 789991c3..6ae2d6d8 100644 --- a/PG-PuReMD/src/io_tools.h +++ b/PG-PuReMD/src/io_tools.h @@ -24,6 +24,7 @@ #include "reax_types.h" + int Init_Output_Files( reax_system*, control_params*, output_controls*, mpi_datatypes*, char* ); int Close_Output_Files( reax_system*, control_params*, @@ -106,4 +107,6 @@ void Print_Bond_List( reax_system*, control_params*, simulation_data*, reax_list**, output_controls*); #endif + + #endif diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c index a3071712..4d677687 100644 --- a/PG-PuReMD/src/parallelreax.c +++ b/PG-PuReMD/src/parallelreax.c @@ -326,6 +326,7 @@ int main( int argc, char* argv[] ) #if defined(__CUDA_DEBUG__) Reset( system, control, data, workspace, lists ); #endif + #if defined(DEBUG) fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank ); #endif @@ -400,13 +401,11 @@ int main( int argc, char* argv[] ) retries = 0; while ( data->step <= control->nsteps && retries < MAX_RETRIES ) { - fprintf( stderr, "[BEGIN] STEP %d\n", data->step ); ret = SUCCESS; if ( control->T_mode && retries == 0 ) { Temperature_Control( control, data ); - fprintf( stderr, " [TEMPERATURE CONTROL] STEP %d\n", data->step ); } #if defined(DEBUG) @@ -418,7 +417,6 @@ int main( int argc, char* argv[] ) #endif ret = Cuda_Evolve( system, control, data, workspace, lists, out_control, mpi_data ); - fprintf( stderr, "[EVOLVE] STEP %d\n", data->step ); #if defined(DEBUG) t_end = Get_Timing_Info( t_begin ); @@ -434,7 +432,6 @@ int main( int argc, char* argv[] ) ret = Cuda_Post_Evolve( system, control, data, workspace, lists, out_control, mpi_data ); } - fprintf( stderr, "[POST EVOLVE] STEP %d\n", data->step ); #if defined(__CUDA_DEBUG__) Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data); @@ -484,13 +481,14 @@ int main( int argc, char* argv[] ) else { ++retries; - fprintf( stderr, "INFO: retrying step %d...\n", data->step ); + fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step ); } } if ( retries >= MAX_RETRIES ) { - fprintf( stderr, "Maximum retries reached for this step. Terminating...\n" ); + fprintf( stderr, "Maximum retries reached for this step (%d). Terminating...\n", + retries ); MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED ); } @@ -635,7 +633,7 @@ int main( int argc, char* argv[] ) else { ++retries; - fprintf( stderr, "INFO: retrying step %d...\n", data->step ); + fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step ); } } -- GitLab
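Note (illustration, not part of the patch): the renamed allocation tags in the cuda_reduction.cu hunks all wrap the same two-phase CUB idiom, which is worth seeing in isolation: the reduction is called once with a NULL temporary buffer, which only reports the scratch size needed, and then called again with real scratch space to perform the work. The sketch below assumes only CUB and the CUDA runtime; the function name sum_on_device is hypothetical, and the real code uses the cuda_malloc/cuda_free wrappers with the message tags shown in the patch.

#include <cub/cub.cuh>

void sum_on_device( int *d_in, int *d_out, int n )
{
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    /* phase 1: d_temp_storage == NULL, so this call only sets temp_storage_bytes */
    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, n );

    cudaMalloc( &d_temp_storage, temp_storage_bytes );

    /* phase 2: the same call with real scratch space runs the reduction */
    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes, d_in, d_out, n );

    cudaFree( d_temp_storage );
}

The configure.ac hunks at the top of the patch address the other headline fix the same way the subject line describes: once AX_COMPILER_VENDOR identifies the Intel toolchain, nvcc is passed -ccbin=icpc so that device code is compiled against the same C++ host compiler that configure selected.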