From 1b639864991825373af45975b7c1d155850a031f Mon Sep 17 00:00:00 2001 From: "Kurt A. O'Hearn" <ohearnku@msu.edu> Date: Wed, 28 Apr 2021 12:19:35 -0400 Subject: [PATCH] PG-PuReMD: refactor CUDA memory management routines. --- PG-PuReMD/src/comm_tools.c | 17 +-- PG-PuReMD/src/cuda/cuda_allocate.cu | 96 +++++++-------- PG-PuReMD/src/cuda/cuda_box.cu | 14 +-- PG-PuReMD/src/cuda/cuda_charges.cu | 36 +++--- PG-PuReMD/src/cuda/cuda_copy.cu | 136 +++++++++++----------- PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu | 12 +- PG-PuReMD/src/cuda/cuda_forces.cu | 40 +++---- PG-PuReMD/src/cuda/cuda_integrate.cu | 7 +- PG-PuReMD/src/cuda/cuda_lookup.cu | 98 +++++++++------- PG-PuReMD/src/cuda/cuda_neighbors.cu | 8 +- PG-PuReMD/src/cuda/cuda_reset_tools.cu | 4 +- PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu | 32 ++--- PG-PuReMD/src/cuda/cuda_system_props.cu | 30 +++-- PG-PuReMD/src/cuda/cuda_utils.cu | 53 ++++----- PG-PuReMD/src/cuda/cuda_utils.h | 5 +- PG-PuReMD/src/cuda/cuda_valence_angles.cu | 4 +- 16 files changed, 292 insertions(+), 300 deletions(-) diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c index 44572ba2..4e7b1628 100644 --- a/PG-PuReMD/src/comm_tools.c +++ b/PG-PuReMD/src/comm_tools.c @@ -174,26 +174,21 @@ void Estimate_NT_Atoms( reax_system * const system, mpi_datatypes * const mpi_da /* Note: filename must be NULL-terminated before calling this function */ void Check_MPI_Error( int code, const char * const filename, int line ) { - int len; + int len, rank; char err_msg[MPI_MAX_ERROR_STRING]; -#if defined(DEBUG_FOCUS) - int rank; - - MPI_Comm_rank( MPI_COMM_WORLD, &rank ); - fprintf( stderr, "[INFO] Check_MPI_Error: p%d, file %.*s, line %d\n", rank, (int) strlen(filename), filename, line ); - fflush( stderr ); -#endif if ( code != MPI_SUCCESS ) { + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Error_string( code, err_msg, &len ); fprintf( stderr, "[ERROR] MPI error\n" ); /* strlen safe here only if filename is NULL-terminated before calling Check_MPI_Error */ - fprintf( stderr, " [INFO] At line %d in file %.*s\n", - line, (int) strlen(filename), filename ); - fprintf( stderr, " [INFO] Error code %d\n", code ); + fprintf( stderr, " [INFO] At line %d in file %.*s on MPI processor %d\n", + line, (int) strlen(filename), filename, rank ); + fprintf( stderr, " [INFO] Error code: %d\n", code ); fprintf( stderr, " [INFO] Error message: %.*s\n", len, err_msg ); + MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); } } diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu index 083e107e..6cecfcb6 100644 --- a/PG-PuReMD/src/cuda/cuda_allocate.cu +++ b/PG-PuReMD/src/cuda/cuda_allocate.cu @@ -47,21 +47,21 @@ static void Cuda_Reallocate_System_Part1( reax_system *system, storage *workspac "Cuda_Reallocate_System_Part1::workspace->scratch" ); temp = (int *) workspace->scratch; - copy_device( temp, system->d_cm_entries, sizeof(int) * local_cap_old, - "Cuda_Reallocate_System_Part1::temp" ); + sCudaMemcpy( temp, system->d_cm_entries, sizeof(int) * local_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_cm_entries, "Cuda_Reallocate_System_Part1::d_cm_entries" ); cuda_malloc( (void **) &system->d_cm_entries, sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_cm_entries" ); - copy_device( system->d_cm_entries, temp, sizeof(int) * local_cap_old, - "Cuda_Reallocate_System_Part1::temp" ); + sCudaMemcpy( system->d_cm_entries, temp, sizeof(int) * local_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); - copy_device( temp, 
system->d_max_cm_entries, sizeof(int) * local_cap_old, - "Cuda_Reallocate_System_Part1::temp" ); + sCudaMemcpy( temp, system->d_max_cm_entries, sizeof(int) * local_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_max_cm_entries, "Cuda_Reallocate_System_Part1::d_max_cm_entries" ); cuda_malloc( (void **) &system->d_max_cm_entries, sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_max_cm_entries" ); - copy_device( system->d_max_cm_entries, temp, sizeof(int) * local_cap_old, - "Cuda_Reallocate_System_Part1::temp" ); + sCudaMemcpy( system->d_max_cm_entries, temp, sizeof(int) * local_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); } @@ -78,69 +78,69 @@ static void Cuda_Reallocate_System_Part2( reax_system *system, storage *workspac temp_atom = (reax_atom *) workspace->scratch; /* free the existing storage for atoms, leave other info allocated */ - copy_device( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp_atom" ); + sCudaMemcpy( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_my_atoms, "system::d_my_atoms" ); cuda_malloc( (void **) &system->d_my_atoms, sizeof(reax_atom) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_my_atoms" ); - copy_device( system->d_my_atoms, temp_atom, sizeof(reax_atom) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp_atom" ); + sCudaMemcpy( system->d_my_atoms, temp_atom, sizeof(reax_atom) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); /* list management */ - copy_device( temp, system->d_far_nbrs, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( temp, system->d_far_nbrs, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_far_nbrs, "Cuda_Reallocate_System_Part2::d_far_nbrs" ); cuda_malloc( (void **) &system->d_far_nbrs, sizeof(int) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_far_nbrs" ); - copy_device( system->d_far_nbrs, temp, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( system->d_far_nbrs, temp, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); - copy_device( temp, system->d_max_far_nbrs, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( temp, system->d_max_far_nbrs, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_max_far_nbrs, "Cuda_Reallocate_System_Part2::d_max_far_nbrs" ); cuda_malloc( (void **) &system->d_max_far_nbrs, sizeof(int) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_max_far_nbrs" ); - copy_device( system->d_max_far_nbrs, temp, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( system->d_max_far_nbrs, temp, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); - copy_device( temp, system->d_bonds, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( temp, system->d_bonds, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_bonds, "Cuda_Reallocate_System_Part2::d_bonds" ); cuda_malloc( (void **) &system->d_bonds, sizeof(int) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_bonds" ); - copy_device( system->d_bonds, temp, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + 
sCudaMemcpy( system->d_bonds, temp, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); - copy_device( temp, system->d_max_bonds, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( temp, system->d_max_bonds, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_max_bonds, "Cuda_Reallocate_System_Part2::d_max_bonds" ); cuda_malloc( (void **) &system->d_max_bonds, sizeof(int) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_max_bonds" ); - copy_device( system->d_max_bonds, temp, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( system->d_max_bonds, temp, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); - copy_device( temp, system->d_hbonds, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( temp, system->d_hbonds, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_hbonds, "system::d_hbonds" ); cuda_malloc( (void **) &system->d_hbonds, sizeof(int) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_hbonds" ); - copy_device( system->d_hbonds, temp, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( system->d_hbonds, temp, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); - copy_device( temp, system->d_max_hbonds, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( temp, system->d_max_hbonds, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); cuda_free( system->d_max_hbonds, "system::d_max_hbonds" ); cuda_malloc( (void **) &system->d_max_hbonds, sizeof(int) * system->total_cap, TRUE, "Cuda_Reallocate_System_Part2::d_max_hbonds" ); - copy_device( system->d_max_hbonds, temp, sizeof(int) * total_cap_old, - "Cuda_Reallocate_System_Part2::temp" ); + sCudaMemcpy( system->d_max_hbonds, temp, sizeof(int) * total_cap_old, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); } @@ -148,8 +148,8 @@ void Cuda_Allocate_Control( control_params *control ) { cuda_malloc( (void **)&control->d_control_params, sizeof(control_params), TRUE, "control_params" ); - copy_host_device( control, control->d_control_params, - sizeof(control_params), cudaMemcpyHostToDevice, "control_params" ); + sCudaMemcpy( control->d_control_params, control, + sizeof(control_params), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); } @@ -219,7 +219,8 @@ void Cuda_Allocate_Grid( reax_system *system ) // // cuda_malloc( (void **) &local_cell.nbrs_x, sizeof(ivec) * host->max_nbrs, // TRUE, "alloc:grid:cells:nbrs_x" ); -// copy_device( local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof(ivec), "grid:nbrs_x" ); +// sCudaMemcpy( local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof(ivec), +// cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); // //fprintf( stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x ); // // cuda_malloc( (void **) &local_cell.nbrs_cp, sizeof(rvec) * host->max_nbrs, @@ -230,8 +231,8 @@ void Cuda_Allocate_Grid( reax_system *system ) // // TRUE, "alloc:grid:cells:nbrs" ); // //fprintf( stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs ); // -// copy_host_device( &local_cell, &device->cells[i], sizeof(grid_cell), -// cudaMemcpyHostToDevice, "grid:cell-alloc" ); +// sCudaMemcpy( &device->cells[i], &local_cell, sizeof(grid_cell), +// cudaMemcpyHostToDevice, __FILE__, __LINE__ ); // } } @@ -248,9 +249,8 @@ 
void Cuda_Deallocate_Grid_Cell_Atoms( reax_system *system ) for ( i = 0; i < total; ++i ) { - copy_host_device( &local_cell, &device->cells[i], - sizeof(grid_cell), cudaMemcpyDeviceToHost, - "Cuda_Deallocate_Grid_Cell_Atoms::grid" ); + sCudaMemcpy( &local_cell, &device->cells[i], + sizeof(grid_cell), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); cuda_free( local_cell.atoms, "Cuda_Deallocate_Grid_Cell_Atoms::grid_cell.atoms" ); @@ -270,12 +270,12 @@ void Cuda_Allocate_Grid_Cell_Atoms( reax_system *system, int cap ) for ( i = 0; i < total; i++ ) { - copy_host_device( &local_cell, &device->cells[i], - sizeof(grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc" ); + sCudaMemcpy( &local_cell, &device->cells[i], + sizeof(grid_cell), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); cuda_malloc( (void **)&local_cell.atoms, sizeof(int) * cap, TRUE, "realloc:grid:cells:atoms" ); - copy_host_device( &local_cell, &device->cells[i], - sizeof(grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc" ); + sCudaMemcpy( &device->cells[i], &local_cell, + sizeof(grid_cell), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); } } diff --git a/PG-PuReMD/src/cuda/cuda_box.cu b/PG-PuReMD/src/cuda/cuda_box.cu index e13a1a01..fb8b62ab 100644 --- a/PG-PuReMD/src/cuda/cuda_box.cu +++ b/PG-PuReMD/src/cuda/cuda_box.cu @@ -96,15 +96,15 @@ void Cuda_Scale_Box( reax_system *system, control_params *control, system->big_box.box[1][1] *= mu[1]; system->big_box.box[2][2] *= mu[2]; - Make_Consistent( &(system->big_box) ); + Make_Consistent( &system->big_box ); Setup_My_Box( system, control ); Setup_My_Ext_Box( system, control ); Update_Comm( system ); - copy_host_device( &system->big_box, &system->d_big_box, - sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->big_box" ); - copy_host_device( &system->my_box, &system->d_my_box, - sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->my_box" ); - copy_host_device( &system->my_ext_box, &system->d_my_ext_box, - sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->my_ext_box" ); + sCudaMemcpy( &system->d_big_box, &system->big_box, + sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &system->d_my_box, &system->my_box, + sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &system->d_my_ext_box, &system->my_ext_box, + sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); } diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu index 7d71c6ac..3933082c 100644 --- a/PG-PuReMD/src/cuda/cuda_charges.cu +++ b/PG-PuReMD/src/cuda/cuda_charges.cu @@ -87,20 +87,20 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system /* copy row indices from device */ start = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::start" ); end = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::end" ); - copy_host_device( start, A->start, sizeof(int) * system->total_cap, - cudaMemcpyDeviceToHost, "Sort_Matrix_Rows::start" ); - copy_host_device( end, A->end, sizeof(int) * system->total_cap, - cudaMemcpyDeviceToHost, "Sort_Matrix_Rows::end" ); + sCudaMemcpy( start, A->start, sizeof(int) * system->total_cap, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( end, A->end, sizeof(int) * system->total_cap, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); /* make copies of column indices and non-zero values */ - cuda_malloc( (void **)&d_j_temp, 
sizeof(int) * system->total_cm_entries, + cuda_malloc( (void **) &d_j_temp, sizeof(int) * system->total_cm_entries, FALSE, "Sort_Matrix_Rows::d_j_temp" ); - cuda_malloc( (void **)&d_val_temp, sizeof(real) * system->total_cm_entries, + cuda_malloc( (void **) &d_val_temp, sizeof(real) * system->total_cm_entries, FALSE, "Sort_Matrix_Rows::d_val_temp" ); - copy_device( d_j_temp, A->j, sizeof(int) * system->total_cm_entries, - "Sort_Matrix_Rows::d_j_temp" ); - copy_device( d_val_temp, A->val, sizeof(real) * system->total_cm_entries, - "Sort_Matrix_Rows::d_val_temp" ); + sCudaMemcpy( d_j_temp, A->j, sizeof(int) * system->total_cm_entries, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( d_val_temp, A->val, sizeof(real) * system->total_cm_entries, + cudaMemcpyDeviceToDevice, __FILE__, __LINE__ ); for ( i = 0; i < system->n; ++i ) { @@ -421,8 +421,8 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system, ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n ); cudaCheckError( ); - copy_host_device( q, spad, sizeof(real) * system->n, - cudaMemcpyDeviceToHost, "Extrapolate_Charges_QEq_Part2::spad" ); + sCudaMemcpy( q, spad, sizeof(real) * system->n, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } @@ -455,8 +455,8 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system, sizeof(real) * (system->N - system->n), "Update_Ghost_Atom_Charges::workspace->scratch" ); spad = (real *) workspace->scratch; - copy_host_device( &q[system->n], spad, sizeof(real) * (system->N - system->n), - cudaMemcpyHostToDevice, "Update_Ghost_Atom_Charges::q" ); + sCudaMemcpy( spad, &q[system->n], sizeof(real) * (system->N - system->n), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE >>> ( system->d_my_atoms, spad, system->n, system->N ); @@ -501,8 +501,8 @@ static void Calculate_Charges_QEq( reax_system const * const system, ( spad, &spad[blocks], blocks ); cudaCheckError( ); - copy_host_device( &my_sum, &spad[blocks], - sizeof(rvec2), cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," ); + sCudaMemcpy( &my_sum, &spad[blocks], + sizeof(rvec2), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); #else cuda_check_malloc( &workspace->scratch, &workspace->scratch_size, sizeof(real) * 2, @@ -513,8 +513,8 @@ static void Calculate_Charges_QEq( reax_system const * const system, Cuda_Reduction_Sum( workspace->d_workspace->s, &spad[0], system->n ); Cuda_Reduction_Sum( workspace->d_workspace->t, &spad[1], system->n ); - copy_host_device( my_sum, spad, sizeof(real) * 2, - cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," ); + sCudaMemcpy( my_sum, spad, sizeof(real) * 2, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); #endif /* global reduction on pseudo-charges for s and t */ diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu index 3fd0b445..4fda724c 100644 --- a/PG-PuReMD/src/cuda/cuda_copy.cu +++ b/PG-PuReMD/src/cuda/cuda_copy.cu @@ -32,24 +32,19 @@ extern "C" void Cuda_Copy_Grid_Host_to_Device( grid *host, grid *device ) ivec_Copy( device->ghost_hbond_span, host->ghost_hbond_span ); ivec_Copy( device->ghost_bond_span, host->ghost_bond_span ); - copy_host_device( host->str, device->str, sizeof(int) * total, - cudaMemcpyHostToDevice, - "Cuda_Copy_Grid_Host_to_Device::str" ); - copy_host_device( host->end, device->end, sizeof(int) * total, - cudaMemcpyHostToDevice, - "Cuda_Copy_Grid_Host_to_Device::end" ); - copy_host_device( host->cutoff, device->cutoff, sizeof(real) * total, - 
cudaMemcpyHostToDevice, - "Cuda_Copy_Grid_Host_to_Device::cutoff" ); - copy_host_device( host->nbrs_x, device->nbrs_x, sizeof(ivec) * total - * host->max_nbrs, cudaMemcpyHostToDevice, - "Cuda_Copy_Grid_Host_to_Device::nbrs_x" ); - copy_host_device( host->nbrs_cp, device->nbrs_cp, sizeof(rvec) * total - * host->max_nbrs, cudaMemcpyHostToDevice, - "Cuda_Copy_Grid_Host_to_Device::nbrs_cp" ); - - copy_host_device( host->rel_box, device->rel_box, sizeof(ivec) * total, - cudaMemcpyHostToDevice, "Cuda_Copy_Grid_Host_to_Device::rel_box" ); + sCudaMemcpy( device->str, host->str, sizeof(int) * total, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( device->end, host->end, sizeof(int) * total, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( device->cutoff, host->cutoff, sizeof(real) * total, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( device->nbrs_x, host->nbrs_x, sizeof(ivec) * total * host->max_nbrs, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( device->nbrs_cp, host->nbrs_cp, sizeof(rvec) * total * host->max_nbrs, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + + sCudaMemcpy( device->rel_box, host->rel_box, sizeof(ivec) * total, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); device->max_nbrs = host->max_nbrs; } @@ -58,10 +53,8 @@ extern "C" void Cuda_Copy_Grid_Host_to_Device( grid *host, grid *device ) /* Copy atom info from host to device */ extern "C" void Cuda_Copy_Atoms_Host_to_Device( reax_system *system ) { - copy_host_device( system->my_atoms, system->d_my_atoms, - sizeof(reax_atom) * system->N, - cudaMemcpyHostToDevice, - "Cuda_Copy_Atoms_Host_to_Device::system->my_atoms" ); + sCudaMemcpy( system->d_my_atoms, system->my_atoms, sizeof(reax_atom) * system->N, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); } @@ -70,32 +63,31 @@ extern "C" void Cuda_Copy_System_Host_to_Device( reax_system *system ) { Cuda_Copy_Atoms_Host_to_Device( system ); - copy_host_device( &system->my_box, system->d_my_box, sizeof(simulation_box), - cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->my_box" ); + sCudaMemcpy( system->d_my_box, &system->my_box, + sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); - copy_host_device( &system->my_ext_box, system->d_my_ext_box, - sizeof(simulation_box), cudaMemcpyHostToDevice, - "Cuda_Copy_System_Host_to_Device::system->my_ext_box" ); + sCudaMemcpy( system->d_my_ext_box, &system->my_ext_box, + sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ ); - copy_host_device( system->reax_param.sbp, system->reax_param.d_sbp, + sCudaMemcpy( system->reax_param.d_sbp, system->reax_param.sbp, sizeof(single_body_parameters) * system->reax_param.num_atom_types, - cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->sbp" ); - copy_host_device( system->reax_param.tbp, system->reax_param.d_tbp, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( system->reax_param.d_tbp, system->reax_param.tbp, sizeof(two_body_parameters) * POW(system->reax_param.num_atom_types, 2), - cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->tbp" ); - copy_host_device( system->reax_param.thbp, system->reax_param.d_thbp, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( system->reax_param.d_thbp, system->reax_param.thbp, sizeof(three_body_header) * POW(system->reax_param.num_atom_types, 3), - cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->thbh" ); - copy_host_device( system->reax_param.hbp, system->reax_param.d_hbp, + 
cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( system->reax_param.d_hbp, system->reax_param.hbp, sizeof(hbond_parameters) * POW(system->reax_param.num_atom_types, 3), - cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->hbond" ); - copy_host_device( system->reax_param.fbp, system->reax_param.d_fbp, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( system->reax_param.d_fbp, system->reax_param.fbp, sizeof(four_body_header) * POW(system->reax_param.num_atom_types, 4), - cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->four_header" ); + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); - copy_host_device( system->reax_param.gp.l, system->reax_param.d_gp.l, - sizeof(real) * system->reax_param.gp.n_global, cudaMemcpyHostToDevice, - "Cuda_Copy_System_Host_to_Device::system->global_parameters" ); + sCudaMemcpy( system->reax_param.d_gp.l, system->reax_param.gp.l, + sizeof(real) * system->reax_param.gp.n_global, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); system->reax_param.d_gp.n_global = system->reax_param.gp.n_global; system->reax_param.d_gp.vdw_type = system->reax_param.gp.vdw_type; @@ -105,23 +97,23 @@ extern "C" void Cuda_Copy_System_Host_to_Device( reax_system *system ) /* Copy atom info from device to host */ extern "C" void Cuda_Copy_Atoms_Device_to_Host( reax_system *system ) { - copy_host_device( system->my_atoms, system->d_my_atoms, + sCudaMemcpy( system->my_atoms, system->d_my_atoms, sizeof(reax_atom) * system->N, - cudaMemcpyDeviceToHost, "Cuda_Copy_Atoms_Device_to_Host::my_atoms" ); + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } /* Copy simulation data from device to host */ extern "C" void Cuda_Copy_Simulation_Data_Device_to_Host( simulation_data *host, simulation_data *dev ) { - copy_host_device( &host->my_en, &dev->my_en, sizeof(energy_data), - cudaMemcpyDeviceToHost, "simulation_data:energy_data" ); - copy_host_device( &host->kin_press, &dev->kin_press, sizeof(real), - cudaMemcpyDeviceToHost, "simulation_data:kin_press" ); - copy_host_device( host->int_press, dev->int_press, sizeof(rvec), - cudaMemcpyDeviceToHost, "simulation_data:int_press" ); - copy_host_device( host->ext_press, dev->ext_press, sizeof(rvec), - cudaMemcpyDeviceToHost, "simulation_data:ext_press" ); + sCudaMemcpy( &host->my_en, &dev->my_en, sizeof(energy_data), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( &host->kin_press, &dev->kin_press, sizeof(real), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( host->int_press, dev->int_press, sizeof(rvec), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( host->ext_press, dev->ext_press, sizeof(rvec), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } @@ -146,45 +138,47 @@ extern "C" void Cuda_Copy_List_Device_to_Host( reax_list *host_list, reax_list * fprintf( stderr, " [INFO] trying to copy %d list from device to host\n", type ); #endif - copy_host_device( host_list->index, device_list->index, sizeof(int) * device_list->n, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::list->index" ); - copy_host_device( host_list->end_index, device_list->end_index, sizeof(int) * - device_list->n, cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::list->end_index" ); + sCudaMemcpy( host_list->index, device_list->index, + sizeof(int) * device_list->n, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( host_list->end_index, device_list->end_index, + sizeof(int) * device_list->n, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); switch ( type ) { case 
TYP_FAR_NEIGHBOR: - copy_host_device( host_list->far_nbr_list.nbr, device_list->far_nbr_list.nbr, + sCudaMemcpy( host_list->far_nbr_list.nbr, device_list->far_nbr_list.nbr, sizeof(int) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.nbr" ); - copy_host_device( host_list->far_nbr_list.rel_box, device_list->far_nbr_list.rel_box, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( host_list->far_nbr_list.rel_box, device_list->far_nbr_list.rel_box, sizeof(ivec) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.rel_box" ); - copy_host_device( host_list->far_nbr_list.d, device_list->far_nbr_list.d, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( host_list->far_nbr_list.d, device_list->far_nbr_list.d, sizeof(real) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.d" ); - copy_host_device( host_list->far_nbr_list.dvec, device_list->far_nbr_list.dvec, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( host_list->far_nbr_list.dvec, device_list->far_nbr_list.dvec, sizeof(rvec) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.dvec" ); + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); break; case TYP_BOND: - copy_host_device( host_list->bond_list, device_list->bond_list, + sCudaMemcpy( host_list->bond_list, device_list->bond_list, sizeof(bond_data) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::bond_list" ); + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); break; case TYP_HBOND: - copy_host_device( host_list->hbond_list, device_list->hbond_list, + sCudaMemcpy( host_list->hbond_list, device_list->hbond_list, sizeof(hbond_data) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::hbond_list" ); + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); break; case TYP_THREE_BODY: - copy_host_device( host_list->three_body_list, + sCudaMemcpy( host_list->three_body_list, device_list->three_body_list, sizeof(three_body_interaction_data ) * device_list->max_intrs, - cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::three_body_list" ); + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); break; default: @@ -217,8 +211,8 @@ extern "C" void Cuda_Copy_MPI_Data_Host_to_Device( mpi_datatypes *mpi_data ) { /* index is set during SendRecv and reused during MPI comms afterward, * so copy to device while SendRecv is still done on the host */ - copy_host_device( mpi_data->out_buffers[i].index, mpi_data->d_out_buffers[i].index, - mpi_data->d_out_buffers[i].index_size, cudaMemcpyHostToDevice, - "Cuda_Copy_MPI_Data_Host_to_Device::mpi_data->d_in1_buffer" ); + sCudaMemcpy( mpi_data->d_out_buffers[i].index, mpi_data->out_buffers[i].index, + mpi_data->d_out_buffers[i].index_size, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); } } diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu index 2c565e1b..306ccec6 100644 --- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu @@ -600,8 +600,8 @@ real Dot( storage * const workspace, // ret = MPI_Allreduce( &spad[k], &sum, 1, MPI_DOUBLE, MPI_SUM, comm ); // Check_MPI_Error( ret, __FILE__, __LINE__ ); //#else - copy_host_device( &temp, &spad[k], sizeof(real), - cudaMemcpyDeviceToHost, "Dot::temp" ); + sCudaMemcpy( &temp, &spad[k], sizeof(real), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); ret = 
MPI_Allreduce( &temp, &sum, 1, MPI_DOUBLE, MPI_SUM, comm ); Check_MPI_Error( ret, __FILE__, __LINE__ ); @@ -636,8 +636,8 @@ real Dot_local( storage * const workspace, Cuda_Reduction_Sum( spad, &spad[k], k ); //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI - copy_host_device( &sum, &spad[k], sizeof(real), - cudaMemcpyDeviceToHost, "Dot_local::sum" ); + sCudaMemcpy( &sum, &spad[k], sizeof(real), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); return sum; } @@ -683,8 +683,8 @@ void Dot_local_rvec2( control_params const * const control, cudaCheckError( ); //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI - copy_host_device( &sum, &spad[k + blocks], sizeof(rvec2), - cudaMemcpyDeviceToHost, "Dot_local_rvec2::sum" ); + sCudaMemcpy( &sum, &spad[k + blocks], sizeof(rvec2), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); *sum1 = sum[0]; *sum2 = sum[1]; diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu index cfdeef7c..ed76e02c 100644 --- a/PG-PuReMD/src/cuda/cuda_forces.cu +++ b/PG-PuReMD/src/cuda/cuda_forces.cu @@ -1763,8 +1763,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, Cuda_Reduction_Sum( system->d_max_cm_entries, system->d_total_cm_entries, workspace->d_workspace->H.n_max ); - copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" ); + sCudaMemcpy( &system->total_cm_entries, system->d_total_cm_entries, + sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } if ( realloc_bonds == TRUE ) @@ -1782,8 +1782,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds, system->total_cap ); - copy_host_device( &system->total_bonds, system->d_total_bonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" ); + sCudaMemcpy( &system->total_bonds, system->d_total_bonds, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } if ( system->numH > 0 && control->hbond_cut > 0.0 && realloc_hbonds == TRUE ) @@ -1801,8 +1801,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control, Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds, system->total_cap ); - copy_host_device( &system->total_hbonds, system->d_total_hbonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" ); + sCudaMemcpy( &system->total_hbonds, system->d_total_hbonds, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } else if ( step == 0 && (system->numH == 0 || control->hbond_cut <= 0.0) ) { @@ -2002,12 +2002,12 @@ int Cuda_Init_Forces( reax_system *system, control_params *control, #endif /* check reallocation flags on device */ - copy_host_device( &realloc_cm, system->d_realloc_cm_entries, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" ); - copy_host_device( &realloc_bonds, system->d_realloc_bonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" ); - copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" ); + sCudaMemcpy( &realloc_cm, system->d_realloc_cm_entries, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( &realloc_bonds, system->d_realloc_bonds, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( &realloc_hbonds, 
system->d_realloc_hbonds, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) @@ -2211,10 +2211,10 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control, #endif /* check reallocation flags on device */ - copy_host_device( &realloc_bonds, system->d_realloc_bonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" ); - copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" ); + sCudaMemcpy( &realloc_bonds, system->d_realloc_bonds, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); + sCudaMemcpy( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) @@ -2356,13 +2356,13 @@ static void Cuda_Compute_Total_Force( reax_system *system, control_params *contr * based on the neighbors information each processor has had. * final values of force on each atom needs to be computed by adding up * all partially-final pieces */ - copy_host_device( f, workspace->d_workspace->f, sizeof(rvec) * system->N , - cudaMemcpyDeviceToHost, "Cuda_Compute_Total_Force::workspace->d_workspace->f" ); + sCudaMemcpy( f, workspace->d_workspace->f, sizeof(rvec) * system->N , + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); Coll( system, mpi_data, f, RVEC_PTR_TYPE, mpi_data->mpi_rvec ); - copy_host_device( f, workspace->d_workspace->f, sizeof(rvec) * system->N, - cudaMemcpyHostToDevice, "Cuda_Compute_Total_Force::workspace->d_workspace->f" ); + sCudaMemcpy( workspace->d_workspace->f, f, sizeof(rvec) * system->N, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); Cuda_Total_Forces_Part2( system, workspace ); } diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu index 9a4a50e6..acd257f5 100644 --- a/PG-PuReMD/src/cuda/cuda_integrate.cu +++ b/PG-PuReMD/src/cuda/cuda_integrate.cu @@ -237,7 +237,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, storage *worksp real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *workspace, - real dt, real v_xi_old, real * d_my_ekin, real * d_total_my_ekin ) + real dt, real v_xi_old, real *d_my_ekin, real *d_total_my_ekin ) { int blocks; real my_ekin; @@ -252,9 +252,8 @@ real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *worksp Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n ); - copy_host_device( &my_ekin, d_total_my_ekin, sizeof(real), - cudaMemcpyDeviceToHost, - "Velocity_Verlet_Nose_Hoover_NVT_Part3::d_total_my_ekin" ); + sCudaMemcpy( &my_ekin, d_total_my_ekin, sizeof(real), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); return my_ekin; } diff --git a/PG-PuReMD/src/cuda/cuda_lookup.cu b/PG-PuReMD/src/cuda/cuda_lookup.cu index c65046ff..200699e2 100644 --- a/PG-PuReMD/src/cuda/cuda_lookup.cu +++ b/PG-PuReMD/src/cuda/cuda_lookup.cu @@ -19,7 +19,7 @@ extern "C" void Cuda_Copy_LR_Lookup_Table_Host_to_Device( reax_system *system, fprintf( stderr, "Copying the LR Lookyp Table to the device ... 
\n" ); cuda_malloc( (void **) &workspace->d_LR, - sizeof(LR_lookup_table) * ( num_atom_types * num_atom_types ), + sizeof(LR_lookup_table) * num_atom_types * num_atom_types, FALSE, "LR_lookup:table" ); /* @@ -30,9 +30,9 @@ extern "C" void Cuda_Copy_LR_Lookup_Table_Host_to_Device( reax_system *system, existing_types[ system->atoms[i].type ] = 1; */ - copy_host_device( workspace->LR, workspace->d_LR, + sCudaMemcpy( workspace->d_LR, workspace->LR, sizeof(LR_lookup_table) * (num_atom_types * num_atom_types), - cudaMemcpyHostToDevice, "LR_lookup:table" ); + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); for( i = 0; i < num_atom_types; ++i ) { @@ -42,47 +42,61 @@ extern "C" void Cuda_Copy_LR_Lookup_Table_Host_to_Device( reax_system *system, { if ( aggregated[j] ) { - cuda_malloc( (void **) &d_y, - sizeof(LR_data) * (control->tabulate + 1), FALSE, "LR_lookup:d_y" ); - copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].y, d_y, - sizeof(LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y" ); - copy_host_device ( &d_y, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].y, - sizeof(LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y" ); - - cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:h" ); - copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].H, temp, - sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h" ); - copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].H, - sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h" ); - - cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:vdW" ); - copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].vdW, temp, - sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW" ); - copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].vdW, - sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW" ); - - cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:CEvd" ); - copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].CEvd, temp, - sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd" ); - copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEvd, - sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd"); - - cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:ele" ); - copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].ele, temp, - sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele" ); - copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].ele, - sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele" ); - - cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:ceclmb" ); - copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].CEclmb, temp, - sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb" ); - copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEclmb, - sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb" ); + cuda_malloc( (void **) &d_y, sizeof(LR_data) * (control->tabulate + 1), + FALSE, "LR_lookup:d_y" ); + sCudaMemcpy( d_y, 
workspace->LR[ index_lr(i, j, num_atom_types) ].y, + sizeof(LR_data) * (control->tabulate + 1), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].y, &d_y, + sizeof(LR_data *), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + + cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), + FALSE, "LR_lookup:h" ); + sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].H, + sizeof(cubic_spline_coef) * (control->tabulate + 1), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].H, &temp, + sizeof(cubic_spline_coef *), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + + cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), + FALSE, "LR_lookup:vdW" ); + sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].vdW, + sizeof(cubic_spline_coef) * (control->tabulate + 1), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].vdW, &temp, + sizeof(cubic_spline_coef *), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + + cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), + FALSE, "LR_lookup:CEvd" ); + sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].CEvd, + sizeof(cubic_spline_coef) * (control->tabulate + 1), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEvd, &temp, + sizeof(cubic_spline_coef *), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + + cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), + FALSE, "LR_lookup:ele" ); + sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].ele, + sizeof(cubic_spline_coef) * (control->tabulate + 1), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].ele, &temp, + sizeof(cubic_spline_coef *), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + + cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), + FALSE, "LR_lookup:ceclmb" ); + sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].CEclmb, + sizeof(cubic_spline_coef) * (control->tabulate + 1), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); + sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEclmb, &temp, + sizeof(cubic_spline_coef *), + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); } } } } - - fprintf( stderr, "Copy of the LR Lookup Table to the device complete ... \n" ); } - diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu index 1ef460f3..bba3f788 100644 --- a/PG-PuReMD/src/cuda/cuda_neighbors.cu +++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu @@ -535,8 +535,8 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system, // cudaCheckError( ); /* check reallocation flag on device */ - copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" ); + sCudaMemcpy( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); ret = (ret_far_nbr == FALSE) ? 
SUCCESS : FAILURE; workspace->d_workspace->realloc.far_nbrs = ret_far_nbr; @@ -591,8 +591,8 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data ) Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs, system->total_cap ); - copy_host_device( &system->total_far_nbrs, system->d_total_far_nbrs, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" ); + sCudaMemcpy( &system->total_far_nbrs, system->d_total_far_nbrs, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); #if defined(LOG_PERFORMANCE) cudaEventRecord( time_event[1] ); diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu index 532d643c..a74c18b4 100644 --- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu +++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu @@ -94,8 +94,8 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro Cuda_Reduction_Sum( hindex, system->d_numH, system->N ); #endif - copy_host_device( &system->numH, system->d_numH, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Reset_Atoms_HBond_Indices::d_numH" ); + sCudaMemcpy( &system->numH, system->d_numH, sizeof(int), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu index a3a485a5..d638c044 100644 --- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu +++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu @@ -601,14 +601,14 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system, "Dual_Sparse_MatVec_Comm_Part1::workspace->host_scratch" ); spad = (rvec2 *) workspace->host_scratch; - copy_host_device( spad, (void *) x, sizeof(rvec2) * n, - cudaMemcpyDeviceToHost, "Dual_Sparse_MatVec_Comm_Part1::x (d-to-h)" ); + sCudaMemcpy( spad, (void *) x, sizeof(rvec2) * n, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ Dist( system, mpi_data, spad, buf_type, mpi_type ); - copy_host_device( spad, (void *) x, sizeof(rvec2) * n, - cudaMemcpyHostToDevice, "Dual_Sparse_MatVec_Comm_Part1::x (h-to-d)" ); + sCudaMemcpy( (void *) x, spad, sizeof(rvec2) * n, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); #endif } @@ -696,13 +696,13 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system, sizeof(rvec2) * n1, TRUE, SAFE_ZONE, "Dual_Sparse_MatVec_Comm_Part2::workspace->host_scratch" ); spad = (rvec2 *) workspace->host_scratch; - copy_host_device( spad, b, sizeof(rvec2) * n1, - cudaMemcpyDeviceToHost, "Dual_Sparse_MatVec_Comm_Part2::b" ); + sCudaMemcpy( spad, b, sizeof(rvec2) * n1, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); Coll( system, mpi_data, spad, buf_type, mpi_type ); - copy_host_device( spad, b, sizeof(rvec2) * n2, - cudaMemcpyHostToDevice, "Dual_Sparse_MatVec_Comm_Part2::b" ); + sCudaMemcpy( b, spad, sizeof(rvec2) * n2, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); #endif } } @@ -778,14 +778,14 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system, sizeof(real) * n, TRUE, SAFE_ZONE, "Sparse_MatVec_Comm_Part1::workspace->host_scratch" ); spad = (real *) workspace->host_scratch; - copy_host_device( spad, (void *) x, sizeof(real) * n, - cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part1::x" ); + sCudaMemcpy( spad, (void *) x, sizeof(real) * n, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */ Dist( system, mpi_data, spad, buf_type, 
mpi_type ); - copy_host_device( spad, (void *) x, sizeof(real) * n, - cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part1::x" ); + sCudaMemcpy( (void *) x, spad, sizeof(real) * n, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); #endif } @@ -871,13 +871,13 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system, sizeof(real) * n1, TRUE, SAFE_ZONE, "Sparse_MatVec_Comm_Part2::workspace->host_scratch" ); spad = (real *) workspace->host_scratch; - copy_host_device( spad, b, sizeof(real) * n1, - cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::b" ); + sCudaMemcpy( spad, b, sizeof(real) * n1, + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); Coll( system, mpi_data, spad, buf_type, mpi_type ); - copy_host_device( spad, b, sizeof(real) * n2, - cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::b" ); + sCudaMemcpy( b, spad, sizeof(real) * n2, + cudaMemcpyHostToDevice, __FILE__, __LINE__ ); #endif } } diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu index 4e819dd1..ee10c7ca 100644 --- a/PG-PuReMD/src/cuda/cuda_system_props.cu +++ b/PG-PuReMD/src/cuda/cuda_system_props.cu @@ -539,8 +539,8 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control, ( spad, &spad[control->blocks], control->blocks ); cudaCheckError( ); - copy_host_device( xcm, &spad[control->blocks], sizeof(rvec), - cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::xcm" ); + sCudaMemcpy( xcm, &spad[control->blocks], sizeof(rvec), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); // vcm cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1), @@ -556,8 +556,8 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control, ( spad, &spad[control->blocks], control->blocks ); cudaCheckError( ); - copy_host_device( vcm, &spad[control->blocks], sizeof(rvec), - cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::vcm" ); + sCudaMemcpy( vcm, &spad[control->blocks], sizeof(rvec), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); // amcm cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1), @@ -573,8 +573,8 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control, ( spad, &spad[control->blocks], control->blocks ); cudaCheckError( ); - copy_host_device( amcm, &spad[control->blocks], sizeof(rvec), - cudaMemcpyDeviceToHost,"Cuda_Compute_Momentum::amcm" ); + sCudaMemcpy( amcm, &spad[control->blocks], sizeof(rvec), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } @@ -614,9 +614,9 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c ( spad, &spad[6 * control->blocks], control->blocks ); cudaCheckError( ); - copy_host_device( t, &spad[6 * control->blocks], + sCudaMemcpy( t, &spad[6 * control->blocks], sizeof(real) * 6, cudaMemcpyDeviceToHost, - "Cuda_Compute_Intertial_Tensor::t" ); + __FILE__, __LINE__ ); } @@ -687,9 +687,8 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system, * and this call finishes the global reduction across all blocks */ Cuda_Reduction_Sum( kinetic_energy, &kinetic_energy[system->n], system->n ); - copy_host_device( &data->my_en.e_kin, &kinetic_energy[system->n], - sizeof(real), cudaMemcpyDeviceToHost, - "Cuda_Compute_Kinetic_Energy::tmp" ); + sCudaMemcpy( &data->my_en.e_kin, &kinetic_energy[system->n], + sizeof(real), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); ret = MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin, 1, MPI_DOUBLE, MPI_SUM, comm ); @@ -722,8 +721,8 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control, 
Cuda_Reduction_Sum( spad, &spad[system->n], system->n ); - copy_host_device( &my_M, &spad[system->n], sizeof(real), - cudaMemcpyDeviceToHost, "total_mass::my_M" ); + sCudaMemcpy( &my_M, &spad[system->n], sizeof(real), + cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); ret = MPI_Allreduce( &my_M, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm ); Check_MPI_Error( ret, __FILE__, __LINE__ ); @@ -877,9 +876,8 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control, control->blocks ); cudaCheckError( ); - copy_host_device( &int_press, &rvec_spad[system->n + control->blocks], - sizeof(rvec), cudaMemcpyDeviceToHost, - "Cuda_Compute_Pressure::int_press" ); + sCudaMemcpy( &int_press, &rvec_spad[system->n + control->blocks], + sizeof(rvec), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); } /* sum up internal and external pressure */ diff --git a/PG-PuReMD/src/cuda/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu index a3dbe4cb..952ab10b 100644 --- a/PG-PuReMD/src/cuda/cuda_utils.cu +++ b/PG-PuReMD/src/cuda/cuda_utils.cu @@ -109,42 +109,35 @@ void cuda_check_malloc( void **ptr, size_t *cur_size, size_t new_size, const cha } -void copy_host_device( void *host, void *dev, size_t size, - cudaMemcpyKind dir, const char *msg ) +/* Safe wrapper around cudaMemcpy + * + * dest: address to be copied to + * src: address to be copied from + * size: num. bytes to copy + * dir: CUDA enum specifying address types for dest and src + * filename: NULL-terminated source filename where function call originated + * line: line of source file where function call originated + */ +void sCudaMemcpy( void * const dest, void const * const src, size_t size, + cudaMemcpyKind dir, const char * const filename, int line ) { - cudaError_t retVal = cudaErrorNotReady; + int rank; + cudaError_t ret; - if ( dir == cudaMemcpyHostToDevice ) - { - retVal = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice ); - } - else - { - retVal = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost ); - } + ret = cudaMemcpy( dest, src, size, dir ); - if ( retVal != cudaSuccess ) + if ( ret != cudaSuccess ) { - fprintf( stderr, - "[ERROR] could not copy resource %s from host to device\n [INFO] CUDA API error code: %d\n", - msg, retVal ); - exit( INSUFFICIENT_MEMORY ); - } -} + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + const char *str = cudaGetErrorString( ret ); + fprintf( stderr, "[ERROR] CUDA error: memory copy failure\n" ); + fprintf( stderr, " [INFO] At line %d in file %.*s on MPI processor %d\n", + line, (int) strlen(filename), filename, rank ); + fprintf( stderr, " [INFO] Error code: %d\n", ret ); + fprintf( stderr, " [INFO] Error message: %.*s\n", (int) strlen(str), str ); -void copy_device( void *dest, void *src, size_t size, const char *msg ) -{ - cudaError_t retVal; - - retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice ); - - if ( retVal != cudaSuccess ) - { - fprintf( stderr, - "[ERROR] could not copy resource %s from device to device\n [INFO] CUDA API error code: %d\n", - msg, retVal ); - exit( INSUFFICIENT_MEMORY ); + MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR ); } } diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h index 6a3a6efd..ac4288b1 100644 --- a/PG-PuReMD/src/cuda/cuda_utils.h +++ b/PG-PuReMD/src/cuda/cuda_utils.h @@ -12,9 +12,8 @@ void cuda_memset( void *, int , size_t , const char * ); void cuda_check_malloc( void **, size_t *, size_t, const char * ); -void copy_host_device( void *, void *, size_t, enum cudaMemcpyKind, const char * ); - -void copy_device( void *, void *, size_t, 
const char * ); +void sCudaMemcpy( void * const, void const * const, size_t, + enum cudaMemcpyKind, const char * const, int ); void Cuda_Print_Mem_Usage( ); diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu index ecab2563..989b26bc 100644 --- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu +++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu @@ -1365,8 +1365,8 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds ); - copy_host_device( &system->total_thbodies, system->d_total_thbodies, sizeof(int), - cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" ); + sCudaMemcpy( &system->total_thbodies, system->d_total_thbodies, + sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ ); if ( data->step - data->prev_steps == 0 ) { -- GitLab
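
The migrated call sites above follow one rule: sCudaMemcpy takes ( dest, src ) in cudaMemcpy argument order with an explicit direction, so host-to-device copies swap the first two arguments relative to the old copy_host_device( host, dev, ... ) convention, while device-to-host and device-to-device copies keep their original order. A minimal sketch of a migrated round trip, assuming a hypothetical host array h_buf mirrored by a device buffer d_buf of n ints (illustrative names, not part of this patch):

    int n = 128;
    int *h_buf, *d_buf;

    h_buf = (int *) smalloc( sizeof(int) * n, "example::h_buf" );
    cuda_malloc( (void **) &d_buf, sizeof(int) * n, FALSE, "example::d_buf" );

    /* dest first, then src, as with cudaMemcpy; on failure the wrapper
     * prints this file/line and the MPI rank, then calls MPI_Abort */
    sCudaMemcpy( d_buf, h_buf, sizeof(int) * n,
            cudaMemcpyHostToDevice, __FILE__, __LINE__ );

    /* ... launch kernels over d_buf ... */

    sCudaMemcpy( h_buf, d_buf, sizeof(int) * n,
            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );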