From 1b639864991825373af45975b7c1d155850a031f Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Wed, 28 Apr 2021 12:19:35 -0400
Subject: [PATCH] PG-PuReMD: refactor CUDA memory management routines.

---
 PG-PuReMD/src/comm_tools.c                |  17 +--
 PG-PuReMD/src/cuda/cuda_allocate.cu       |  96 +++++++--------
 PG-PuReMD/src/cuda/cuda_box.cu            |  14 +--
 PG-PuReMD/src/cuda/cuda_charges.cu        |  36 +++---
 PG-PuReMD/src/cuda/cuda_copy.cu           | 136 +++++++++++-----------
 PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu  |  12 +-
 PG-PuReMD/src/cuda/cuda_forces.cu         |  40 +++----
 PG-PuReMD/src/cuda/cuda_integrate.cu      |   7 +-
 PG-PuReMD/src/cuda/cuda_lookup.cu         |  98 +++++++++-------
 PG-PuReMD/src/cuda/cuda_neighbors.cu      |   8 +-
 PG-PuReMD/src/cuda/cuda_reset_tools.cu    |   4 +-
 PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu   |  32 ++---
 PG-PuReMD/src/cuda/cuda_system_props.cu   |  30 +++--
 PG-PuReMD/src/cuda/cuda_utils.cu          |  53 ++++-----
 PG-PuReMD/src/cuda/cuda_utils.h           |   5 +-
 PG-PuReMD/src/cuda/cuda_valence_angles.cu |   4 +-
 16 files changed, 292 insertions(+), 300 deletions(-)

diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c
index 44572ba2..4e7b1628 100644
--- a/PG-PuReMD/src/comm_tools.c
+++ b/PG-PuReMD/src/comm_tools.c
@@ -174,26 +174,21 @@ void Estimate_NT_Atoms( reax_system * const system, mpi_datatypes * const mpi_da
 /* Note: filename must be NULL-terminated before calling this function */
 void Check_MPI_Error( int code, const char * const filename, int line )
 {
-    int len;
+    int len, rank;
     char err_msg[MPI_MAX_ERROR_STRING];
-#if defined(DEBUG_FOCUS)
-    int rank;
-
-    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
-    fprintf( stderr, "[INFO] Check_MPI_Error: p%d, file %.*s, line %d\n", rank, (int) strlen(filename), filename, line );
-    fflush( stderr );
-#endif
 
     if ( code != MPI_SUCCESS )
     {
+        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
         MPI_Error_string( code, err_msg, &len );
 
         fprintf( stderr, "[ERROR] MPI error\n" );
         /* strlen safe here only if filename is NULL-terminated before calling Check_MPI_Error */
-        fprintf( stderr, "    [INFO] At line %d in file %.*s\n",
-                line, (int) strlen(filename), filename );
-        fprintf( stderr, "    [INFO] Error code %d\n", code );
+        fprintf( stderr, "    [INFO] At line %d in file %.*s on MPI processor %d\n",
+                line, (int) strlen(filename), filename, rank );
+        fprintf( stderr, "    [INFO] Error code: %d\n", code );
         fprintf( stderr, "    [INFO] Error message: %.*s\n", len, err_msg );
+
         MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 083e107e..6cecfcb6 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -47,21 +47,21 @@ static void Cuda_Reallocate_System_Part1( reax_system *system, storage *workspac
             "Cuda_Reallocate_System_Part1::workspace->scratch" );
     temp = (int *) workspace->scratch;
 
-    copy_device( temp, system->d_cm_entries, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
+    sCudaMemcpy( temp, system->d_cm_entries, sizeof(int) * local_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_cm_entries, "Cuda_Reallocate_System_Part1::d_cm_entries" );
     cuda_malloc( (void **) &system->d_cm_entries,
             sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_cm_entries" );
-    copy_device( system->d_cm_entries, temp, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
+    sCudaMemcpy( system->d_cm_entries, temp, sizeof(int) * local_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
-    copy_device( temp, system->d_max_cm_entries, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
+    sCudaMemcpy( temp, system->d_max_cm_entries, sizeof(int) * local_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_max_cm_entries, "Cuda_Reallocate_System_Part1::d_max_cm_entries" );
     cuda_malloc( (void **) &system->d_max_cm_entries,
             sizeof(int) * system->local_cap, TRUE, "Cuda_Reallocate_System_Part1::d_max_cm_entries" );
-    copy_device( system->d_max_cm_entries, temp, sizeof(int) * local_cap_old,
-            "Cuda_Reallocate_System_Part1::temp" );
+    sCudaMemcpy( system->d_max_cm_entries, temp, sizeof(int) * local_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 }
 
 
@@ -78,69 +78,69 @@ static void Cuda_Reallocate_System_Part2( reax_system *system, storage *workspac
     temp_atom = (reax_atom *) workspace->scratch;
 
     /* free the existing storage for atoms, leave other info allocated */
-    copy_device( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp_atom" );
+    sCudaMemcpy( temp_atom, system->d_my_atoms, sizeof(reax_atom) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_my_atoms, "system::d_my_atoms" );
     cuda_malloc( (void **) &system->d_my_atoms,
             sizeof(reax_atom) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_my_atoms" );
-    copy_device( system->d_my_atoms, temp_atom, sizeof(reax_atom) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp_atom" );
+    sCudaMemcpy( system->d_my_atoms, temp_atom, sizeof(reax_atom) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
     /* list management */
-    copy_device( temp, system->d_far_nbrs, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( temp, system->d_far_nbrs, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_far_nbrs, "Cuda_Reallocate_System_Part2::d_far_nbrs" );
     cuda_malloc( (void **) &system->d_far_nbrs,
             sizeof(int) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_far_nbrs" );
-    copy_device( system->d_far_nbrs, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( system->d_far_nbrs, temp, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
-    copy_device( temp, system->d_max_far_nbrs, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( temp, system->d_max_far_nbrs, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_max_far_nbrs, "Cuda_Reallocate_System_Part2::d_max_far_nbrs" );
     cuda_malloc( (void **) &system->d_max_far_nbrs,
             sizeof(int) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_max_far_nbrs" );
-    copy_device( system->d_max_far_nbrs, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( system->d_max_far_nbrs, temp, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
-    copy_device( temp, system->d_bonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( temp, system->d_bonds, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_bonds, "Cuda_Reallocate_System_Part2::d_bonds" );
     cuda_malloc( (void **) &system->d_bonds,
             sizeof(int) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_bonds" );
-    copy_device( system->d_bonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( system->d_bonds, temp, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
-    copy_device( temp, system->d_max_bonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( temp, system->d_max_bonds, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_max_bonds, "Cuda_Reallocate_System_Part2::d_max_bonds" );
     cuda_malloc( (void **) &system->d_max_bonds,
             sizeof(int) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_max_bonds" );
-    copy_device( system->d_max_bonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( system->d_max_bonds, temp, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
-    copy_device( temp, system->d_hbonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( temp, system->d_hbonds, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_hbonds, "system::d_hbonds" );
     cuda_malloc( (void **) &system->d_hbonds,
             sizeof(int) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_hbonds" );
-    copy_device( system->d_hbonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( system->d_hbonds, temp, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
-    copy_device( temp, system->d_max_hbonds, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( temp, system->d_max_hbonds, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
     cuda_free( system->d_max_hbonds, "system::d_max_hbonds" );
     cuda_malloc( (void **) &system->d_max_hbonds,
             sizeof(int) * system->total_cap, TRUE,
             "Cuda_Reallocate_System_Part2::d_max_hbonds" );
-    copy_device( system->d_max_hbonds, temp, sizeof(int) * total_cap_old,
-            "Cuda_Reallocate_System_Part2::temp" );
+    sCudaMemcpy( system->d_max_hbonds, temp, sizeof(int) * total_cap_old,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 }
 
 
@@ -148,8 +148,8 @@ void Cuda_Allocate_Control( control_params *control )
 {
     cuda_malloc( (void **)&control->d_control_params,
             sizeof(control_params), TRUE, "control_params" );
-    copy_host_device( control, control->d_control_params,
-            sizeof(control_params), cudaMemcpyHostToDevice, "control_params" );
+    sCudaMemcpy( control->d_control_params, control,
+            sizeof(control_params), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 }
 
 
@@ -219,7 +219,8 @@ void Cuda_Allocate_Grid( reax_system *system )
 //
 //        cuda_malloc( (void **) &local_cell.nbrs_x, sizeof(ivec) * host->max_nbrs,
 //                TRUE, "alloc:grid:cells:nbrs_x" );
-//        copy_device( local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof(ivec), "grid:nbrs_x" );
+//        sCudaMemcpy( local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof(ivec),
+//                cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 //        //fprintf( stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x );
 //
 //        cuda_malloc( (void **) &local_cell.nbrs_cp, sizeof(rvec) * host->max_nbrs,
@@ -230,8 +231,8 @@ void Cuda_Allocate_Grid( reax_system *system )
 //        //                TRUE, "alloc:grid:cells:nbrs" );
 //        //fprintf( stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs );
 //
-//        copy_host_device( &local_cell, &device->cells[i], sizeof(grid_cell),
-//                cudaMemcpyHostToDevice, "grid:cell-alloc" );
+//        sCudaMemcpy( &device->cells[i], &local_cell, sizeof(grid_cell),
+//                cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 //    }
 }
 
@@ -248,9 +249,8 @@ void Cuda_Deallocate_Grid_Cell_Atoms( reax_system *system )
 
     for ( i = 0; i < total; ++i )
     {
-        copy_host_device( &local_cell, &device->cells[i], 
-                sizeof(grid_cell), cudaMemcpyDeviceToHost,
-                "Cuda_Deallocate_Grid_Cell_Atoms::grid" );
+        sCudaMemcpy( &local_cell, &device->cells[i], 
+                sizeof(grid_cell), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
         cuda_free( local_cell.atoms,
                 "Cuda_Deallocate_Grid_Cell_Atoms::grid_cell.atoms" );
@@ -270,12 +270,12 @@ void Cuda_Allocate_Grid_Cell_Atoms( reax_system *system, int cap )
 
     for ( i = 0; i < total; i++ )
     {
-        copy_host_device( &local_cell, &device->cells[i], 
-                sizeof(grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc" );
+        sCudaMemcpy( &local_cell, &device->cells[i], 
+                sizeof(grid_cell), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
         cuda_malloc( (void **)&local_cell.atoms, sizeof(int) * cap, 
                 TRUE, "realloc:grid:cells:atoms" );
-        copy_host_device( &local_cell, &device->cells[i], 
-                sizeof(grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc" );
+        sCudaMemcpy( &device->cells[i], &local_cell, 
+                sizeof(grid_cell), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
     }
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_box.cu b/PG-PuReMD/src/cuda/cuda_box.cu
index e13a1a01..fb8b62ab 100644
--- a/PG-PuReMD/src/cuda/cuda_box.cu
+++ b/PG-PuReMD/src/cuda/cuda_box.cu
@@ -96,15 +96,15 @@ void Cuda_Scale_Box( reax_system *system, control_params *control,
     system->big_box.box[1][1] *= mu[1];
     system->big_box.box[2][2] *= mu[2];
 
-    Make_Consistent( &(system->big_box) );
+    Make_Consistent( &system->big_box );
     Setup_My_Box( system, control );
     Setup_My_Ext_Box( system, control );
     Update_Comm( system );
 
-    copy_host_device( &system->big_box, &system->d_big_box,
-            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->big_box" );
-    copy_host_device( &system->my_box, &system->d_my_box,
-            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->my_box" );
-    copy_host_device( &system->my_ext_box, &system->d_my_ext_box,
-            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::simulation_data->my_ext_box" );
+    sCudaMemcpy( &system->d_big_box, &system->big_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( &system->d_my_box, &system->my_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( &system->d_my_ext_box, &system->my_ext_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index 7d71c6ac..3933082c 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -87,20 +87,20 @@ void Sort_Matrix_Rows( sparse_matrix * const A, reax_system const * const system
     /* copy row indices from device */
     start = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::start" );
     end = (int *) smalloc( sizeof(int) * system->total_cap, "Sort_Matrix_Rows::end" );
-    copy_host_device( start, A->start, sizeof(int) * system->total_cap, 
-            cudaMemcpyDeviceToHost, "Sort_Matrix_Rows::start" );
-    copy_host_device( end, A->end, sizeof(int) * system->total_cap, 
-            cudaMemcpyDeviceToHost, "Sort_Matrix_Rows::end" );
+    sCudaMemcpy( start, A->start, sizeof(int) * system->total_cap, 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( end, A->end, sizeof(int) * system->total_cap, 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     /* make copies of column indices and non-zero values */
-    cuda_malloc( (void **)&d_j_temp, sizeof(int) * system->total_cm_entries,
+    cuda_malloc( (void **) &d_j_temp, sizeof(int) * system->total_cm_entries,
             FALSE, "Sort_Matrix_Rows::d_j_temp" );
-    cuda_malloc( (void **)&d_val_temp, sizeof(real) * system->total_cm_entries,
+    cuda_malloc( (void **) &d_val_temp, sizeof(real) * system->total_cm_entries,
             FALSE, "Sort_Matrix_Rows::d_val_temp" );
-    copy_device( d_j_temp, A->j, sizeof(int) * system->total_cm_entries,
-            "Sort_Matrix_Rows::d_j_temp" );
-    copy_device( d_val_temp, A->val, sizeof(real) * system->total_cm_entries,
-            "Sort_Matrix_Rows::d_val_temp" );
+    sCudaMemcpy( d_j_temp, A->j, sizeof(int) * system->total_cm_entries,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( d_val_temp, A->val, sizeof(real) * system->total_cm_entries,
+            cudaMemcpyDeviceToDevice, __FILE__, __LINE__ );
 
     for ( i = 0; i < system->n; ++i )
     {
@@ -421,8 +421,8 @@ static void Extrapolate_Charges_QEq_Part2( reax_system const * const system,
         ( system->d_my_atoms, *(workspace->d_workspace), u, spad, system->n );
     cudaCheckError( );
 
-    copy_host_device( q, spad, sizeof(real) * system->n, 
-            cudaMemcpyDeviceToHost, "Extrapolate_Charges_QEq_Part2::spad" );
+    sCudaMemcpy( q, spad, sizeof(real) * system->n, 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 }
 
 
@@ -455,8 +455,8 @@ static void Update_Ghost_Atom_Charges( reax_system const * const system,
             sizeof(real) * (system->N - system->n),
             "Update_Ghost_Atom_Charges::workspace->scratch" );
     spad = (real *) workspace->scratch;
-    copy_host_device( &q[system->n], spad, sizeof(real) * (system->N - system->n),
-            cudaMemcpyHostToDevice, "Update_Ghost_Atom_Charges::q" );
+    sCudaMemcpy( spad, &q[system->n], sizeof(real) * (system->N - system->n),
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
     k_update_ghost_atom_charges <<< blocks, DEF_BLOCK_SIZE >>>
         ( system->d_my_atoms, spad, system->n, system->N );
@@ -501,8 +501,8 @@ static void Calculate_Charges_QEq( reax_system const * const system,
         ( spad, &spad[blocks], blocks );
     cudaCheckError( );
 
-    copy_host_device( &my_sum, &spad[blocks],
-            sizeof(rvec2), cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," );
+    sCudaMemcpy( &my_sum, &spad[blocks],
+            sizeof(rvec2), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 #else
     cuda_check_malloc( &workspace->scratch, &workspace->scratch_size,
             sizeof(real) * 2,
@@ -513,8 +513,8 @@ static void Calculate_Charges_QEq( reax_system const * const system,
     Cuda_Reduction_Sum( workspace->d_workspace->s, &spad[0], system->n );
     Cuda_Reduction_Sum( workspace->d_workspace->t, &spad[1], system->n );
 
-    copy_host_device( my_sum, spad, sizeof(real) * 2,
-            cudaMemcpyDeviceToHost, "Calculate_Charges_QEq::my_sum," );
+    sCudaMemcpy( my_sum, spad, sizeof(real) * 2,
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 #endif
 
     /* global reduction on pseudo-charges for s and t */
diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
index 3fd0b445..4fda724c 100644
--- a/PG-PuReMD/src/cuda/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -32,24 +32,19 @@ extern "C" void Cuda_Copy_Grid_Host_to_Device( grid *host, grid *device )
     ivec_Copy( device->ghost_hbond_span, host->ghost_hbond_span );
     ivec_Copy( device->ghost_bond_span, host->ghost_bond_span );
 
-    copy_host_device( host->str, device->str, sizeof(int) * total,
-            cudaMemcpyHostToDevice,
-            "Cuda_Copy_Grid_Host_to_Device::str" );
-    copy_host_device( host->end, device->end, sizeof(int) * total,
-            cudaMemcpyHostToDevice,
-            "Cuda_Copy_Grid_Host_to_Device::end" );
-    copy_host_device( host->cutoff, device->cutoff, sizeof(real) * total,
-            cudaMemcpyHostToDevice,
-            "Cuda_Copy_Grid_Host_to_Device::cutoff" );
-    copy_host_device( host->nbrs_x, device->nbrs_x, sizeof(ivec) * total
-            * host->max_nbrs, cudaMemcpyHostToDevice,
-            "Cuda_Copy_Grid_Host_to_Device::nbrs_x" );
-    copy_host_device( host->nbrs_cp, device->nbrs_cp, sizeof(rvec) * total
-            * host->max_nbrs, cudaMemcpyHostToDevice,
-            "Cuda_Copy_Grid_Host_to_Device::nbrs_cp" );
-
-    copy_host_device( host->rel_box, device->rel_box, sizeof(ivec) * total,
-            cudaMemcpyHostToDevice, "Cuda_Copy_Grid_Host_to_Device::rel_box" );
+    sCudaMemcpy( device->str, host->str, sizeof(int) * total,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( device->end, host->end, sizeof(int) * total,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( device->cutoff, host->cutoff, sizeof(real) * total,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( device->nbrs_x, host->nbrs_x, sizeof(ivec) * total * host->max_nbrs,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( device->nbrs_cp, host->nbrs_cp, sizeof(rvec) * total * host->max_nbrs,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+
+    sCudaMemcpy( device->rel_box, host->rel_box, sizeof(ivec) * total,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
     device->max_nbrs = host->max_nbrs;
 }
@@ -58,10 +53,8 @@ extern "C" void Cuda_Copy_Grid_Host_to_Device( grid *host, grid *device )
 /* Copy atom info from host to device */
 extern "C" void Cuda_Copy_Atoms_Host_to_Device( reax_system *system )
 {
-    copy_host_device( system->my_atoms, system->d_my_atoms,
-            sizeof(reax_atom) * system->N,
-            cudaMemcpyHostToDevice,
-            "Cuda_Copy_Atoms_Host_to_Device::system->my_atoms" );
+    sCudaMemcpy( system->d_my_atoms, system->my_atoms, sizeof(reax_atom) * system->N,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 }
 
 
@@ -70,32 +63,31 @@ extern "C" void Cuda_Copy_System_Host_to_Device( reax_system *system )
 {
     Cuda_Copy_Atoms_Host_to_Device( system );
 
-    copy_host_device( &system->my_box, system->d_my_box, sizeof(simulation_box),
-            cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->my_box" );
+    sCudaMemcpy( system->d_my_box, &system->my_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
-    copy_host_device( &system->my_ext_box, system->d_my_ext_box,
-            sizeof(simulation_box), cudaMemcpyHostToDevice,
-            "Cuda_Copy_System_Host_to_Device::system->my_ext_box" );
+    sCudaMemcpy( system->d_my_ext_box, &system->my_ext_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
-    copy_host_device( system->reax_param.sbp, system->reax_param.d_sbp,
+    sCudaMemcpy( system->reax_param.d_sbp, system->reax_param.sbp,
             sizeof(single_body_parameters) * system->reax_param.num_atom_types,
-            cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->sbp" );
-    copy_host_device( system->reax_param.tbp, system->reax_param.d_tbp,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( system->reax_param.d_tbp, system->reax_param.tbp,
             sizeof(two_body_parameters) * POW(system->reax_param.num_atom_types, 2),
-            cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->tbp" );
-    copy_host_device( system->reax_param.thbp, system->reax_param.d_thbp,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( system->reax_param.d_thbp, system->reax_param.thbp,
             sizeof(three_body_header) * POW(system->reax_param.num_atom_types, 3),
-            cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->thbh" );
-    copy_host_device( system->reax_param.hbp, system->reax_param.d_hbp,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( system->reax_param.d_hbp, system->reax_param.hbp,
             sizeof(hbond_parameters) * POW(system->reax_param.num_atom_types, 3),
-            cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->hbond" );
-    copy_host_device( system->reax_param.fbp, system->reax_param.d_fbp, 
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+    sCudaMemcpy( system->reax_param.d_fbp, system->reax_param.fbp,
             sizeof(four_body_header) * POW(system->reax_param.num_atom_types, 4),
-            cudaMemcpyHostToDevice, "Cuda_Copy_System_Host_to_Device::system->four_header" );
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
-    copy_host_device( system->reax_param.gp.l, system->reax_param.d_gp.l,
-            sizeof(real) * system->reax_param.gp.n_global, cudaMemcpyHostToDevice,
-            "Cuda_Copy_System_Host_to_Device::system->global_parameters" );
+    sCudaMemcpy( system->reax_param.d_gp.l, system->reax_param.gp.l,
+            sizeof(real) * system->reax_param.gp.n_global,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
     system->reax_param.d_gp.n_global = system->reax_param.gp.n_global; 
     system->reax_param.d_gp.vdw_type = system->reax_param.gp.vdw_type; 
@@ -105,23 +97,23 @@ extern "C" void Cuda_Copy_System_Host_to_Device( reax_system *system )
 /* Copy atom info from device to host */
 extern "C" void Cuda_Copy_Atoms_Device_to_Host( reax_system *system )
 {
-    copy_host_device( system->my_atoms, system->d_my_atoms,
+    sCudaMemcpy( system->my_atoms, system->d_my_atoms,
             sizeof(reax_atom) * system->N,
-            cudaMemcpyDeviceToHost, "Cuda_Copy_Atoms_Device_to_Host::my_atoms" );
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 }
 
 
 /* Copy simulation data from device to host */
 extern "C" void Cuda_Copy_Simulation_Data_Device_to_Host( simulation_data *host, simulation_data *dev )
 {
-    copy_host_device( &host->my_en, &dev->my_en, sizeof(energy_data), 
-            cudaMemcpyDeviceToHost, "simulation_data:energy_data" );
-    copy_host_device( &host->kin_press, &dev->kin_press, sizeof(real), 
-            cudaMemcpyDeviceToHost, "simulation_data:kin_press" );
-    copy_host_device( host->int_press, dev->int_press, sizeof(rvec), 
-            cudaMemcpyDeviceToHost, "simulation_data:int_press" );
-    copy_host_device( host->ext_press, dev->ext_press, sizeof(rvec), 
-            cudaMemcpyDeviceToHost, "simulation_data:ext_press" );
+    sCudaMemcpy( &host->my_en, &dev->my_en, sizeof(energy_data), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( &host->kin_press, &dev->kin_press, sizeof(real), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( host->int_press, dev->int_press, sizeof(rvec), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( host->ext_press, dev->ext_press, sizeof(rvec), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 }
 
 
@@ -146,45 +138,47 @@ extern "C" void Cuda_Copy_List_Device_to_Host( reax_list *host_list, reax_list *
     fprintf( stderr, " [INFO] trying to copy %d list from device to host\n", type );
 #endif
 
-    copy_host_device( host_list->index, device_list->index, sizeof(int) * device_list->n,
-            cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::list->index" );
-    copy_host_device( host_list->end_index, device_list->end_index, sizeof(int) *
-            device_list->n, cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::list->end_index" );
+    sCudaMemcpy( host_list->index, device_list->index,
+            sizeof(int) * device_list->n,
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( host_list->end_index, device_list->end_index,
+            sizeof(int) * device_list->n,
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     switch ( type )
     {   
         case TYP_FAR_NEIGHBOR:
-            copy_host_device( host_list->far_nbr_list.nbr, device_list->far_nbr_list.nbr,
+            sCudaMemcpy( host_list->far_nbr_list.nbr, device_list->far_nbr_list.nbr,
                     sizeof(int) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.nbr" );
-            copy_host_device( host_list->far_nbr_list.rel_box, device_list->far_nbr_list.rel_box,
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+            sCudaMemcpy( host_list->far_nbr_list.rel_box, device_list->far_nbr_list.rel_box,
                     sizeof(ivec) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.rel_box" );
-            copy_host_device( host_list->far_nbr_list.d, device_list->far_nbr_list.d,
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+            sCudaMemcpy( host_list->far_nbr_list.d, device_list->far_nbr_list.d,
                     sizeof(real) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.d" );
-            copy_host_device( host_list->far_nbr_list.dvec, device_list->far_nbr_list.dvec,
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+            sCudaMemcpy( host_list->far_nbr_list.dvec, device_list->far_nbr_list.dvec,
                     sizeof(rvec) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::far_neighbor_list.dvec" );
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
             break;
 
         case TYP_BOND:
-            copy_host_device( host_list->bond_list, device_list->bond_list,
+            sCudaMemcpy( host_list->bond_list, device_list->bond_list,
                     sizeof(bond_data) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::bond_list" );
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
             break;
 
         case TYP_HBOND:
-            copy_host_device( host_list->hbond_list, device_list->hbond_list,
+            sCudaMemcpy( host_list->hbond_list, device_list->hbond_list,
                     sizeof(hbond_data) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::hbond_list" );
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
             break;
 
         case TYP_THREE_BODY:
-            copy_host_device( host_list->three_body_list,
+            sCudaMemcpy( host_list->three_body_list,
                     device_list->three_body_list,
                     sizeof(three_body_interaction_data ) * device_list->max_intrs,
-                    cudaMemcpyDeviceToHost, "Cuda_Copy_List_Device_to_Host::three_body_list" );
+                    cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
             break;
 
         default:
@@ -217,8 +211,8 @@ extern "C" void Cuda_Copy_MPI_Data_Host_to_Device( mpi_datatypes *mpi_data )
     {
         /* index is set during SendRecv and reused during MPI comms afterward,
          * so copy to device while SendRecv is still done on the host */
-        copy_host_device( mpi_data->out_buffers[i].index, mpi_data->d_out_buffers[i].index,
-                mpi_data->d_out_buffers[i].index_size, cudaMemcpyHostToDevice,
-                "Cuda_Copy_MPI_Data_Host_to_Device::mpi_data->d_in1_buffer" );
+        sCudaMemcpy( mpi_data->d_out_buffers[i].index, mpi_data->out_buffers[i].index,
+                mpi_data->d_out_buffers[i].index_size,
+                cudaMemcpyHostToDevice, __FILE__, __LINE__ );
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
index 2c565e1b..306ccec6 100644
--- a/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_dense_lin_alg.cu
@@ -600,8 +600,8 @@ real Dot( storage * const workspace,
 //    ret = MPI_Allreduce( &spad[k], &sum, 1, MPI_DOUBLE, MPI_SUM, comm );
 //    Check_MPI_Error( ret, __FILE__, __LINE__ );
 //#else
-    copy_host_device( &temp, &spad[k], sizeof(real),
-            cudaMemcpyDeviceToHost, "Dot::temp" );
+    sCudaMemcpy( &temp, &spad[k], sizeof(real),
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     ret = MPI_Allreduce( &temp, &sum, 1, MPI_DOUBLE, MPI_SUM, comm );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
@@ -636,8 +636,8 @@ real Dot_local( storage * const workspace,
     Cuda_Reduction_Sum( spad, &spad[k], k );
 
     //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI
-    copy_host_device( &sum, &spad[k], sizeof(real),
-            cudaMemcpyDeviceToHost, "Dot_local::sum" );
+    sCudaMemcpy( &sum, &spad[k], sizeof(real),
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     return sum;
 }
@@ -683,8 +683,8 @@ void Dot_local_rvec2( control_params const * const control,
     cudaCheckError( );
 
     //TODO: keep result of reduction on devie and pass directly to CUDA-aware MPI
-    copy_host_device( &sum, &spad[k + blocks], sizeof(rvec2),
-            cudaMemcpyDeviceToHost, "Dot_local_rvec2::sum" );
+    sCudaMemcpy( &sum, &spad[k + blocks], sizeof(rvec2),
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     *sum1 = sum[0];
     *sum2 = sum[1];
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index cfdeef7c..ed76e02c 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1763,8 +1763,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 
         Cuda_Reduction_Sum( system->d_max_cm_entries,
                 system->d_total_cm_entries, workspace->d_workspace->H.n_max );
-        copy_host_device( &system->total_cm_entries, system->d_total_cm_entries, sizeof(int),
-                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_cm_entries" );
+        sCudaMemcpy( &system->total_cm_entries, system->d_total_cm_entries,
+                sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
     }
 
     if ( realloc_bonds == TRUE )
@@ -1782,8 +1782,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 
         Cuda_Reduction_Sum( system->d_max_bonds, system->d_total_bonds,
                 system->total_cap );
-        copy_host_device( &system->total_bonds, system->d_total_bonds, sizeof(int), 
-                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_bonds" );
+        sCudaMemcpy( &system->total_bonds, system->d_total_bonds, sizeof(int), 
+                cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
     }
 
     if ( system->numH > 0 && control->hbond_cut > 0.0 && realloc_hbonds == TRUE )
@@ -1801,8 +1801,8 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 
         Cuda_Reduction_Sum( system->d_max_hbonds, system->d_total_hbonds,
                 system->total_cap );
-        copy_host_device( &system->total_hbonds, system->d_total_hbonds, sizeof(int), 
-                cudaMemcpyDeviceToHost, "Cuda_Estimate_Storages::d_total_hbonds" );
+        sCudaMemcpy( &system->total_hbonds, system->d_total_hbonds, sizeof(int), 
+                cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
     }
     else if ( step == 0 && (system->numH == 0 || control->hbond_cut <= 0.0) )
     {
@@ -2002,12 +2002,12 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
 #endif
 
     /* check reallocation flags on device */
-    copy_host_device( &realloc_cm, system->d_realloc_cm_entries, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_cm_entries" );
-    copy_host_device( &realloc_bonds, system->d_realloc_bonds, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
-    copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
+    sCudaMemcpy( &realloc_cm, system->d_realloc_cm_entries, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( &realloc_bonds, system->d_realloc_bonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
 #if defined(LOG_PERFORMANCE)
     if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
@@ -2211,10 +2211,10 @@ int Cuda_Init_Forces_No_Charges( reax_system *system, control_params *control,
 #endif
 
     /* check reallocation flags on device */
-    copy_host_device( &realloc_bonds, system->d_realloc_bonds, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_bonds" );
-    copy_host_device( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Init_Forces::d_realloc_hbonds" );
+    sCudaMemcpy( &realloc_bonds, system->d_realloc_bonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
+    sCudaMemcpy( &realloc_hbonds, system->d_realloc_hbonds, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
 #if defined(LOG_PERFORMANCE)
     if ( cudaEventQuery( time_event[0] ) != cudaSuccess ) 
@@ -2356,13 +2356,13 @@ static void Cuda_Compute_Total_Force( reax_system *system, control_params *contr
      * based on the neighbors information each processor has had.
      * final values of force on each atom needs to be computed by adding up
      * all partially-final pieces */
-    copy_host_device( f, workspace->d_workspace->f, sizeof(rvec) * system->N ,
-            cudaMemcpyDeviceToHost, "Cuda_Compute_Total_Force::workspace->d_workspace->f" );
+    sCudaMemcpy( f, workspace->d_workspace->f, sizeof(rvec) * system->N ,
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     Coll( system, mpi_data, f, RVEC_PTR_TYPE, mpi_data->mpi_rvec );
 
-    copy_host_device( f, workspace->d_workspace->f, sizeof(rvec) * system->N,
-            cudaMemcpyHostToDevice, "Cuda_Compute_Total_Force::workspace->d_workspace->f" );
+    sCudaMemcpy( workspace->d_workspace->f, f, sizeof(rvec) * system->N,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
     Cuda_Total_Forces_Part2( system, workspace );
 }
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index 9a4a50e6..acd257f5 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -237,7 +237,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Part2( reax_system *system, storage *worksp
 
 
 real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *workspace,
-       real dt, real v_xi_old, real * d_my_ekin, real * d_total_my_ekin )
+       real dt, real v_xi_old, real *d_my_ekin, real *d_total_my_ekin )
 {
     int blocks;
     real my_ekin;
@@ -252,9 +252,8 @@ real Velocity_Verlet_Nose_Hoover_NVT_Part3( reax_system *system, storage *worksp
 
     Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n );
 
-    copy_host_device( &my_ekin, d_total_my_ekin, sizeof(real), 
-            cudaMemcpyDeviceToHost,
-            "Velocity_Verlet_Nose_Hoover_NVT_Part3::d_total_my_ekin" );
+    sCudaMemcpy( &my_ekin, d_total_my_ekin, sizeof(real), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     return my_ekin;
 }
diff --git a/PG-PuReMD/src/cuda/cuda_lookup.cu b/PG-PuReMD/src/cuda/cuda_lookup.cu
index c65046ff..200699e2 100644
--- a/PG-PuReMD/src/cuda/cuda_lookup.cu
+++ b/PG-PuReMD/src/cuda/cuda_lookup.cu
@@ -19,7 +19,7 @@ extern "C" void Cuda_Copy_LR_Lookup_Table_Host_to_Device( reax_system *system,
     fprintf( stderr, "Copying the LR Lookyp Table to the device ... \n" );
 
     cuda_malloc( (void **) &workspace->d_LR,
-            sizeof(LR_lookup_table) * ( num_atom_types * num_atom_types ),
+            sizeof(LR_lookup_table) * num_atom_types * num_atom_types,
             FALSE, "LR_lookup:table" );
 
     /*
@@ -30,9 +30,9 @@ extern "C" void Cuda_Copy_LR_Lookup_Table_Host_to_Device( reax_system *system,
        existing_types[ system->atoms[i].type ] = 1;
      */
 
-    copy_host_device( workspace->LR, workspace->d_LR,
+    sCudaMemcpy( workspace->d_LR, workspace->LR,
             sizeof(LR_lookup_table) * (num_atom_types * num_atom_types), 
-            cudaMemcpyHostToDevice, "LR_lookup:table" );
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 
     for( i = 0; i < num_atom_types; ++i )
     {
@@ -42,47 +42,61 @@ extern "C" void Cuda_Copy_LR_Lookup_Table_Host_to_Device( reax_system *system,
             {
                 if ( aggregated[j] )
                 {
-                    cuda_malloc( (void **) &d_y,
-                            sizeof(LR_data) * (control->tabulate + 1), FALSE, "LR_lookup:d_y" );
-                    copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].y, d_y, 
-                            sizeof(LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y" );
-                    copy_host_device ( &d_y, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].y, 
-                            sizeof(LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y" );
-
-                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:h" );
-                    copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].H, temp, 
-                            sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h" );
-                    copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].H, 
-                            sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h" );
-
-                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:vdW" );
-                    copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].vdW, temp, 
-                            sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW" );
-                    copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].vdW,
-                            sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW" );
-
-                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:CEvd" );
-                    copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].CEvd, temp, 
-                            sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd" );
-                    copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEvd, 
-                            sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd");
-
-                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:ele" );
-                    copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].ele, temp,
-                            sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele" );
-                    copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].ele,
-                            sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele" );
-
-                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1), FALSE, "LR_lookup:ceclmb" );
-                    copy_host_device( workspace->LR[ index_lr(i, j, num_atom_types) ].CEclmb, temp,
-                            sizeof(cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb" );
-                    copy_host_device( &temp, &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEclmb,
-                            sizeof(cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb" );
+                    cuda_malloc( (void **) &d_y, sizeof(LR_data) * (control->tabulate + 1),
+                            FALSE, "LR_lookup:d_y" );
+                    sCudaMemcpy( d_y, workspace->LR[ index_lr(i, j, num_atom_types) ].y,
+                            sizeof(LR_data) * (control->tabulate + 1),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+                    sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].y, &d_y,
+                            sizeof(LR_data *),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+
+                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            FALSE, "LR_lookup:h" );
+                    sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].H,
+                            sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+                    sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].H, &temp,
+                            sizeof(cubic_spline_coef *),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+
+                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            FALSE, "LR_lookup:vdW" );
+                    sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].vdW,
+                            sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+                    sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].vdW, &temp,
+                            sizeof(cubic_spline_coef *),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+
+                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            FALSE, "LR_lookup:CEvd" );
+                    sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].CEvd,
+                            sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+                    sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEvd, &temp,
+                            sizeof(cubic_spline_coef *),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+
+                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            FALSE, "LR_lookup:ele" );
+                    sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].ele,
+                            sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+                    sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].ele, &temp,
+                            sizeof(cubic_spline_coef *),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+
+                    cuda_malloc( (void **) &temp, sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            FALSE, "LR_lookup:ceclmb" );
+                    sCudaMemcpy( temp, workspace->LR[ index_lr(i, j, num_atom_types) ].CEclmb,
+                            sizeof(cubic_spline_coef) * (control->tabulate + 1),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
+                    sCudaMemcpy( &workspace->d_LR[ index_lr(i, j, num_atom_types) ].CEclmb, &temp,
+                            sizeof(cubic_spline_coef *),
+                            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
                 }
             }
         }
     }
-
-    fprintf( stderr, "Copy of the LR Lookup Table to the device complete ... \n" );
 }
-
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index 1ef460f3..bba3f788 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -535,8 +535,8 @@ extern "C" int Cuda_Generate_Neighbor_Lists( reax_system *system,
 //    cudaCheckError( );
 
     /* check reallocation flag on device */
-    copy_host_device( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Generate_Neighbor_Lists::d_realloc_far_nbrs" );
+    sCudaMemcpy( &ret_far_nbr, system->d_realloc_far_nbrs, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     ret = (ret_far_nbr == FALSE) ? SUCCESS : FAILURE;
     workspace->d_workspace->realloc.far_nbrs = ret_far_nbr;
@@ -591,8 +591,8 @@ void Cuda_Estimate_Num_Neighbors( reax_system *system, simulation_data *data )
 
     Cuda_Reduction_Sum( system->d_max_far_nbrs, system->d_total_far_nbrs,
             system->total_cap );
-    copy_host_device( &system->total_far_nbrs, system->d_total_far_nbrs, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Estimate_Neighbors::d_total_far_nbrs" );
+    sCudaMemcpy( &system->total_far_nbrs, system->d_total_far_nbrs, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
 #if defined(LOG_PERFORMANCE)
     cudaEventRecord( time_event[1] );
diff --git a/PG-PuReMD/src/cuda/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
index 532d643c..a74c18b4 100644
--- a/PG-PuReMD/src/cuda/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -94,8 +94,8 @@ void Cuda_Reset_Atoms_HBond_Indices( reax_system* system, control_params *contro
     Cuda_Reduction_Sum( hindex, system->d_numH, system->N );
 #endif
 
-    copy_host_device( &system->numH, system->d_numH, sizeof(int), 
-            cudaMemcpyDeviceToHost, "Cuda_Reset_Atoms_HBond_Indices::d_numH" );
+    sCudaMemcpy( &system->numH, system->d_numH, sizeof(int), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 }
 
 
diff --git a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
index a3a485a5..d638c044 100644
--- a/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_spar_lin_alg.cu
@@ -601,14 +601,14 @@ static void Dual_Sparse_MatVec_Comm_Part1( const reax_system * const system,
             "Dual_Sparse_MatVec_Comm_Part1::workspace->host_scratch" );
     spad = (rvec2 *) workspace->host_scratch;
 
-    copy_host_device( spad, (void *) x, sizeof(rvec2) * n,
-            cudaMemcpyDeviceToHost, "Dual_Sparse_MatVec_Comm_Part1::x (d-to-h)" );
+    sCudaMemcpy( spad, (void *) x, sizeof(rvec2) * n,
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */
     Dist( system, mpi_data, spad, buf_type, mpi_type );
 
-    copy_host_device( spad, (void *) x, sizeof(rvec2) * n,
-            cudaMemcpyHostToDevice, "Dual_Sparse_MatVec_Comm_Part1::x (h-to-d)" );
+    sCudaMemcpy( (void *) x, spad, sizeof(rvec2) * n,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 #endif
 }
 
@@ -696,13 +696,13 @@ static void Dual_Sparse_MatVec_Comm_Part2( const reax_system * const system,
                 sizeof(rvec2) * n1, TRUE, SAFE_ZONE,
                 "Dual_Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (rvec2 *) workspace->host_scratch;
-        copy_host_device( spad, b, sizeof(rvec2) * n1,
-                cudaMemcpyDeviceToHost, "Dual_Sparse_MatVec_Comm_Part2::b" );
+        sCudaMemcpy( spad, b, sizeof(rvec2) * n1,
+                cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
-        copy_host_device( spad, b, sizeof(rvec2) * n2,
-                cudaMemcpyHostToDevice, "Dual_Sparse_MatVec_Comm_Part2::b" );
+        sCudaMemcpy( b, spad, sizeof(rvec2) * n2,
+                cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 #endif
     }
 }
@@ -778,14 +778,14 @@ static void Sparse_MatVec_Comm_Part1( const reax_system * const system,
             sizeof(real) * n, TRUE, SAFE_ZONE,
             "Sparse_MatVec_Comm_Part1::workspace->host_scratch" );
     spad = (real *) workspace->host_scratch;
-    copy_host_device( spad, (void *) x, sizeof(real) * n,
-            cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part1::x" );
+    sCudaMemcpy( spad, (void *) x, sizeof(real) * n,
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     /* exploit 3D domain decomposition of simulation space with 3-stage communication pattern */
     Dist( system, mpi_data, spad, buf_type, mpi_type );
 
-    copy_host_device( spad, (void *) x, sizeof(real) * n,
-            cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part1::x" );
+    sCudaMemcpy( (void *) x, spad, sizeof(real) * n,
+            cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 #endif
 }
 
@@ -871,13 +871,13 @@ static void Sparse_MatVec_Comm_Part2( const reax_system * const system,
                 sizeof(real) * n1, TRUE, SAFE_ZONE,
                 "Sparse_MatVec_Comm_Part2::workspace->host_scratch" );
         spad = (real *) workspace->host_scratch;
-        copy_host_device( spad, b, sizeof(real) * n1,
-                cudaMemcpyDeviceToHost, "Sparse_MatVec_Comm_Part2::b" );
+        sCudaMemcpy( spad, b, sizeof(real) * n1,
+                cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
         Coll( system, mpi_data, spad, buf_type, mpi_type );
 
-        copy_host_device( spad, b, sizeof(real) * n2,
-                cudaMemcpyHostToDevice, "Sparse_MatVec_Comm_Part2::b" );
+        sCudaMemcpy( b, spad, sizeof(real) * n2,
+                cudaMemcpyHostToDevice, __FILE__, __LINE__ );
 #endif
     }
 }
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index 4e819dd1..ee10c7ca 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -539,8 +539,8 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
             ( spad, &spad[control->blocks], control->blocks );
     cudaCheckError( );
 
-    copy_host_device( xcm, &spad[control->blocks], sizeof(rvec),
-            cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::xcm" );
+    sCudaMemcpy( xcm, &spad[control->blocks], sizeof(rvec),
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
     
     // vcm
     cuda_memset( spad, 0, sizeof(rvec) * (control->blocks + 1),
@@ -556,8 +556,8 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
         ( spad, &spad[control->blocks], control->blocks );
     cudaCheckError( );
 
-    copy_host_device( vcm, &spad[control->blocks], sizeof(rvec),
-        cudaMemcpyDeviceToHost, "Cuda_Compute_Momentum::vcm" );
+    sCudaMemcpy( vcm, &spad[control->blocks], sizeof(rvec),
+        cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
     
     // amcm
     cuda_memset( spad, 0,  sizeof(rvec) * (control->blocks + 1),
@@ -573,8 +573,8 @@ static void Cuda_Compute_Momentum( reax_system *system, control_params *control,
         ( spad, &spad[control->blocks], control->blocks );
     cudaCheckError( );
 
-    copy_host_device( amcm, &spad[control->blocks], sizeof(rvec),
-        cudaMemcpyDeviceToHost,"Cuda_Compute_Momentum::amcm" );
+    sCudaMemcpy( amcm, &spad[control->blocks], sizeof(rvec),
+        cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 }
 
 
@@ -614,9 +614,9 @@ static void Cuda_Compute_Inertial_Tensor( reax_system *system, control_params *c
         ( spad, &spad[6 * control->blocks], control->blocks );
     cudaCheckError( );
 
-    copy_host_device( t, &spad[6 * control->blocks],
+    sCudaMemcpy( t, &spad[6 * control->blocks],
         sizeof(real) * 6, cudaMemcpyDeviceToHost,
-        "Cuda_Compute_Intertial_Tensor::t" );
+        __FILE__, __LINE__ );
 }
 
 
@@ -687,9 +687,8 @@ extern "C" void Cuda_Compute_Kinetic_Energy( reax_system *system,
      * and this call finishes the global reduction across all blocks */
     Cuda_Reduction_Sum( kinetic_energy, &kinetic_energy[system->n], system->n );
 
-    copy_host_device( &data->my_en.e_kin, &kinetic_energy[system->n],
-            sizeof(real), cudaMemcpyDeviceToHost,
-            "Cuda_Compute_Kinetic_Energy::tmp" );
+    sCudaMemcpy( &data->my_en.e_kin, &kinetic_energy[system->n],
+            sizeof(real), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     ret = MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin,
             1, MPI_DOUBLE, MPI_SUM, comm );
@@ -722,8 +721,8 @@ void Cuda_Compute_Total_Mass( reax_system *system, control_params *control,
 
     Cuda_Reduction_Sum( spad, &spad[system->n], system->n );
 
-    copy_host_device( &my_M, &spad[system->n], sizeof(real), 
-            cudaMemcpyDeviceToHost, "total_mass::my_M" );
+    sCudaMemcpy( &my_M, &spad[system->n], sizeof(real), 
+            cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     ret = MPI_Allreduce( &my_M, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
     Check_MPI_Error( ret, __FILE__, __LINE__ );
@@ -877,9 +876,8 @@ void Cuda_Compute_Pressure( reax_system* system, control_params *control,
               control->blocks );
         cudaCheckError( );
 
-        copy_host_device( &int_press, &rvec_spad[system->n + control->blocks],
-                sizeof(rvec), cudaMemcpyDeviceToHost,
-                "Cuda_Compute_Pressure::int_press" );
+        sCudaMemcpy( &int_press, &rvec_spad[system->n + control->blocks],
+                sizeof(rvec), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
     }
 
     /* sum up internal and external pressure */
diff --git a/PG-PuReMD/src/cuda/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu
index a3dbe4cb..952ab10b 100644
--- a/PG-PuReMD/src/cuda/cuda_utils.cu
+++ b/PG-PuReMD/src/cuda/cuda_utils.cu
@@ -109,42 +109,35 @@ void cuda_check_malloc( void **ptr, size_t *cur_size, size_t new_size, const cha
 }
 
 
-void copy_host_device( void *host, void *dev, size_t size,
-        cudaMemcpyKind dir, const char *msg )
+/* Safe wrapper around cudaMemcpy
+ *
+ * dest: address to be copied to
+ * src: address to be copied from
+ * size: num. bytes to copy
+ * dir: CUDA enum specifying address types for dest and src
+ * filename: NULL-terminated source filename where function call originated
+ * line: line of source file where function call originated
+ */
+void sCudaMemcpy( void * const dest, void const * const src, size_t size,
+        cudaMemcpyKind dir, const char * const filename, int line )
 {
-    cudaError_t retVal = cudaErrorNotReady;
+    int rank;
+    cudaError_t ret;
 
-    if ( dir == cudaMemcpyHostToDevice )
-    {
-        retVal = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice );
-    }
-    else
-    {
-        retVal = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost );
-    }
+    ret = cudaMemcpy( dest, src, size, dir );
 
-    if ( retVal != cudaSuccess )
+    if ( ret != cudaSuccess )
     {
-        fprintf( stderr,
-                "[ERROR] could not copy resource %s from host to device\n    [INFO] CUDA API error code: %d\n",
-                msg, retVal );
-        exit( INSUFFICIENT_MEMORY );
-    }
-}
+        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+        const char *str = cudaGetErrorString( ret );
 
+        fprintf( stderr, "[ERROR] CUDA error: memory copy failure\n" );
+        fprintf( stderr, "  [INFO] At line %d in file %.*s on MPI processor %d\n",
+                line, (int) strlen(filename), filename, rank );
+        fprintf( stderr, "  [INFO] Error code: %d\n", ret );
+        fprintf( stderr, "  [INFO] Error message: %.*s\n", (int) strlen(str), str );
 
-void copy_device( void *dest, void *src, size_t size, const char *msg )
-{
-    cudaError_t retVal;
-
-    retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
-
-    if ( retVal != cudaSuccess )
-    {
-        fprintf( stderr,
-                "[ERROR] could not copy resource %s from device to device\n    [INFO] CUDA API error code: %d\n",
-                msg, retVal );
-        exit( INSUFFICIENT_MEMORY );
+        MPI_Abort( MPI_COMM_WORLD, RUNTIME_ERROR );
     }
 }
 
diff --git a/PG-PuReMD/src/cuda/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
index 6a3a6efd..ac4288b1 100644
--- a/PG-PuReMD/src/cuda/cuda_utils.h
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -12,9 +12,8 @@ void cuda_memset( void *, int , size_t , const char * );
 
 void cuda_check_malloc( void **, size_t *, size_t, const char * );
 
-void copy_host_device( void *, void *, size_t, enum cudaMemcpyKind, const char * );
-
-void copy_device( void *, void *, size_t, const char * );
+void sCudaMemcpy( void * const, void const * const, size_t,
+        enum cudaMemcpyKind, const char * const, int );
 
 void Cuda_Print_Mem_Usage( );
 
diff --git a/PG-PuReMD/src/cuda/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
index ecab2563..989b26bc 100644
--- a/PG-PuReMD/src/cuda/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -1365,8 +1365,8 @@ static int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params
 
     Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );
 
-    copy_host_device( &system->total_thbodies, system->d_total_thbodies, sizeof(int),
-            cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" );
+    sCudaMemcpy( &system->total_thbodies, system->d_total_thbodies,
+            sizeof(int), cudaMemcpyDeviceToHost, __FILE__, __LINE__ );
 
     if ( data->step - data->prev_steps == 0 )
     {
-- 
GitLab