From b73145d43b3d86222a3e9ed84bf1c195d8177057 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Tue, 11 Jul 2017 13:29:09 -0400
Subject: [PATCH] PG-PuReMD: fix nvcc build issue where incorrect C++ compiler
 was utilized. Fix race condition with three body list computation. Remove
 debug code. Other misc. refactoring.

---
 PG-PuReMD/aclocal.m4                 |   1 +
 PG-PuReMD/configure.ac               |  25 ++++--
 PG-PuReMD/src/cuda_allocate.cu       |  69 +++++++++++----
 PG-PuReMD/src/cuda_forces.cu         |  80 ++++++++++++------
 PG-PuReMD/src/cuda_forces.h          |   2 +
 PG-PuReMD/src/cuda_hydrogen_bonds.cu |  32 +++++--
 PG-PuReMD/src/cuda_lin_alg.cu        |   7 +-
 PG-PuReMD/src/cuda_neighbors.cu      |  41 ++++-----
 PG-PuReMD/src/cuda_reduction.cu      |  16 ++--
 PG-PuReMD/src/cuda_valence_angles.cu | 122 ++++++++++-----------------
 PG-PuReMD/src/forces.c               |   9 +-
 PG-PuReMD/src/init_md.c              |  23 +----
 PG-PuReMD/src/integrate.c            |  12 ---
 PG-PuReMD/src/io_tools.c             |   2 +-
 PG-PuReMD/src/io_tools.h             |   3 +
 PG-PuReMD/src/parallelreax.c         |  12 ++-
 16 files changed, 246 insertions(+), 210 deletions(-)

diff --git a/PG-PuReMD/aclocal.m4 b/PG-PuReMD/aclocal.m4
index 1e5be5b9..06c98b55 100644
--- a/PG-PuReMD/aclocal.m4
+++ b/PG-PuReMD/aclocal.m4
@@ -1151,4 +1151,5 @@ AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
 m4_include([../m4/acx_mpi.m4])
+m4_include([../m4/ax_compiler_vendor.m4])
 m4_include([../m4/ax_cuda.m4])
diff --git a/PG-PuReMD/configure.ac b/PG-PuReMD/configure.ac
index d4cb0d4d..6fc8c567 100644
--- a/PG-PuReMD/configure.ac
+++ b/PG-PuReMD/configure.ac
@@ -13,21 +13,23 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])], [AC_SUBST([AM_DEFAULT_VERB
 
 AC_CONFIG_MACRO_DIR([../m4])
 
-AC_LANG([C])
-
-AC_CONFIG_SRCDIR([src/torsion_angles.h])
-AC_CONFIG_HEADERS([src/config.h])
-
 # Headline formatter
 AC_DEFUN([CONFIGURE_HEADLINE],
 [
         echo; echo "+++ $1 +++"
 ])
 
+AC_LANG([C])
+
 # Checks for programs.
 AC_PROG_CC([icc gcc cc])
 AC_PROG_CPP
 
+AX_COMPILER_VENDOR
+
+AC_CONFIG_SRCDIR([src/torsion_angles.h])
+AC_CONFIG_HEADERS([src/config.h])
+
 # Checks for libraries.
 AC_SEARCH_LIBS([exp], [m])
 AC_SEARCH_LIBS([sqrt], [m])
@@ -111,11 +113,24 @@ fi
 AC_SUBST(MPI_CFLAGS)
 AC_SUBST(MPI_LIBS)
 
+AC_LANG([C++])
+
+# Checks for programs.
+AC_PROG_CXX([icpc g++ c++])
+AC_PROG_CXXCPP
+
+AX_COMPILER_VENDOR
+
 # Check for CUDA support.
 if test "x$BUILD_GPU" = "xyes"; then
 	CONFIGURE_HEADLINE([ CUDA support ])
 	AX_CUDA
+
         NVCCFLAGS=
+	if test "$ax_cv_cxx_compiler_vendor" = "intel"
+	then
+		NVCCFLAGS+=" -ccbin=icpc"
+	fi
 	if test "BUILD_DEBUG" = "true"
 	then
 		NVCCFLAGS+=" -g -G"
diff --git a/PG-PuReMD/src/cuda_allocate.cu b/PG-PuReMD/src/cuda_allocate.cu
index dcf99114..8d2404f4 100644
--- a/PG-PuReMD/src/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda_allocate.cu
@@ -66,13 +66,19 @@ void dev_alloc_grid( reax_system *system )
     ivec_Copy( device->ghost_hbond_span, host->ghost_hbond_span );
     ivec_Copy( device->ghost_bond_span, host->ghost_bond_span );
 
-    cuda_malloc( (void **) &device->str, sizeof(int) * total, TRUE, "grid:str" );
-    cuda_malloc( (void **) &device->end, sizeof(int) * total, TRUE, "grid:end" );
-    cuda_malloc( (void **) &device->cutoff, sizeof(real) * total, TRUE, "grid:cutoff" );
-
-    cuda_malloc( (void **) &device->nbrs_x, sizeof(ivec) * total * host->max_nbrs, TRUE, "grid:nbrs_x" );
-    cuda_malloc( (void **) &device->nbrs_cp, sizeof(rvec) * total * host->max_nbrs, TRUE, "grid:nbrs_cp" );
-    cuda_malloc( (void **) &device->rel_box, sizeof(ivec) * total, TRUE, "grid:rel_box" );
+    cuda_malloc( (void **) &device->str, sizeof(int) * total, TRUE,
+            "dev_alloc_grid::grid->str" );
+    cuda_malloc( (void **) &device->end, sizeof(int) * total, TRUE,
+            "dev_alloc_grid::grid->end" );
+    cuda_malloc( (void **) &device->cutoff, sizeof(real) * total, TRUE,
+            "dev_alloc_grid::grid->cutoff" );
+
+    cuda_malloc( (void **) &device->nbrs_x, sizeof(ivec) * total * host->max_nbrs,
+            TRUE, "dev_alloc_grid::grid->nbrs_x" );
+    cuda_malloc( (void **) &device->nbrs_cp, sizeof(rvec) * total * host->max_nbrs,
+            TRUE, "dev_alloc_grid::grid->nbrs_cp" );
+    cuda_malloc( (void **) &device->rel_box, sizeof(ivec) * total,
+            TRUE, "dev_alloc_grid::grid->rel_box" );
 
     /*
        int block_size = 512;
@@ -129,8 +135,10 @@ void dev_dealloc_grid_cell_atoms( reax_system *system )
     for (int i = 0; i < total; i++)
     {
         copy_host_device( &local_cell, &device->cells[i], 
-                sizeof(grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc" );
-        cuda_free( local_cell.atoms, "grid_cell:atoms" );
+                sizeof(grid_cell), cudaMemcpyDeviceToHost,
+                "dev_dealloc_grid_cell_atoms::grid" );
+        cuda_free( local_cell.atoms,
+                "dev_dealloc_grid_cell_atoms::grid_cell.atoms" );
     }
 }
 
@@ -239,31 +247,63 @@ void dev_alloc_system( reax_system *system )
 
 void dev_realloc_system( reax_system *system, int local_cap, int total_cap, char *msg )
 {
+    int *temp;
+
+    temp = (int *) scratch;
+
     /* free the existing storage for atoms, leave other info allocated */
     cuda_free( system->d_my_atoms, "system::d_my_atoms" );
     cuda_malloc( (void **) &system->d_my_atoms, sizeof(reax_atom) * total_cap, 
             TRUE, "system::d_my_atoms" );
 
+    //TODO: record old total_cap before increase, use here
+    copy_device( temp, system->d_far_nbrs, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
     cuda_free( system->d_far_nbrs, "system::d_far_nbrs" );
     cuda_malloc( (void **) &system->d_far_nbrs,
             system->total_cap * sizeof(int), TRUE, "system::d_far_nbrs" );
+    copy_device( system->d_far_nbrs, temp, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_far_nbrs, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
     cuda_free( system->d_max_far_nbrs, "system::d_max_far_nbrs" );
     cuda_malloc( (void **) &system->d_max_far_nbrs,
             system->total_cap * sizeof(int), TRUE, "system::d_max_far_nbrs" );
+    copy_device( system->d_max_far_nbrs, temp, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
 
+    copy_device( temp, system->d_bonds, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
     cuda_free( system->d_bonds, "system::d_bonds" );
     cuda_malloc( (void **) &system->d_bonds,
             system->total_cap * sizeof(int), TRUE, "system::d_bonds" );
+    copy_device( system->d_bonds, temp, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_bonds, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
     cuda_free( system->d_max_bonds, "system::d_max_bonds" );
     cuda_malloc( (void **) &system->d_max_bonds,
             system->total_cap * sizeof(int), TRUE, "system::d_max_bonds" );
+    copy_device( system->d_max_bonds, temp, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
 
+    copy_device( temp, system->d_hbonds, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
     cuda_free( system->d_hbonds, "system::d_hbonds" );
     cuda_malloc( (void **) &system->d_hbonds,
             system->total_cap * sizeof(int), TRUE, "system::d_hbonds" );
+    copy_device( system->d_hbonds, temp, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
+
+    copy_device( temp, system->d_max_hbonds, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
     cuda_free( system->d_max_hbonds, "system::d_max_hbonds" );
     cuda_malloc( (void **) &system->d_max_hbonds,
             system->total_cap * sizeof(int), TRUE, "system::d_max_hbonds" );
+    copy_device( system->d_max_hbonds, temp, system->total_cap * sizeof(int),
+            "dev_realloc_system::temp" );
 }
 
 
@@ -626,7 +666,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
             realloc->far_nbrs = FALSE;
         }
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::far nbrs]\n" );
 
     /* charge coef matrix */
     //if( nflag || realloc->Htop >= system->max_sparse_entries * DANGER_ZONE ) {
@@ -673,7 +712,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
         //workspace->U = NULL;
         realloc->Htop = 0;
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::charge matrix\n" );
 
     /* hydrogen bonds list */
     if ( control->hbond_cut > 0.0 && system->numH > 0 )
@@ -692,7 +730,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
             realloc->hbonds = FALSE;
         }
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::hbonds\n" );
 
     /* bonds list */
     if ( Nflag == TRUE || realloc->bonds == TRUE )
@@ -707,7 +744,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
         Cuda_Init_Bond_Indices( system );
         realloc->bonds = FALSE;
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::bonds\n" );
 
     /* 3-body list */
     if ( Nflag == TRUE || realloc->num_3body > 0 )
@@ -723,7 +759,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
                 (*dev_lists + BONDS)->num_intrs, system->total_thbodies );
         realloc->num_3body = -1;
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::thbody\n" );
 
     /* grid */
     if ( renbr && realloc->gcell_atoms > -1 )
@@ -752,11 +787,10 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
         //MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
 
         //FIX - 1 - Tested the reallocation logic
-        //dev_dealloc_grid_cell_atoms (system);
-        //dev_alloc_grid_cell_atoms (system, realloc->gcell_atoms);
+        //dev_dealloc_grid_cell_atoms( system );
+        //dev_alloc_grid_cell_atoms( system, realloc->gcell_atoms );
         realloc->gcell_atoms = -1;
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::grid\n" );
 
     /* mpi buffers */
     // we have to be at a renbring step -
@@ -829,7 +863,6 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
         Deallocate_MPI_Buffers( mpi_data );
         Allocate_MPI_Buffers( mpi_data, system->est_recv, system->my_nbrs, msg );
     }
-    fprintf( stderr, "    [CUDA_REALLOCATE::MPI buffers\n" );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d: reallocate done\n",
diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu
index 67009b7c..a840cbe5 100644
--- a/PG-PuReMD/src/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda_forces.cu
@@ -342,11 +342,10 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr
         int step, reax_list **lists, int *thbody )
 {
     int ret;
-    void *d_temp_storage = NULL;
-    size_t temp_storage_bytes = 0;
 
     ret = SUCCESS;
-    cuda_memset( thbody, 0, (*dev_lists + BONDS)->num_intrs * sizeof(int), "scratch::thbody" );
+//    cuda_memset( thbody, 0, (*dev_lists + BONDS)->num_intrs * sizeof(int), "scratch::thbody" );
+    cuda_memset( thbody, 0, system->total_bonds * sizeof(int), "scratch::thbody" );
 
     Estimate_Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
         ( system->d_my_atoms, (control_params *)control->d_control_params, 
@@ -354,7 +353,8 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, (*dev_lists + BONDS)->num_intrs );
+//    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, (*dev_lists + BONDS)->num_intrs );
+    Cuda_Reduction_Sum( thbody, system->d_total_thbodies, system->total_bonds );
 
     copy_host_device( &(system->total_thbodies), system->d_total_thbodies, sizeof(int),
             cudaMemcpyDeviceToHost, "Cuda_Estimate_Storage_Three_Body::d_total_thbodies" );
@@ -362,12 +362,16 @@ int Cuda_Estimate_Storage_Three_Body( reax_system *system, control_params *contr
     if ( step == 0 )
     {
         /* create Three-body list */
-        Dev_Make_List( (*dev_lists + BONDS)->num_intrs, system->total_thbodies,
+//        Dev_Make_List( (*dev_lists + BONDS)->num_intrs, system->total_thbodies,
+//                TYP_THREE_BODY, *dev_lists + THREE_BODIES );
+        Dev_Make_List( system->total_bonds, system->total_thbodies,
                 TYP_THREE_BODY, *dev_lists + THREE_BODIES );
     }
 
+//    if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs ||
+//            (*dev_lists + THREE_BODIES)->n < (*dev_lists + BONDS)->num_intrs )
     if ( system->total_thbodies > (*dev_lists + THREE_BODIES)->num_intrs ||
-            (*dev_lists + THREE_BODIES)->n < (*dev_lists + BONDS)->num_intrs )
+            (*dev_lists + THREE_BODIES)->n < system->total_bonds )
     {
         system->total_thbodies = (*dev_lists + THREE_BODIES)->num_intrs * SAFE_ZONE;
         dev_workspace->realloc.num_3body = system->total_thbodies;
@@ -1190,7 +1194,6 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace,
 
         ret = FAILURE;
     }
-    fprintf( stderr, "        [sparse_matrix: %d]\n", ret );
 
     /* 3bodies list: since a more accurate estimate of the num.
      * of three body interactions requires that bond orders have
@@ -1200,6 +1203,39 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace,
 }
 
 
+CUDA_GLOBAL void k_print_forces( reax_atom *my_atoms, rvec *f, int n )
+{
+    int i; 
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= n)
+    {
+        return;
+    }
+
+    printf( "%8d: %24.15f, %24.15f, %24.15f\n",
+            my_atoms[i].orig_id,
+            f[i][0],
+            f[i][1],
+            f[i][2] );
+}
+
+
+void Print_Forces( reax_system *system )
+{
+    int blocks;
+    
+    blocks = (system->n) / DEF_BLOCK_SIZE + 
+        (((system->n % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
+
+    k_print_forces <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, dev_workspace->f, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
 CUDA_GLOBAL void k_init_bond_orders( reax_atom *my_atoms, reax_list far_nbrs, 
         reax_list bonds, real *total_bond_order, int N )
 {
@@ -1345,7 +1381,6 @@ int Cuda_Init_Forces( reax_system *system, control_params *control,
         /* validate lists */
         ret = Cuda_Validate_Lists( system, workspace, dev_lists, control,
                 data->step, system->numH );
-        fprintf( stderr, "      [CUDA_VALIDATE_LISTS: %d] STEP %d\n", ret, data->step );
     }
 
     return ret;
@@ -1372,6 +1407,8 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     real *spad = (real *) scratch;
     rvec *rvec_spad;
 
+    ret = SUCCESS;
+
     if ( compute_bonded_part1 == FALSE )
     {
         /* 1. Bond Order Interactions */
@@ -1414,7 +1451,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
 #endif
-        fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:BO]\n" );
 
         /* 2. Bond Energy Interactions */
         t_start = Get_Time( );
@@ -1438,7 +1474,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, "Cuda_Bond_Energy Done... \n" );
 #endif
-        fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Bond E]\n" );
 
         /* 3. Atom Energy Interactions */
         t_start = Get_Time( );
@@ -1482,7 +1517,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, "test_LonePair_postprocess Done... \n");
 #endif
-        fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Atom E]\n" );
 
         compute_bonded_part1 = TRUE;
     }
@@ -1493,7 +1527,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
     thbody = (int *) scratch;
     ret = Cuda_Estimate_Storage_Three_Body( system, control, data->step,
             dev_lists, thbody );
-    fprintf( stderr, "        [three_body: %d]\n", ret );
 
 #if defined(DEBUG)
     fprintf( stderr, "system->total_thbodies = %d, lists:THREE_BODIES->num_intrs = %d,\n",
@@ -1505,7 +1538,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 
     if ( ret == SUCCESS )
     {
-        Cuda_Init_Three_Body_Indices( thbody, system->total_thbodies );
+        Cuda_Init_Three_Body_Indices( thbody, system->total_bonds );
 
         cuda_memset( spad, 0, 6 * sizeof(real) * system->N + sizeof(rvec) * system->N * 2, "scratch" );
         Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
@@ -1514,7 +1547,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
               (control_params *)control->d_control_params,
               *dev_workspace, *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
               system->n, system->N, system->reax_param.num_atom_types, 
-              spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
+              spad, spad + 2 * system->N, spad + 4 * system->N, (rvec *)(spad + 6 * system->N));
         cudaThreadSynchronize( );
         cudaCheckError( );
 
@@ -1560,7 +1593,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 t_elapsed );
         fprintf( stderr, "Three_Body_Interactions Done... \n" );
 #endif
-        fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Valence Angles]\n" );
 
         /* 5. Torsion Angles Interactions */
         t_start = Get_Time( );
@@ -1614,7 +1646,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 cudaGetLastError( ), t_elapsed );
         fprintf( stderr, " Four_Body_ Done... \n");
 #endif
-        fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Torsion]\n" );
 
         /* 6. Hydrogen Bonds Interactions */
         // FIX - 4 - Added additional check here
@@ -1627,9 +1658,9 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
             hbs = (system->n * HB_KER_THREADS_PER_ATOM / HB_BLOCK_SIZE) + 
                 (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
 
-//            Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>>
-            Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, 
-                    HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>>
+            Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE >>>
+//            Cuda_Hydrogen_Bonds_MT <<< hbs, HB_BLOCK_SIZE, 
+//                    HB_BLOCK_SIZE * (2 * sizeof(real) + 2 * sizeof(rvec)) >>>
                     ( system->d_my_atoms, system->reax_param.d_sbp,
                       system->reax_param.d_hbp, system->reax_param.d_gp,
                       (control_params *) control->d_control_params,
@@ -1638,7 +1669,6 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                       spad, (rvec *) (spad + 2 * system->n) );
             cudaThreadSynchronize( );
             cudaCheckError( );
-            fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Hbonds]\n" );
 
             /* reduction for E_HB */
             Cuda_Reduction_Sum( spad,
@@ -1666,20 +1696,18 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                    *(*dev_lists + BONDS), system->N );
             cudaThreadSynchronize( );
             cudaCheckError( );
-            fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Hbonds_PostProcess]\n" );
 
             /* post process step2 */
             hnbrs_blocks = (system->N * HB_POST_PROC_KER_THREADS_PER_ATOM / HB_POST_PROC_BLOCK_SIZE) +
                 (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
 
-//            Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>>
-//                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) );
-            Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, 
-                    HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
+            Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof(rvec) >>>
+                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) );
+//            Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, 
+//                    HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
                 ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS), system->N );
             cudaThreadSynchronize( );
             cudaCheckError( );
-            fprintf( stderr, "      [CUDA_COMPUTE_BONDED_FORCES:Hbonds_HNbrs]\n" );
 
             t_elapsed = Get_Timing_Info( t_start );
 
diff --git a/PG-PuReMD/src/cuda_forces.h b/PG-PuReMD/src/cuda_forces.h
index a4ae2655..7e635005 100644
--- a/PG-PuReMD/src/cuda_forces.h
+++ b/PG-PuReMD/src/cuda_forces.h
@@ -35,6 +35,8 @@ void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *,
         simulation_data *, storage *, reax_list **, output_controls *,
         mpi_datatypes * );
 
+void Print_Forces( reax_system * );
+
 
 #ifdef __cplusplus
 }
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda_hydrogen_bonds.cu
index 89682a39..95eda081 100644
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda_hydrogen_bonds.cu
@@ -657,8 +657,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms,
     workspace = &p_workspace;
     hbonds = &p_hbonds;
 
-    start = Dev_Start_Index( i, hbonds );
-    end = Dev_End_Index( i, hbonds );
+    start = Dev_Start_Index( atoms[i].Hindex, hbonds );
+    end = Dev_End_Index( atoms[i].Hindex, hbonds );
     pj = start + threadIdx.x;
 #if defined(__SM_35__)
     rvec_MakeZero( __f );
@@ -682,8 +682,10 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms,
         pj += blockDim.x;
     }
 
+    __syncthreads( );
+
 #if defined(__SM_35__)
-    for ( int s = 16; s >= 1; s/=2 )
+    for ( int s = 16; s >= 1; s /= 2 )
     {
         __f[0] += shfl( __f[0], s );
         __f[1] += shfl( __f[1], s );
@@ -699,22 +701,31 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *atoms,
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 16] );
     }
+    __syncthreads( );
+
     if ( threadIdx.x < 8 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 8] );
     }
+    __syncthreads( );
+
     if ( threadIdx.x < 4 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 4] );
     }
+    __syncthreads( );
+
     if ( threadIdx.x < 2 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 2] );
     }
+    __syncthreads( );
+
     if ( threadIdx.x < 1 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 1] );
     }
+    __syncthreads( );
 
     if ( threadIdx.x == 0 )
     {
@@ -757,8 +768,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms,
     workspace = &( p_workspace );
     hbonds = &p_hbonds;
     i = group_id;
-    start = Dev_Start_Index( i, hbonds );
-    end = Dev_End_Index( i, hbonds );
+    start = Dev_Start_Index( atoms[i].Hindex, hbonds );
+    end = Dev_End_Index( atoms[i].Hindex, hbonds );
     pj = start + lane_id;
 #if defined(__SM_35__)
     rvec_MakeZero( __f );
@@ -781,6 +792,8 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms,
         pj += __THREADS_PER_ATOM__;
     }
 
+    __syncthreads( );
+
 #if defined(__SM_35__)
     for ( s = __THREADS_PER_ATOM__ >> 1; s >= 1; s /= 2 )
     {
@@ -798,22 +811,31 @@ CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *atoms,
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 16] );
     }
+    __syncthreads( );
+
     if ( lane_id < 8 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 8] );
     }
+    __syncthreads( );
+
     if ( lane_id < 4 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 4] );
     }
+    __syncthreads( );
+
     if ( lane_id < 2 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 2] );
     }
+    __syncthreads( );
+
     if ( lane_id < 1 )
     {
         rvec_Add( __f[threadIdx.x], __f[threadIdx.x + 1] );
     }
+    __syncthreads( );
 
     if ( lane_id == 0 )
     {
diff --git a/PG-PuReMD/src/cuda_lin_alg.cu b/PG-PuReMD/src/cuda_lin_alg.cu
index b7254a39..4f37d577 100644
--- a/PG-PuReMD/src/cuda_lin_alg.cu
+++ b/PG-PuReMD/src/cuda_lin_alg.cu
@@ -568,7 +568,7 @@ void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
 }
 
 
-void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
+void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
 {
     int blocks;
 
@@ -590,14 +590,13 @@ void Cuda_Dual_Matvec(sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
     k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
                       sizeof(rvec2) * MATVEC_BLOCK_SIZE >>>
 #endif
-                      (*H, a, b, n);
-
+            ( *H, a, b, n );
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
 
 
-void Cuda_Matvec(sparse_matrix *H, real *a, real *b, int n, int size)
+void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
 {
     int blocks;
 
diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda_neighbors.cu
index 54f6d021..9c4fdee7 100644
--- a/PG-PuReMD/src/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda_neighbors.cu
@@ -76,7 +76,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
     {
         for ( i = 0; i < 3; i++ )
         {
-            c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);   
             if ( c[i] >= g.native_end[i] )
             {
                 c[i] = g.native_end[i] - 1;
@@ -240,7 +240,7 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms,
     {
         for ( i = 0; i < 3; i++ )
         {
-            c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
             if ( c[i] >= g.native_end[i] )
             {
                 c[i] = g.native_end[i] - 1;
@@ -370,13 +370,14 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms,
                     //got the indices
                     nbr_data = my_start + nbrssofar[my_bucket] + tnbr[threadIdx.x] - 1;
                     nbr_data->nbr = m;
+
                     if ( l < m )
                     {
                         dvec[0] = atom2->x[0] - atom1->x[0];
                         dvec[1] = atom2->x[1] - atom1->x[1];
                         dvec[2] = atom2->x[2] - atom1->x[2];
                         d = rvec_Norm_Sqr( dvec );
-                        nbr_data->d = SQRT(d);
+                        nbr_data->d = SQRT( d );
                         rvec_Copy( nbr_data->dvec, dvec );
                         ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
                                 -1, g.rel_box[index_grid_3d(i, j, k, &g)] );
@@ -387,7 +388,7 @@ CUDA_GLOBAL void k_mt_generate_neighbor_lists( reax_atom *my_atoms,
                         dvec[1] = atom1->x[1] - atom2->x[1];
                         dvec[2] = atom1->x[2] - atom2->x[2];
                         d = rvec_Norm_Sqr( dvec );
-                        nbr_data->d = SQRT(d);
+                        nbr_data->d = SQRT( d );
                         rvec_Copy( nbr_data->dvec, dvec );
                         /*
                            CHANGE ORIGINAL
@@ -437,25 +438,25 @@ void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
 #endif
 
     /* one thread per atom implementation */
-//    blocks = (system->N / NBRS_BLOCK_SIZE) +
-//        ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-//    k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>>
-//        ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-//          *(*dev_lists + FAR_NBRS), system->n, system->N );
-//    cudaThreadSynchronize( );
-//    cudaCheckError( );
-
-    /* multiple threads per atom implementation */
-    blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + 
-        (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-    k_mt_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE, 
-        //sizeof(int) * (NBRS_BLOCK_SIZE + NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM) >>>
-        sizeof(int) * 2 * NBRS_BLOCK_SIZE >>>
-            ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-              *(*dev_lists + FAR_NBRS), system->n, system->N );
+    blocks = (system->N / NBRS_BLOCK_SIZE) +
+        ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+    k_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+          *(*dev_lists + FAR_NBRS), system->n, system->N );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
+    /* multiple threads per atom implementation */
+//    blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + 
+//        (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+//    k_mt_generate_neighbor_lists <<< blocks, NBRS_BLOCK_SIZE, 
+//        //sizeof(int) * (NBRS_BLOCK_SIZE + NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM) >>>
+//        sizeof(int) * 2 * NBRS_BLOCK_SIZE >>>
+//            ( system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+//              *(*dev_lists + FAR_NBRS), system->n, system->N );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+
 #if defined(LOG_PERFORMANCE)
     if( system->my_rank == MASTER_NODE )
     {
diff --git a/PG-PuReMD/src/cuda_reduction.cu b/PG-PuReMD/src/cuda_reduction.cu
index a9661449..02d800ee 100644
--- a/PG-PuReMD/src/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda_reduction.cu
@@ -41,7 +41,7 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
 
     /* allocate temporary storage */
     cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
-            "cub::sum::temp_storage" );
+            "Cuda_Reduction_Sum::temp_storage" );
 
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
@@ -50,7 +50,7 @@ void Cuda_Reduction_Sum( int *d_array, int *d_dest, size_t n )
     cudaCheckError( );
 
     /* deallocate temporary storage */
-    cuda_free( d_temp_storage, "cub::sum::temp_storage" );
+    cuda_free( d_temp_storage, "Cuda_Reduction_Sum::temp_storage" );
 }
 
 
@@ -71,7 +71,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
 
     /* allocate temporary storage */
     cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
-            "cub::sum::temp_storage" );
+            "Cuda_Reduction_Sum::temp_storage" );
 
     /* run sum-reduction */
     cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
@@ -80,7 +80,7 @@ void Cuda_Reduction_Sum( real *d_array, real *d_dest, size_t n )
     cudaCheckError( );
 
     /* deallocate temporary storage */
-    cuda_free( d_temp_storage, "cub::sum::temp_storage" );
+    cuda_free( d_temp_storage, "Cuda_Reduction_Sum::temp_storage" );
 }
 
 
@@ -133,7 +133,7 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
 
     /* allocate temporary storage */
     cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
-            "cub::max::temp_storage" );
+            "Cuda_Reduction_Max::temp_storage" );
 
     /* run exclusive prefix sum */
     cub::DeviceReduce::Max( d_temp_storage, temp_storage_bytes,
@@ -142,7 +142,7 @@ void Cuda_Reduction_Max( int *d_array, int *d_dest, size_t n )
     cudaCheckError( );
 
     /* deallocate temporary storage */
-    cuda_free( d_temp_storage, "cub::max::temp_storage" );
+    cuda_free( d_temp_storage, "Cuda_Reduction_Max::temp_storage" );
 }
 
 
@@ -163,7 +163,7 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
 
     /* allocate temporary storage */
     cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
-            "cub::devicescan::temp_storage" );
+            "Cuda_Scan_Excl_Sum::temp_storage" );
 
     /* run exclusive prefix sum */
     cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
@@ -172,7 +172,7 @@ void Cuda_Scan_Excl_Sum( int *d_src, int *d_dest, size_t n )
     cudaCheckError( );
 
     /* deallocate temporary storage */
-    cuda_free( d_temp_storage, "cub::devicescan::temp_storage" );
+    cuda_free( d_temp_storage, "Cuda_Scan_Excl_Sum::temp_storage" );
 }
 
 
diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda_valence_angles.cu
index c0936fc0..d778c3b2 100644
--- a/PG-PuReMD/src/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda_valence_angles.cu
@@ -152,45 +152,40 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
             i = pbond_ij->nbr;
             r_ij = pbond_ij->d;
             type_i = my_atoms[i].type;
-            // fprintf( out_control->eval, "i: %d\n", i );
 
             /* first copy 3-body intrs from previously computed ones where i>k.
                in the second for-loop below,
                we compute only new 3-body intrs where i < k */
-            /*
+
             // The copy loop commented out because strange asynchronous issues started to surface
             // Each kernel now manually generates everything
-
-            for( pk = start_j; pk < pi; ++pk ) {
-
-            //printf("%d,%d \n", j, pk );
-
-            // fprintf( out_control->eval, "pk: %d\n", pk );
-            start_pk = Dev_Start_Index( pk, thb_intrs );
-            end_pk = Dev_End_Index( pk, thb_intrs );
-
-            for( t = start_pk; t < end_pk; ++t )
-            if( thb_intrs->select.three_body_list[t].thb == i ) {
-            p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] );
-            p_kji = &(thb_intrs->select.three_body_list[t]);
-
-            p_ijk->thb = bonds->select.bond_list[pk].nbr;
-            p_ijk->pthb  = pk;
-            p_ijk->theta = p_kji->theta;
-            rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-            rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-            rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-
-            ++num_thb_intrs;
-            printf("\n");
-            break;
-            }
-            }
-             */
+//            for( pk = start_j; pk < pi; ++pk )
+//            {
+//                start_pk = Dev_Start_Index( pk, thb_intrs );
+//                end_pk = Dev_End_Index( pk, thb_intrs );
+//
+//                for( t = start_pk; t < end_pk; ++t )
+//                {
+//                    if( thb_intrs->select.three_body_list[t].thb == i )
+//                    {
+//                        p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] );
+//                        p_kji = &(thb_intrs->select.three_body_list[t]);
+//
+//                        p_ijk->thb = bonds->select.bond_list[pk].nbr;
+//                        p_ijk->pthb  = pk;
+//                        p_ijk->theta = p_kji->theta;
+//                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+//                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+//                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+//
+//                        ++num_thb_intrs;
+//                        break;
+//                    }
+//                }
+//            }
 
             /* and this is the second for loop mentioned above */
             //for( pk = pi+1; pk < end_j; ++pk ) {
-
             // Except that now the loop goes all the way from start_j to end_j
             for( pk = start_j; pk < end_j; ++pk )
             {
@@ -239,26 +234,10 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                         bo_ij->BO * bo_jk->BO > SQR(control->thb_cut) )
                 {
                     r_jk = pbond_jk->d;
-                    thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
-
-                    /* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id )
-                       fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
-                       system->my_atoms[i].orig_id,
-                       system->my_atoms[j].orig_id,
-                       system->my_atoms[k].orig_id,
-                       bo_ij->BO, bo_jk->BO, p_ijk->theta );
-                       else
-                       fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n",
-                       system->my_atoms[k].orig_id,
-                       system->my_atoms[j].orig_id,
-                       system->my_atoms[i].orig_id,
-                       bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+                    thbh = &( d_thbh[ index_thbp(type_i, type_j, type_k, num_atom_types) ] );
 
                     for ( cnt = 0; cnt < thbh->cnt; ++cnt )
                     {
-                        // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n",
-                        //          i+1, j+1, k+1 );
-
                         if ( FABS(thbh->prm[cnt].p_val1) > 0.001 )
                         {
                             thbp = &( thbh->prm[cnt] );
@@ -294,7 +273,8 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                             {
                                 expval12theta = p_val1 * (1.0 - expval2theta);
                             }
-                            else // To avoid linear Me-H-Me angles (6/6/06)
+                            // To avoid linear Me-H-Me angles (6/6/06)
+                            else
                             {
                                 expval12theta = p_val1 * -expval2theta;
                             }
@@ -315,8 +295,8 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
 
                             if ( pk < pi )
                             {
-                                data_e_ang [j] += e_ang =
-                                    f7_ij * f7_jk * f8_Dj * expval12theta;
+                                e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                                data_e_ang[j] += e_ang;
 
                             }
                             /* END ANGLE ENERGY*/
@@ -362,14 +342,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                             /* Similar to above comment regarding if statement */
                             if ( pk < pi )
                             {
-
                                 e_coa =
                                     p_coa1 / (1. + exp_coa2) *
                                     EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
                                     EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
                                     EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
                                     EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-                                data_e_coa [j] += e_coa;
+                                data_e_coa[j] += e_coa;
                             }
 
                             CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
@@ -385,20 +364,15 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                             // we must again check for pk<pi for entire forces part
                             if ( pk < pi )
                             {
-                                /*
-                                   bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-                                   bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-                                   workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
-                                   workspace->CdDelta[i] += CEcoa4;
-                                   workspace->CdDelta[k] += CEcoa5;
-                                 */
                                 bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
                                 bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
                                 workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
+//                                workspace->CdDelta[i] += CEcoa4;
+//                                workspace->CdDelta[k] += CEcoa5;
                                 pbond_ij->va_CdDelta += CEcoa4;
                                 pbond_jk->va_CdDelta += CEcoa5;
 
-                                for( t = start_j; t < end_j; ++t )
+                                for ( t = start_j; t < end_j; ++t )
                                 {
                                     pbond_jt = &( bonds->select.bond_list[t] );
                                     bo_jt = &(pbond_jt->bo_data);
@@ -406,25 +380,17 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                                     temp = CUBE( temp_bo_jt );
                                     pBOjt7 = temp * temp * temp_bo_jt;
 
-                                    // fprintf( out_control->eval, "%6d%12.8f\n",
-                                    // workspace->reverse_map[bonds->select.bond_list[t].nbr],
-                                    // (CEval6 * pBOjt7) );
-
                                     bo_jt->Cdbo += (CEval6 * pBOjt7);
                                     bo_jt->Cdbopi += CEval5;
                                     bo_jt->Cdbopi2 += CEval5;
                                 }
 
-                                if( control->virial == 0 )
+                                if ( control->virial == 0 )
                                 {
-                                    /*
-                                       rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di );
-                                       rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-                                       rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk );
-                                     */
-
+//                                    rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di );
                                     rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
                                     rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+//                                    rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk );
                                     rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
                                 }
                                 else
@@ -432,19 +398,19 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                                     /* terms not related to bond order derivatives are
                                        added directly into forces and pressure vector/tensor */
                                     rvec_Scale( force, CEval8, p_ijk->dcos_di );
-                                    //rvec_Add( workspace->f[i], force );
+//                                    rvec_Add( workspace->f[i], force );
                                     rvec_Add( pbond_ij->va_f, force );
                                     rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    //rvec_Add( data->my_ext_press, ext_press );
-                                    rvec_Add( my_ext_press [j], ext_press );
+//                                    rvec_Add( data->my_ext_press, ext_press );
+                                    rvec_Add( my_ext_press[j], ext_press );
 
                                     rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
 
                                     rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-                                    //rvec_Add( workspace->f[k], force );
+//                                    rvec_Add( workspace->f[k], force );
                                     rvec_Add( pbond_jk->va_f, force );
                                     rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( my_ext_press [j], ext_press );
+                                    rvec_Add( my_ext_press[j], ext_press );
                                 }
                             }
 
@@ -597,7 +563,7 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms,
 {
     int i, j, k, pi, pk;
     int start_j, end_j;
-    int cnt, num_thb_intrs;
+    int num_thb_intrs;
     real BOA_ij, BOA_jk;
     bond_data *pbond_ij, *pbond_jk;
     bond_order_data *bo_ij, *bo_jk;
@@ -629,7 +595,7 @@ CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms,
         {
             i = pbond_ij->nbr;
 
-            for( pk = start_j; pk < end_j; ++pk )
+            for ( pk = start_j; pk < end_j; ++pk )
             {
                 if ( pk == pi )
                 {
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index ae60867e..2426e824 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -1882,7 +1883,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
     if ( charge_flag == TRUE )
     {
         retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
-        fprintf( stderr, "    [CUDA_INIT_FORCES: %d] STEP %d\n", retVal, data->step );
 
 //        int i;
 //        static reax_list **temp_lists;
@@ -1936,7 +1936,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
 
         /********* bonded interactions ************/
         retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-        fprintf( stderr, "    [CUDA_COMPUTE_BONDED_FORCES: %d] STEP %d\n", retVal, data->step );
 
 #if defined(LOG_PERFORMANCE)
         //MPI_Barrier( MPI_COMM_WORLD );
@@ -1960,7 +1959,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         if ( charge_flag == TRUE )
         {
             Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
-            fprintf( stderr, "    [CUDA_QEQ: %d] STEP %d\n", retVal, data->step );
         }
 
 #if defined(LOG_PERFORMANCE)
@@ -1980,7 +1978,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
         /********* nonbonded interactions ************/
         Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
                 lists, out_control, mpi_data );
-        fprintf( stderr, "    [CUDA_COMPUTE_NONBONDED_FORCES: %d] STEP %d\n", retVal, data->step );
 
 #if defined(LOG_PERFORMANCE)
         //MPI_Barrier( MPI_COMM_WORLD );
@@ -1997,7 +1994,6 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
 
         /*********** total force ***************/
         Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
-        fprintf( stderr, "    [CUDA_COMPUTE_TOTAL_FORCE: %d] STEP %d\n", retVal, data->step );
 
 #if defined(LOG_PERFORMANCE)
         //MPI_Barrier( MPI_COMM_WORLD );
@@ -2011,7 +2007,10 @@ int Cuda_Compute_Forces( reax_system *system, control_params *control,
                 system->my_rank, data->step );
         //Print_Total_Force( system, data, workspace );
         MPI_Barrier( MPI_COMM_WORLD );
+
 #endif
+
+//        Print_Forces( system );
     }
 
     return retVal;
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 2d09ec4c..faf15dd3 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -107,11 +107,12 @@ void Generate_Initial_Velocities( reax_system *system, real T )
     int i;
     real m, scale, norm;
 
-
     if ( T <= 0.1 )
     {
         for ( i = 0; i < system->n; i++ )
+        {
             rvec_MakeZero( system->my_atoms[i].v );
+        }
     }
     else
     {
@@ -248,7 +249,6 @@ int Cuda_Init_System( reax_system *system, control_params *control,
     int nrecv[MAX_NBRS];
 
     Setup_New_Grid( system, control, MPI_COMM_WORLD );
-    fprintf( stderr, "    [SETUP NEW GRID]\n" );
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d GRID:\n", system->my_rank );
@@ -256,9 +256,7 @@ int Cuda_Init_System( reax_system *system, control_params *control,
 #endif
 
     Bin_My_Atoms( system, &(workspace->realloc) );
-    fprintf( stderr, "    [BIN MY ATOMS]\n" );
     Reorder_My_Atoms( system, workspace );
-    fprintf( stderr, "    [REORDER MY ATOMS]\n" );
 
     /* estimate N and total capacity */
     for ( i = 0; i < MAX_NBRS; ++i )
@@ -270,16 +268,12 @@ int Cuda_Init_System( reax_system *system, control_params *control,
     system->max_recved = 0;
     system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
             Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
-    fprintf( stderr, "    [SEND_RECV:ESTIMATE_BOUNDARY_ATOMS]\n" );
     system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
     Bin_Boundary_Atoms( system );
-    fprintf( stderr, "    [BIN_BOUNDARY_ATOMS]\n" );
 
     /* Sync atoms here to continue the computation */
     dev_alloc_system( system );
-    fprintf( stderr, "    [DEV ALLOC SYSTEM]\n" );
     Sync_System( system );
-    fprintf( stderr, "    [SYNC SYSTEM]\n" );
 
     /* estimate numH and Hcap */
     Cuda_Reset_Atoms( system, control );
@@ -294,10 +288,8 @@ int Cuda_Init_System( reax_system *system, control_params *control,
 #endif
 
     Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
-    fprintf( stderr, "    [CUDA COMPUTE TOTAL MASS]\n" );
 
     Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
-    fprintf( stderr, "    [CUDA COMPUTE CENTER OF MASS]\n" );
 
 //    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
 //    {
@@ -308,11 +300,9 @@ int Cuda_Init_System( reax_system *system, control_params *control,
     if ( !control->restart || (control->restart && control->random_vel) )
     {
         Generate_Initial_Velocities( system, control->T_init );
-        fprintf( stderr, "    [GENERATE INITIAL VELOCITIES]\n" );
     }
 
     Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-    fprintf( stderr, "    [CUDA COMPUTE K.E.]\n" );
 
     return SUCCESS;
 }
@@ -1138,7 +1128,6 @@ void Cuda_Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
-    fprintf( stderr, "  [INIT MPI DATATYPES]\n" );
 
     /* SYSTEM */
     if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE )
@@ -1148,22 +1137,18 @@ void Cuda_Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
-    fprintf( stderr, "  [CUDA INIT SYSTEM]\n" );
 
     /* GRID */
     dev_alloc_grid( system );
     Sync_Grid( &system->my_grid, &system->d_my_grid );
-    fprintf( stderr, "  [DEV ALLOC GRID]\n" );
 
     //validate_grid( system );
 
     /* SIMULATION_DATA */
     Cuda_Init_Simulation_Data( system, control, data, msg );
-    fprintf( stderr, "  [CUDA INIT SIMULATION DATA]\n" );
 
     /* WORKSPACE */
     Cuda_Init_Workspace( system, control, workspace, msg );
-    fprintf( stderr, "  [CUDA INIT WORKSPACE]\n" );
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
@@ -1173,7 +1158,6 @@ void Cuda_Initialize( reax_system *system, control_params *control,
 
     /* CONTROL */
     dev_alloc_control( control );
-    fprintf( stderr, "  [DEV ALLOC CONTROL]\n" );
 
     /* LISTS */
     if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
@@ -1184,7 +1168,6 @@ void Cuda_Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
-    fprintf( stderr, "  [CUDA INIT LISTS]\n" );
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
@@ -1198,7 +1181,6 @@ void Cuda_Initialize( reax_system *system, control_params *control,
                  system->my_rank );
         MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
     }
-    fprintf( stderr, "  [INIT OUTPUT FILES]\n" );
 
 #if defined(DEBUG)
     fprintf( stderr, "p%d: output files opened\n", system->my_rank );
@@ -1214,7 +1196,6 @@ void Cuda_Initialize( reax_system *system, control_params *control,
                      system->my_rank );
             MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
         }
-        fprintf( stderr, "  [INIT LOOKUP TABLES]\n" );
 
 #if defined(DEBUG)
         fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c
index bfbbe1f7..88b406b5 100644
--- a/PG-PuReMD/src/integrate.c
+++ b/PG-PuReMD/src/integrate.c
@@ -366,14 +366,12 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
     ret = SUCCESS;
 
     Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-    fprintf( stderr, "  [CUDA_REALLOCATE] STEP %d\n", data->step );
 
     if ( verlet_part1_done == FALSE )
     {
         /* velocity verlet, 1st part */
         bNVT_update_velocity_part1( system, dt );
         verlet_part1_done = TRUE;
-        fprintf( stderr, "  [bNVT_UPDATE_VEL_PART1] STEP %d\n", data->step );
 
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
@@ -383,19 +381,14 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
         if ( renbr )
         {
             Update_Grid( system, control, mpi_data->world );
-            fprintf( stderr, "  [UPDATE_GRID] STEP %d\n", data->step );
         }
 
         Output_Sync_Atoms( system );
-        fprintf( stderr, "  [OUTPUT_SYNC_ATOMS] STEP %d\n", data->step );
         Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
-        fprintf( stderr, "  [COMM_ATOMS] STEP %d\n", data->step );
         Sync_Atoms( system );
-        fprintf( stderr, "  [SYNC_ATOMS] STEP %d\n", data->step );
 
         /* synch the Grid to the Device here */
         Sync_Grid( &system->my_grid, &system->d_my_grid );
-        fprintf( stderr, "  [SYNC_GRID] STEP %d\n", data->step );
 
         init_blocks( system );
 
@@ -406,7 +399,6 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
     }
     
     Cuda_Reset( system, control, data, workspace, lists );
-    fprintf( stderr, "  [CUDA_RESET] STEP %d\n", data->step );
 
     if ( renbr )
     {
@@ -419,7 +411,6 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
             //TODO: move far_nbrs reallocation checks outside of renbr frequency check
             ret = Cuda_Estimate_Neighbors( system, data->step );
             estimate_nbrs_done = 1;
-            fprintf( stderr, "  [CUDA_ESTIMATE_NEIGHBORS: %d] STEP %d\n", ret, data->step );
         }
 
         if ( ret == SUCCESS && estimate_nbrs_done == 1 )
@@ -439,14 +430,12 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
     {
         ret = Cuda_Compute_Forces( system, control, data, workspace,
                 lists, out_control, mpi_data );
-        fprintf( stderr, "  [CUDA_COMPUTE_FORCES: %d] STEP %d\n", ret, data->step );
     }
 
     if ( ret == SUCCESS )
     {
         /* velocity verlet, 2nd part */
         bNVT_update_velocity_part2( system, dt );
-        fprintf( stderr, "  [bNVT_UPDATE_VEL_PART2] STEP %d\n", data->step );
 
 #if defined(DEBUG_FOCUS)
         fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
@@ -455,7 +444,6 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
 
         /* temperature scaler */
         Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-        fprintf( stderr, "  [CUDA_COMPUTE_KINETIC_ENERGY] STEP %d\n", data->step );
 
         lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
         if ( lambda < MIN_dT )
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index f3a341db..9c028d42 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -1190,7 +1190,7 @@ void Output_Results( reax_system *system, control_params *control,
                 out_control->energy_update_freq > 0 &&
                 data->step % out_control->energy_update_freq == 0 )
         {
-#if !defined(DEBUG) && !defined(DEBUG_FOCUS)
+#if defined(DEBUG) && defined(DEBUG_FOCUS)
             fprintf( out_control->out,
                      "%-6d%14.2f%14.2f%14.2f%11.2f%13.2f%13.5f\n",
                      data->step, data->sys_en.e_tot, data->sys_en.e_pot,
diff --git a/PG-PuReMD/src/io_tools.h b/PG-PuReMD/src/io_tools.h
index 789991c3..6ae2d6d8 100644
--- a/PG-PuReMD/src/io_tools.h
+++ b/PG-PuReMD/src/io_tools.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 int Init_Output_Files( reax_system*, control_params*,
                        output_controls*, mpi_datatypes*, char* );
 int Close_Output_Files( reax_system*, control_params*,
@@ -106,4 +107,6 @@ void Print_Bond_List( reax_system*, control_params*, simulation_data*,
                       reax_list**, output_controls*);
 
 #endif
+
+
 #endif
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index a3071712..4d677687 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -326,6 +326,7 @@ int main( int argc, char* argv[] )
 #if defined(__CUDA_DEBUG__)
     Reset( system, control, data, workspace, lists );
 #endif
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
 #endif
@@ -400,13 +401,11 @@ int main( int argc, char* argv[] )
     retries = 0;
     while ( data->step <= control->nsteps && retries < MAX_RETRIES )
     {
-        fprintf( stderr, "[BEGIN] STEP %d\n", data->step );
         ret = SUCCESS;
 
         if ( control->T_mode && retries == 0 )
         {
             Temperature_Control( control, data );
-            fprintf( stderr, "  [TEMPERATURE CONTROL] STEP %d\n", data->step );
         }
 
 #if defined(DEBUG)
@@ -418,7 +417,6 @@ int main( int argc, char* argv[] )
 #endif
     
         ret = Cuda_Evolve( system, control, data, workspace, lists, out_control, mpi_data );
-        fprintf( stderr, "[EVOLVE] STEP %d\n", data->step );
     
 #if defined(DEBUG)
         t_end = Get_Timing_Info( t_begin );
@@ -434,7 +432,6 @@ int main( int argc, char* argv[] )
             ret = Cuda_Post_Evolve( system, control, data, workspace, lists,
                     out_control, mpi_data );
         }
-        fprintf( stderr, "[POST EVOLVE] STEP %d\n", data->step );
 
 #if defined(__CUDA_DEBUG__)
         Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
@@ -484,13 +481,14 @@ int main( int argc, char* argv[] )
         else
         {
             ++retries;
-            fprintf( stderr, "INFO: retrying step %d...\n", data->step );
+            fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step );
         }
     }
 
     if ( retries >= MAX_RETRIES )
     {
-        fprintf( stderr, "Maximum retries reached for this step. Terminating...\n" );
+        fprintf( stderr, "Maximum retries (%d) reached for this step. Terminating...\n",
+              retries );
         MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );
     }
 
@@ -635,7 +633,7 @@ int main( int argc, char* argv[] )
         else
         {
             ++retries;
-            fprintf( stderr, "INFO: retrying step %d...\n", data->step );
+            fprintf( stderr, "[INFO] p%d: retrying step %d...\n", system->my_rank, data->step );
         }
     }
 
-- 
GitLab