From 5afec69ffe89c0b2d3c3f2b5eeefe44156ede746 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Wed, 7 Jun 2017 12:46:31 -0400
Subject: [PATCH] PG-PuReMD: add CUB (v1.6.4) for optimized CUDA primitives.
 Change three-body lists to use CUB.

---
 .gitmodules                          |   4 +
 PG-PuReMD/src/allocate.c             |  22 +++---
 PG-PuReMD/src/cub                    |   1 +
 PG-PuReMD/src/cuda_allocate.cu       |   3 +
 PG-PuReMD/src/cuda_forces.cu         |  77 ++++++++-----------
 PG-PuReMD/src/cuda_neighbors.cu      |  25 +++---
 PG-PuReMD/src/cuda_reduction.cu      | 109 ++++++++++++++++++++++-----
 PG-PuReMD/src/cuda_reduction.h       |   1 +
 PG-PuReMD/src/cuda_reset_tools.cu    |   1 -
 PG-PuReMD/src/cuda_valence_angles.cu |  26 +++----
 PG-PuReMD/src/init_md.c              |   3 +-
 PG-PuReMD/src/parallelreax.c         |   2 +-
 PG-PuReMD/src/reax_types.h           |   1 +
 13 files changed, 172 insertions(+), 103 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 PG-PuReMD/src/cub

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..1129b088
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "PG-PuReMD/src/cub"]
+	path = PG-PuReMD/src/cub
+	url = https://github.com/NVlabs/cub.git
+	branch = 89de7ab20167909bc2c4f8acd397671c47cf3c0d
diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index a3af86dc..25c89abc 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -1207,7 +1207,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
 
         /* delete three-body list */
         Dev_Delete_List( *dev_lists + THREE_BODIES );
-//        Delete_List( *lists + THREE_BODIES );
+        Delete_List( *lists + THREE_BODIES );
 
         /* recreate Three-body list */
         Dev_Make_List( (*dev_lists + BONDS)->num_intrs, realloc->num_3body,
@@ -1255,30 +1255,31 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
     // to ensure correct values at mpi_buffers for update_boundary_positions
     if ( !renbr )
     {
-        mpi_flag = 0;
+        mpi_flag = FALSE;
     }
     // check whether in_buffer capacity is enough
     else if ( system->max_recved >= system->est_recv * 0.90 )
     {
-        mpi_flag = 1;
+        mpi_flag = TRUE;
     }
     else
     {
         // otherwise check individual outgoing buffers
-        mpi_flag = 0;
+        mpi_flag = FALSE;
         for ( p = 0; p < MAX_NBRS; ++p )
         {
             nbr_pr = &( system->my_nbrs[p] );
             nbr_data = &( mpi_data->out_buffers[p] );
+
             if ( nbr_data->cnt >= nbr_pr->est_send * 0.90 )
             {
-                mpi_flag = 1;
+                mpi_flag = TRUE;
                 break;
             }
         }
     }
 
-    if ( mpi_flag )
+    if ( mpi_flag == TRUE )
     {
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating mpi_buf: old_recv=%d\n",
@@ -1305,13 +1306,14 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
 
 #if defined(DEBUG_FOCUS)
         fprintf( stderr, "p%d: reallocating mpi_buf: recv=%d send=%d total=%dMB\n",
-                 system->my_rank, system->est_recv, total_send,
-                 (int)((system->est_recv + total_send)*sizeof(boundary_atom) /
-                       (1024 * 1024)));
+                system->my_rank, system->est_recv, total_send,
+                (int)((system->est_recv + total_send)*sizeof(boundary_atom) /
+                      (1024 * 1024)));
+
         for ( p = 0; p < MAX_NBRS; ++p )
         {
             fprintf( stderr, "p%d: nbr%d new_send=%d\n",
-                     system->my_rank, p, system->my_nbrs[p].est_send );
+                    system->my_rank, p, system->my_nbrs[p].est_send );
         }
 #endif
 
diff --git a/PG-PuReMD/src/cub b/PG-PuReMD/src/cub
new file mode 160000
index 00000000..89de7ab2
--- /dev/null
+++ b/PG-PuReMD/src/cub
@@ -0,0 +1 @@
+Subproject commit 89de7ab20167909bc2c4f8acd397671c47cf3c0d
diff --git a/PG-PuReMD/src/cuda_allocate.cu b/PG-PuReMD/src/cuda_allocate.cu
index 9d56b4c9..9a31eed3 100644
--- a/PG-PuReMD/src/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda_allocate.cu
@@ -156,6 +156,9 @@ void dev_alloc_system( reax_system *system )
     cuda_malloc( (void **) &system->d_my_atoms, system->total_cap * sizeof(reax_atom),
             TRUE, "system:d_my_atoms" );
 
+    /* list reallocation */
+    cuda_malloc( (void **) &system->d_num_thbodies, sizeof(int), TRUE, "system:d_num_thbodies" );
+
     /* simulation boxes */
     cuda_malloc( (void **) &system->d_big_box, sizeof(simulation_box), TRUE, "system:d_big_box" );
     cuda_malloc( (void **) &system->d_my_box, sizeof(simulation_box), TRUE, "system:d_my_box" );
diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu
index ebe53090..82a0137e 100644
--- a/PG-PuReMD/src/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda_forces.cu
@@ -22,9 +22,7 @@
 #include "tool_box.h"
 #include "cuda_nonbonded.h"
 
-
-extern "C" void Make_List( int, int, int, reax_list* );
-extern "C" void Delete_List( reax_list* );
+#include "cub/cub/device/device_reduce.cuh"
 
 
 CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
@@ -291,28 +289,36 @@ void Cuda_Estimate_Storages( reax_system *system, control_params *control,
 void Cuda_Estimate_Storages_Three_Body( reax_system *system, control_params *control, 
         reax_list **lists, int *num_3body, int *thbody )
 {
-    int i;
-    real *spad = (real *) scratch;
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
 
-    cuda_memset( spad, 0, (*dev_lists + BONDS)->num_intrs * sizeof(int), "scratch" );
+    cuda_memset( thbody, 0, (*dev_lists + BONDS)->num_intrs * sizeof(int), "scratch::thbody" );
 
     Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>>
         ( system->d_my_atoms, (control_params *)control->d_control_params, 
-          *(*dev_lists + BONDS), system->n, system->N, (int *)spad );
+          *(*dev_lists + BONDS), system->n, system->N, (int *)thbody );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    copy_host_device( thbody, spad, (*dev_lists + BONDS)->num_intrs * sizeof(int),
-            cudaMemcpyDeviceToHost, "thb:offsets" );
+    /* determine temporary device storage requirements */
+    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
+            thbody, system->d_num_thbodies, (*dev_lists + BONDS)->num_intrs );
 
-    *num_3body = 0;
-    for ( i = 0; i < (*dev_lists + BONDS)->num_intrs; i++ )
-    {
-        *num_3body += thbody[i];
-        thbody[i] += thbody[i - 1];
-    }
+    /* allocate temporary storage */
+    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+            "cub::sum::temp_storage" );
 
-    system->num_thbodies = thbody[(*dev_lists + BONDS)->num_intrs - 1];
+    /* run sum-reduction */
+    cub::DeviceReduce::Sum( d_temp_storage, temp_storage_bytes,
+            thbody, system->d_num_thbodies, (*dev_lists + BONDS)->num_intrs );
+
+    /* deallocate temporary storage */
+    cuda_free( d_temp_storage, "cub::sum::temp_storage" );
+
+    copy_host_device( num_3body, system->d_num_thbodies, sizeof(int),
+            cudaMemcpyDeviceToHost, "d_num_thbodies" );
+
+    system->num_thbodies = *num_3body;
 }
 
 
@@ -1161,53 +1167,32 @@ int Cuda_Validate_Lists( reax_system *system, storage *workspace,
     }
 
     /* three body interactions */
-    cuda_memset( spad, 0, (*lists + BONDS)->num_intrs * sizeof (int), "scratch" );
-    Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>>
-        ( system->d_my_atoms, (control_params *)control->d_control_params, 
-          *(*lists + BONDS), system->n, system->N, (int *)spad);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    fprintf( stderr, "      [ESTIMATE_CUDA_VALENCE_ANGLES]\n" );
-
-    thbody = (int *) host_scratch;
-    memset( thbody, 0, sizeof(int) * (*lists + BONDS)->num_intrs );
-    copy_host_device( thbody, spad, (*lists + BONDS)->num_intrs * sizeof(int),
-            cudaMemcpyDeviceToHost, "thb:offsets" );
-
-    total_3body = 0;
-    for (i = 0; i < (*lists + BONDS)->num_intrs; i++)
-    {
-        total_3body += thbody[i];
-        thbody[i] += thbody[i - 1];
-    }
+    thbody = (int *) scratch;
 
-    system->num_thbodies = thbody[(*lists + BONDS)->num_intrs - 1];
+    Cuda_Estimate_Storages_Three_Body( system, control, dev_lists,
+            &total_3body, thbody );
 
     if ( system->num_thbodies > (*lists + THREE_BODIES)->num_intrs ||
                 (*lists + THREE_BODIES)->n < (*lists + BONDS)->num_intrs )
     {
-        realloc->num_3body = total_3body;
-        system->num_thbodies = total_3body;
+        system->num_thbodies = total_3body * SAFE_ZONE;
+        realloc->num_3body = system->num_thbodies;
         ret = FAILURE;
     }
+
+#if defined(DEBUG)
     fprintf( stderr, "system->num_thbodies = %d, lists:THREE_BODIES->num_intrs = %d,\n",
             system->num_thbodies, (*lists + THREE_BODIES)->num_intrs );
     fprintf( stderr, "lists:THREE_BODIES->n = %d, lists:BONDS->num_intrs = %d,\n",
             (*lists + THREE_BODIES)->n, (*lists + BONDS)->num_intrs );
     fprintf( stderr, "total_3body = %d\n", total_3body );
+#endif
 
     if ( ret == SUCCESS )
     {
-        /* copy the indexes into the thb list */
-        copy_host_device( thbody, (*lists + THREE_BODIES)->index + 1,
-                sizeof(int) * ((*lists + BONDS)->num_intrs - 1),
-                cudaMemcpyHostToDevice, "dev_thb:index" );
-        copy_host_device( thbody, (*lists + THREE_BODIES)->end_index + 1,
-                sizeof(int) * ((*lists + BONDS)->num_intrs - 1),
-                cudaMemcpyHostToDevice, "dev_thb:end_index" );
+        Cuda_Init_Three_Body_Indices( thbody, (*dev_lists + BONDS)->num_intrs );
     }
 
-
     return ret;
 }
 
diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda_neighbors.cu
index 7f0f848c..094ea8a4 100644
--- a/PG-PuReMD/src/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda_neighbors.cu
@@ -28,8 +28,7 @@
 #include "cuda_utils.h"
 #include "tool_box.h"
 
-//extern "C" real Get_Time( );
-//extern "C" real Get_Timing_Info( real );
+#include "cub/cub/device/device_scan.cuh"
 
 
 CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) 
@@ -722,14 +721,22 @@ void Cuda_Init_Bond_Indices( int *indices, int entries )
 
 void Cuda_Init_Three_Body_Indices( int *indices, int entries )
 {
-    int i;
     reax_list *thbody = *dev_lists + THREE_BODIES;
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+
+    /* determine temporary device storage requirements */
+    cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
+            indices, thbody->index, entries );
+
+    /* allocate temporary storage */
+    cuda_malloc( &d_temp_storage, temp_storage_bytes, FALSE,
+            "cub::devicescan::temp_storage" );
 
-    copy_host_device( indices, thbody->index + 1,
-            sizeof(int) * (entries - 1),
-            cudaMemcpyHostToDevice, "dev_thb:index" );
-    copy_host_device( indices, thbody->end_index,
-            sizeof(int) * entries,
-            cudaMemcpyHostToDevice, "dev_thb:end_index" );
+    /* run exclusive prefix sum */
+    cub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes,
+            indices, thbody->index, entries );
 
+    /* deallocate temporary storage */
+    cuda_free( d_temp_storage, "cub::devicescan::temp_storage" );
 }
diff --git a/PG-PuReMD/src/cuda_reduction.cu b/PG-PuReMD/src/cuda_reduction.cu
index 86d99322..fee4595c 100644
--- a/PG-PuReMD/src/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda_reduction.cu
@@ -6,6 +6,81 @@
 #include "vector.h"
 
 
+CUDA_GLOBAL void k_reduction_int( const int *input, int *per_block_results,
+        const size_t n )
+{
+#if defined(__SM_35__)
+    extern __shared__ int my_iresults[];
+    int idata;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int x = 0, z, offset;
+
+    if( i < n )
+    {
+        x = input[i];
+    }
+
+    idata = x;
+    __syncthreads();
+
+    for( z = 16; z >=1; z/=2 )
+    {
+        idata += shfl( idata, z );
+    }
+
+    if ( threadIdx.x % 32 == 0 )
+    {
+        my_iresults[threadIdx.x >> 5] = idata;
+    }
+
+    __syncthreads();
+
+    for( offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
+    {
+        if( threadIdx.x < offset )
+        {
+            my_iresults[threadIdx.x] += my_iresults[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if( threadIdx.x == 0 )
+    {
+        per_block_results[blockIdx.x] = my_iresults[0];
+    }
+
+#else
+    extern __shared__ int idata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int x = 0, offset;
+
+    if( i < n )
+    {
+        x = input[i];
+    }
+
+    idata[threadIdx.x] = x;
+    __syncthreads( );
+
+    for ( offset = blockDim.x / 2; offset > 0; offset >>= 1 )
+    {
+        if( threadIdx.x < offset )
+        {
+            idata[threadIdx.x] += idata[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if( threadIdx.x == 0 )
+    {
+        per_block_results[blockIdx.x] = idata[0];
+    }
+#endif
+}
+
+
 CUDA_GLOBAL void k_reduction( const real *input, real *per_block_results,
         const size_t n )
 {
@@ -85,7 +160,7 @@ CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n )
 #if defined(__SM_35__)
     extern __shared__ rvec my_rvec[];
     rvec sdata;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x, z, offset;
 
     rvec_MakeZero( sdata );
 
@@ -94,9 +169,9 @@ CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n )
         rvec_Copy( sdata, input[i] );
     }
 
-    __syncthreads();
+    __syncthreads( );
 
-    for( int z = 16; z >=1; z/=2 )
+    for( z = 16; z >=1; z/=2 )
     {
         sdata[0] += shfl( sdata[0], z);
         sdata[1] += shfl( sdata[1], z);
@@ -108,16 +183,16 @@ CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n )
         rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata );
     }
 
-    __syncthreads ();
+    __syncthreads( );
 
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
+    for( offset = blockDim.x >> 6; offset > 0; offset >>= 1 )
     {
         if( threadIdx.x < offset )
         {
             rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] );
         }
 
-        __syncthreads();
+        __syncthreads( );
     }
 
     if( threadIdx.x == 0 )
@@ -127,33 +202,33 @@ CUDA_GLOBAL void k_reduction_rvec( rvec *input, rvec *results, size_t n )
 
 #else
     extern __shared__ rvec svec_data[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x, offset;
     rvec x;
 
     rvec_MakeZero( x );
 
-    if(i < n)
+    if ( i < n )
     {
         rvec_Copy( x, input[i] );
     }
 
-    rvec_Copy(svec_data[threadIdx.x], x);
-    __syncthreads();
+    rvec_Copy( svec_data[threadIdx.x], x );
+    __syncthreads( );
 
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    for ( offset = blockDim.x / 2; offset > 0; offset >>= 1 )
     {
-        if(threadIdx.x < offset)
+        if ( threadIdx.x < offset )
         {
-            rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]);
+            rvec_Add( svec_data[threadIdx.x], svec_data[threadIdx.x + offset] );
         }
 
-        __syncthreads();
+        __syncthreads( );
     }
 
-    if(threadIdx.x == 0)
+    if ( threadIdx.x == 0 )
     {
-        //rvec_Copy (results[blockIdx.x], svec_data[0]);
-        rvec_Add (results[blockIdx.x], svec_data[0]);
+        //rvec_Copy( results[blockIdx.x], svec_data[0] );
+        rvec_Add( results[blockIdx.x], svec_data[0] );
     }
 #endif
 }
diff --git a/PG-PuReMD/src/cuda_reduction.h b/PG-PuReMD/src/cuda_reduction.h
index a494cc32..0960018e 100644
--- a/PG-PuReMD/src/cuda_reduction.h
+++ b/PG-PuReMD/src/cuda_reduction.h
@@ -8,6 +8,7 @@
 #define  FINAL    1
 
 
+CUDA_GLOBAL void k_reduction_int( const int *, int *, const size_t );
 CUDA_GLOBAL void k_reduction( const real *, real *, const size_t );
 CUDA_GLOBAL void k_reduction_rvec( rvec *, rvec *, size_t );
 CUDA_GLOBAL void k_reduction_rvec2( rvec2 *, rvec2 *, size_t );
diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda_reset_tools.cu
index 66ee2cbd..3da7a9c5 100644
--- a/PG-PuReMD/src/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda_reset_tools.cu
@@ -42,7 +42,6 @@ void Cuda_Reset_Workspace( reax_system *system, storage *workspace )
 
 CUDA_GLOBAL void k_reset_hindex( reax_atom *my_atoms, int N )
 {
-    int Hindex = 0;
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i >= N)
diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda_valence_angles.cu
index dfe9e769..c0936fc0 100644
--- a/PG-PuReMD/src/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda_valence_angles.cu
@@ -139,25 +139,16 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
 
     for( pi = start_j; pi < end_j; ++pi )
     {
-
-        //num_thb_intrs = pi * THREE_BODY_OFFSET;
-        //Dev_Set_Start_Index( pi, num_thb_intrs, thb_intrs );
         num_thb_intrs = Dev_Start_Index( pi, thb_intrs );
 
         pbond_ij = &(bonds->select.bond_list[pi]);
         bo_ij = &(pbond_ij->bo_data);
         BOA_ij = bo_ij->BO - control->thb_cut;
 
-        //TODO REMOVE THIS
-        //TODO REMOVE THIS
-        //TODO REMOVE THIS
-        //TODO REMOVE THIS
-        //TODO REMOVE THIS
-
-        if( BOA_ij/*bo_ij->BO*/ > 0.0 &&
+        //if( BOA_ij/*bo_ij->BO*/ > 0.0) {
+        if ( BOA_ij > 0.0 &&
                 ( j < n || pbond_ij->nbr < n ) )
         {
-            //      if( BOA_ij/*bo_ij->BO*/ > 0.0) {
             i = pbond_ij->nbr;
             r_ij = pbond_ij->d;
             type_i = my_atoms[i].type;
@@ -245,7 +236,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                 ++num_thb_intrs;
 
                 if ( j < n && BOA_jk > 0.0 &&
-                        bo_ij->BO * bo_jk->BO > SQR(control->thb_cut)/*0*/ )
+                        bo_ij->BO * bo_jk->BO > SQR(control->thb_cut) )
                 {
                     r_jk = pbond_jk->d;
                     thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
@@ -350,12 +341,12 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                                need to prevent all energies becoming duplicates */
                             if ( pk < pi )
                             {
-                                data_e_pen [j] += e_pen =
-                                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                                e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                                data_e_pen[j] += e_pen;
                             }
 
                             CEpen1 = e_pen * Cf9j / f9_Dj;
-                            temp   = -2.0 * p_pen2 * e_pen;
+                            temp = -2.0 * p_pen2 * e_pen;
                             CEpen2 = temp * (BOA_ij - 2.0);
                             CEpen3 = temp * (BOA_jk - 2.0);
                             /* END PENALTY ENERGY */
@@ -372,12 +363,13 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
                             if ( pk < pi )
                             {
 
-                                data_e_coa [j] += e_coa =
+                                e_coa =
                                     p_coa1 / (1. + exp_coa2) *
                                     EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
                                     EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
                                     EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) *
                                     EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                data_e_coa [j] += e_coa;
                             }
 
                             CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
@@ -561,7 +553,7 @@ CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms,
             }
         }
 
-        Dev_Set_End_Index(pi, num_thb_intrs, thb_intrs );
+        Dev_Set_End_Index( pi, num_thb_intrs, thb_intrs );
     }
 }
 
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 8bdc36a6..b050392a 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -1037,8 +1037,7 @@ int Cuda_Init_Lists( reax_system *system, control_params *control,
 #endif
 
     /* 3bodies list */
-    thbody = (int *) host_scratch;
-    memset( thbody, 0, sizeof(int) * (*dev_lists + BONDS)->num_intrs );
+    thbody = (int *) scratch;
 
     Cuda_Estimate_Storages_Three_Body( system, control, dev_lists,
             &total_3body, thbody );
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 5220aa58..b5681c33 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -156,7 +156,7 @@ int Cuda_Post_Evolve( reax_system* system, control_params* control,
 
 
 #ifdef HAVE_CUDA
-void init_blocks(reax_system *system)
+void init_blocks( reax_system *system )
 {
     compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
     compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index 0ddb0523..7876218f 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -1095,6 +1095,7 @@ typedef struct
     int max_sparse_entries;
     /**/
     int num_thbodies;
+    int *d_num_thbodies;
     //TODO: move to reax_atom
     int max_hbonds;
 } reax_system;
-- 
GitLab