From 541940e83c0d6beec7cedb3b1eb8ba617ae9c40d Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu>
Date: Mon, 27 Jun 2016 17:22:52 -0400
Subject: [PATCH] Refactor CUDA storage initialization.

---
 PG-PuReMD/src/cuda_forces.cu | 135 +++++++++++++++++++++++------------
 PG-PuReMD/src/cuda_helpers.h |   6 +-
 2 files changed, 93 insertions(+), 48 deletions(-)

diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu
index 063554bf..195642e2 100644
--- a/PG-PuReMD/src/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda_forces.cu
@@ -57,8 +57,10 @@ CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms,
     reax_atom *atom_i, *atom_j;
 
     i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-
+    if (i >= N)
+    {
+        return;
+    }
 
     //Commented in CUDA_KERNEL
     //for( i = 0; i < N; ++i ) { 
@@ -68,25 +70,29 @@ CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms,
     end_i   = Dev_End_Index(i, &far_nbrs);
     sbp_i = &(sbp[type_i]);
 
-    if( i < n ) { 
+    if( i < n )
+    { 
         local = 1;
         cutoff = control->nonb_cut;
         //++(*Htop);
         atomicAdd (Htop, 1);
         ihb = sbp_i->p_hbond;
     }   
-    else {
+    else
+    {
         local = 0;
         cutoff = control->bond_cut;
         ihb = -1; 
     } 
 
-    for( pj = start_i; pj < end_i; ++pj ) { 
+    for( pj = start_i; pj < end_i; ++pj )
+    { 
         nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
         j = nbr_pj->nbr;
         atom_j = &(my_atoms[j]);
 
-        if (nbr_pj->d <= control->nonb_cut) {
+        if (nbr_pj->d <= control->nonb_cut)
+        {
             type_j = my_atoms[j].type;
             sbp_j = &(sbp[type_j]);
             ihb = sbp_i->p_hbond;
@@ -98,40 +104,56 @@ CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms,
                     && (j < n)
                     && (i > n)
                )
+            {
                 atomicAdd (&hb_top [i], 1);
+            }
 
-            if (i >= n) ihb = -1;
+            if (i >= n)
+            {
+                ihb = -1;
+            }
         }
 
 
 
-        if(nbr_pj->d <= cutoff) {
+        if(nbr_pj->d <= cutoff)
+        {
             type_j = my_atoms[j].type;
             r_ij = nbr_pj->d;
             sbp_j = &(sbp[type_j]);
             twbp = &(tbp[index_tbp (type_i,type_j,num_atom_types)]);
 
-            if( local ) {
+            if( local )
+            {
                 //if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
                 if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
+                {
                     //++(*Htop);
                     atomicAdd (Htop, 1);
+                }
                 else if( j < n || atom_i->orig_id > atom_j->orig_id ) //tryQEq ||1
+                {
                     //++(*Htop);
                     atomicAdd (Htop, 1);
+                }
 
                 if( control->hbond_cut > 0.1 && (ihb==1 || ihb==2) &&
                         nbr_pj->d <= control->hbond_cut 
-                  ) {
+                  )
+                {
                     jhb = sbp_j->p_hbond;
                     if( (ihb == 1) && (jhb == 2))
+                    {
                         //++hb_top[i];
                         atomicAdd (&hb_top[i], 1);
+                    }
                     //else if( j < n && ihb == 2 && jhb == 1 )
                     //else if( ihb == 2 && jhb == 1 && j < n)
                     else if( ihb == 2 && jhb == 1 && j < n)
+                    {
                         //++hb_top[j];
                         atomicAdd (&hb_top[i], 1);
+                    }
                 }
             }
 
@@ -176,8 +198,13 @@ CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms,
 CUDA_GLOBAL void ker_init_system_atoms(reax_atom *my_atoms, int N, 
         int *hb_top, int *bond_top)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
+    int i;
+    
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N)
+    {
+        return;
+    }
 
     my_atoms[i].num_bonds = bond_top [i];
     my_atoms[i].num_hbonds = hb_top [i];
@@ -189,69 +216,83 @@ void Cuda_Estimate_Storages(reax_system *system, control_params *control,
         int *Htop, int *hb_top, 
         int *bond_top, int *num_3body)
 {
+    int i;
     int blocks = 0;
-    int *l_Htop, *l_hb_top, *l_bond_top, *l_num_3body;
-    int *tmp = (int *)scratch;
+    int *d_Htop, *d_hb_top, *d_bond_top, *d_num_3body;
+    int bond_count = 0;
+    int hbond_count = 0;
+    int max_bonds = 0, min_bonds = 999999;
+    int max_hbonds = 0, min_hbonds = 999999;
 
     *Htop = 0;
     //memset( hb_top, 0, sizeof(int) * local_cap);
-    memset( hb_top, 0, sizeof(int) * total_cap);
+    memset( hb_top, 0, sizeof(int) * total_cap );
     memset( bond_top, 0, sizeof(int) * total_cap );
     *num_3body = 0;
 
-    //cuda_memset (tmp, 0, 1 + 1 + sizeof (int) * (local_cap+ total_cap), "Cuda_Estimate_Storages");
-    cuda_memset (tmp, 0, sizeof (int) * (1 + 1 + total_cap+ total_cap), "Cuda_Estimate_Storages");
-
-    l_Htop = tmp; 
-    l_num_3body = l_Htop + 1;
-    l_hb_top = l_num_3body + 1;
-    //l_bond_top = l_hb_top + local_cap;
-    l_bond_top = l_hb_top + total_cap;
+    cuda_memset ( d_Htop, 0, sizeof (int), "Cuda_Estimate_Storages" );
+    cuda_memset ( d_hb_top, 0, sizeof (int) * total_cap, "Cuda_Estimate_Storages" );
+//    cuda_memset ( d_bond_top, 0, sizeof (int) * local_cap, "Cuda_Estimate_Storages" );
+    cuda_memset ( d_bond_top, 0, sizeof (int) * total_cap, "Cuda_Estimate_Storages" );
+    cuda_memset ( d_num_3body, 0, sizeof (int), "Cuda_Estimate_Storages" );
 
-    blocks = system->N / ST_BLOCK_SIZE + 
-        ((system->N % ST_BLOCK_SIZE == 0) ? 0 : 1);
+    blocks = (int) CEIL((real)system->N / ST_BLOCK_SIZE);
 
     ker_estimate_storages <<< blocks, ST_BLOCK_SIZE>>>
         (system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
          (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types,
          system->n, system->N, system->Hcap, system->total_cap, 
-         l_Htop, l_num_3body, l_bond_top, l_hb_top );
+         d_Htop, d_num_3body, d_bond_top, d_hb_top );
     cudaThreadSynchronize ();
     cudaCheckError ();
 
-    copy_host_device( Htop, l_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop");
-    copy_host_device( num_3body, l_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body");
-    //copy_host_device( hb_top, l_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top");
-    copy_host_device( hb_top, l_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top");
-    copy_host_device( bond_top, l_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top");
-
+    copy_host_device( Htop, d_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop");
+    copy_host_device( num_3body, d_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body");
+    //copy_host_device( hb_top, d_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top");
+    copy_host_device( hb_top, d_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top");
+    copy_host_device( bond_top, d_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top");
 
-    int bond_count = 0;
-    int hbond_count = 0;
-    int max_bonds = 0, min_bonds = 999999;
-    int max_hbonds = 0, min_hbonds = 999999;
+    for (i = 0; i < system->N; i++)
+    {
+        if (bond_top[i] >= max_bonds)
+        {
+            max_bonds = bond_top[i];
+        }
+        if (bond_top[i] <= min_bonds)
+        {
+            min_bonds = bond_top[i];
+        }
 
-    for (int i = 0; i < system->N; i++) {
-        if (bond_top[i] >= max_bonds) max_bonds = bond_top[i];
-        if (bond_top[i] <= min_bonds) min_bonds = bond_top[i];
         bond_count += bond_top[i];
     }
     system->max_bonds = max_bonds * SAFER_ZONE;
+
     //for (int i = 0; i < system->n; i++)
-    for (int i = 0; i < system->N; i++){
-        if (hb_top[i] >= max_hbonds) max_hbonds = hb_top[i];
-        if (hb_top[i] <= min_hbonds) min_hbonds = hb_top[i];
+    for (i = 0; i < system->N; i++)
+    {
+        if (hb_top[i] >= max_hbonds)
+        {
+            max_hbonds = hb_top[i];
+        }
+        if (hb_top[i] <= min_hbonds)
+        {
+            min_hbonds = hb_top[i];
+        }
+
         hbond_count += hb_top [i];
     }
     system->max_hbonds = max_hbonds * SAFER_ZONE;
-    //fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count);
-    //fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count);
-    //fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop);
+
+//#if defined(DEBUG)
+    fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count);
+    fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count);
+    fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop);
     fprintf (stderr, "p:%d --> Bonds(%d, %d) HBonds (%d, %d) *******\n", 
             system->my_rank, min_bonds, max_bonds, min_hbonds, max_hbonds);
+//#endif
 
     ker_init_system_atoms <<<blocks, ST_BLOCK_SIZE>>>
-        (system->d_my_atoms, system->N, l_hb_top, l_bond_top );
+        (system->d_my_atoms, system->N, d_hb_top, d_bond_top );
     cudaThreadSynchronize ();
     cudaCheckError ();
 }
@@ -964,7 +1005,7 @@ int Cuda_Validate_Lists (reax_system *system, storage *workspace, reax_list **li
                 return FAILURE;
             }
             if (end_index[i] - index[i] >= max_bonds)
-                max_bonds = index[i] - index[i];
+                max_bonds = end_index[i] - index[i];
         }
         realloc->num_bonds = max_bonds;
 
diff --git a/PG-PuReMD/src/cuda_helpers.h b/PG-PuReMD/src/cuda_helpers.h
index 1fea3e26..0d6282a8 100644
--- a/PG-PuReMD/src/cuda_helpers.h
+++ b/PG-PuReMD/src/cuda_helpers.h
@@ -1,9 +1,9 @@
-
 #ifndef __CUDA_HELPERS__
 #define __CUDA_HELPERS__
 
 #include "reax_types.h"
 
+
 CUDA_DEVICE inline int cuda_strcmp (char *a, char *b, int len)
 {
     char *src, *dst;
@@ -26,6 +26,7 @@ CUDA_DEVICE inline int cuda_strcmp (char *a, char *b, int len)
     return 0;
 }
 
+
 CUDA_DEVICE inline real atomicAdd(real* address, real val)
 {
     unsigned long long int* address_as_ull =
@@ -38,9 +39,11 @@ CUDA_DEVICE inline real atomicAdd(real* address, real val)
                         __double_as_longlong(val + __longlong_as_double(assumed)));
     }
     while (assumed != old);
+
     return __longlong_as_double(old);
 }
 
+
 CUDA_DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
 {
     atomicAdd ( &ret[0], v[0] );
@@ -48,6 +51,7 @@ CUDA_DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
     atomicAdd ( &ret[2], v[2] );
 }
 
+
 CUDA_DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
 {
     atomicAdd ( &ret[0], c * v[0] );
-- 
GitLab