From 792df8c2a3a202b7a9b939ffdb061ef87ed92e23 Mon Sep 17 00:00:00 2001
From: KAO <ohearnk@seawolf.cis.gvsu.edu>
Date: Thu, 24 Nov 2016 10:58:31 -0500
Subject: [PATCH] PuReMD-GPU: code clean-up, and begin updating build system.

---
 PuReMD-GPU/Makefile.am                    |   8 +-
 PuReMD-GPU/configure.ac                   |   6 +-
 PuReMD-GPU/src/GMRES.cu                   |   3 +-
 PuReMD-GPU/src/allocate.cu                |  16 +-
 PuReMD-GPU/src/bond_orders.cu             |   2 +-
 PuReMD-GPU/src/bond_orders.h              |   3 +
 PuReMD-GPU/src/cuda_copy.cu               |   5 +-
 PuReMD-GPU/src/cuda_copy.h                |   2 +-
 PuReMD-GPU/src/cuda_helpers.h             |  40 +++--
 PuReMD-GPU/src/cuda_init.cu               |  16 +-
 PuReMD-GPU/src/cuda_utils.cu              |   5 +-
 PuReMD-GPU/src/cuda_utils.h               |  10 +-
 PuReMD-GPU/src/four_body_interactions.cu  |  36 ++---
 PuReMD-GPU/src/init_md.cu                 |   8 +-
 PuReMD-GPU/src/list.h                     |   4 +-
 PuReMD-GPU/src/mytypes.h                  | 182 +++++++++++-----------
 PuReMD-GPU/src/reduction.cu               |   2 +-
 PuReMD-GPU/src/single_body_interactions.h |   3 +-
 PuReMD-GPU/src/system_props.cu            |   4 +-
 PuReMD-GPU/src/system_props.h             |   4 +-
 PuReMD-GPU/src/two_body_interactions.h    |   3 +-
 21 files changed, 183 insertions(+), 179 deletions(-)

diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index d4332370..1a914816 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -18,12 +18,12 @@ NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict
 #NVCCFLAGS += --ptxas-options -v
 
 bin_PROGRAMS = bin/puremd-gpu
-bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/reset_utils.c src/param.c src/pdb_tools.c \
+bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/restart.c src/param.c src/pdb_tools.c \
 	src/GMRES.cu src/QEq.cu src/allocate.cu src/bond_orders.cu \
 	src/box.cu src/forces.cu src/four_body_interactions.cu \
 	src/grid.cu src/init_md.cu src/integrate.cu src/list.cu \
 	src/lookup.cu src/neighbors.cu \
-	src/restart.cu src/single_body_interactions.cu \
+	src/reset_utils.cu src/single_body_interactions.cu \
 	src/system_props.cu src/three_body_interactions.cu \
 	src/traj.cu src/two_body_interactions.cu src/vector.cu \
 	src/testmd.cu \
@@ -31,12 +31,12 @@ bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/reset_utils.c src/p
 	src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu
 
 include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
-        src/reset_utils.h src/param.h src/pdb_tools.h \
+        src/restart.h src/param.h src/pdb_tools.h \
 	src/GMRES.h src/QEq.h src/allocate.h src/bond_orders.h \
 	src/box.h src/forces.h src/four_body_interactions.h \
 	src/grid.h src/init_md.h src/integrate.h src/list.h \
 	src/lookup.h src/neighbors.h \
-	src/restart.h src/single_body_interactions.h \
+	src/reset_utils.h src/single_body_interactions.h \
 	src/system_props.h src/three_body_interactions.h \
 	src/traj.h src/two_body_interactions.h src/vector.h \
 	src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/reduction.h \
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index c947ed02..9736714d 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -50,7 +50,7 @@ AC_SEARCH_LIBS([cublasDdot], [cublas])
 AC_SEARCH_LIBS([cudaThreadSynchronize], [cuda])
 AC_SEARCH_LIBS([cudaCheckError], [cuda])
 # FIXME: Replace `main' with a function in `-lcudart':
-#AC_CHECK_LIB([cudart], [main])
+AC_CHECK_LIB([cudart], [main])
 AC_SEARCH_LIBS([cusparseCheckError], [cusparse])
 AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
 AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
@@ -89,10 +89,6 @@ AC_CHECK_TYPES([cusparseHandle_t], [],
 	       [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
 AC_CHECK_TYPES([cusparseMatDescr_t], [], 
 	       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#AC_CHECK_TYPES([CUSPARSE_MATRIX_TYPE_GENERAL], [], 
-#	       [AC_MSG_FAILURE([CUSPARSE_MATRIX_TYPE_GENERAL type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#AC_CHECK_TYPES([CUSPARSE_INDEX_BASE_ZERO], [], 
-#	       [AC_MSG_FAILURE([CUSPARSE_INDEX_BASE_ZERO type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
 
 AC_CONFIG_FILES([Makefile])
 
diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu
index 2c1b0cf6..1c5c28c8 100644
--- a/PuReMD-GPU/src/GMRES.cu
+++ b/PuReMD-GPU/src/GMRES.cu
@@ -480,11 +480,10 @@ int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real t
     real cc, tmp1, tmp2, temp, bnorm;
     real v_add_tmp;
     sparse_matrix *H = &workspace->H;
-
     real t_start, t_elapsed;
-
     real *spad = (real *)scratch;
     real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
+    cublasHandle_t cublasHandle;
 
     N = H->n;
 
diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/allocate.cu
index 37b80693..bae0dde8 100644
--- a/PuReMD-GPU/src/allocate.cu
+++ b/PuReMD-GPU/src/allocate.cu
@@ -26,8 +26,8 @@
 
 void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
-    Delete_List( far_nbrs );
-    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){
+    Delete_List( far_nbrs, TYP_HOST );
+    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_HOST )){
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
         exit( INIT_ERR );
     }
@@ -153,7 +153,7 @@ int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top,
         hb_top[i] += hb_top[i-1];
     num_hbonds = hb_top[n-1];
 
-    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) {
+    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds, TYP_HOST ) ) {
         fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
         exit( INIT_ERR );
     }
@@ -238,7 +238,7 @@ int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
         if( h_index[i] >= 0 )
             hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
 
-    Delete_List( hbonds );
+    Delete_List( hbonds, TYP_HOST );
 
     Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
 
@@ -336,7 +336,7 @@ int Allocate_Bond_List( int n, int *bond_top, list *bonds )
         bond_top[i] += bond_top[i-1];
     num_bonds = bond_top[n-1];
 
-    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) {
+    if( !Make_List(n, num_bonds, TYP_BOND, bonds, TYP_HOST ) ) {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         exit( INIT_ERR );
     }
@@ -372,7 +372,7 @@ int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
         bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
     }
 
-    Delete_List( bonds );
+    Delete_List( bonds, TYP_HOST );
 
     Allocate_Bond_List( n, bond_top, bonds );
     *num_bonds = bond_top[n-1];
@@ -585,14 +585,14 @@ void Reallocate( reax_system *system, static_storage *workspace, list **lists,
 
     if( realloc->num_3body > 0 ) {
         fprintf (stderr, " Reallocating 3Body \n");
-        Delete_List( (*lists)+THREE_BODIES );
+        Delete_List( (*lists)+THREE_BODIES, TYP_HOST );
 
         if( num_bonds == -1 )
             num_bonds = ((*lists)+BONDS)->num_intrs;
         realloc->num_3body *= SAFE_ZONE;
 
         if( !Make_List( num_bonds, realloc->num_3body,
-                    TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) {
+                    TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST ) ) {
             fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
             exit( INIT_ERR );
         }
diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.cu
index 57f5baac..d39f1b41 100644
--- a/PuReMD-GPU/src/bond_orders.cu
+++ b/PuReMD-GPU/src/bond_orders.cu
@@ -19,12 +19,12 @@
   ----------------------------------------------------------------------*/
 
 #include "bond_orders.h"
+
 #include "list.h"
 #include "lookup.h"
 #include "print_utils.h"
 #include "vector.h"
 
-
 #include "index_utils.h"
 #include "cuda_utils.h"
 #include "cuda_helpers.h"
diff --git a/PuReMD-GPU/src/bond_orders.h b/PuReMD-GPU/src/bond_orders.h
index 19fea911..3987a5c6 100644
--- a/PuReMD-GPU/src/bond_orders.h
+++ b/PuReMD-GPU/src/bond_orders.h
@@ -21,8 +21,10 @@
 #ifndef __BOND_ORDERS_H_
 #define __BOND_ORDERS_H_
 
+
 #include "mytypes.h"
 
+
 typedef struct
 {
     real C1dbo, C2dbo, C3dbo;
@@ -31,6 +33,7 @@ typedef struct
     real C1dDelta, C2dDelta, C3dDelta;
 } dbond_coefficients;
 
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system*, list**, int, int, real, rvec* );
 void Get_dBOpinpi2( reax_system*, list**, int, int, real, real, rvec*, rvec* );
diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu
index 2db79e37..a29d38d3 100644
--- a/PuReMD-GPU/src/cuda_copy.cu
+++ b/PuReMD-GPU/src/cuda_copy.cu
@@ -18,12 +18,11 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-
-
-
 #include "cuda_copy.h"
+
 #include "vector.h"
 
+
 void Sync_Host_Device (grid *host, grid *dev, enum cudaMemcpyKind dir)
 {
     copy_host_device (host->top, dev->top, 
diff --git a/PuReMD-GPU/src/cuda_copy.h b/PuReMD-GPU/src/cuda_copy.h
index 561a49fb..6f2509e8 100644
--- a/PuReMD-GPU/src/cuda_copy.h
+++ b/PuReMD-GPU/src/cuda_copy.h
@@ -24,7 +24,7 @@
 #define __CUDA_COPY_H_
 
 #include "cuda_utils.h"
-#include "cuda.h"
+
 #include "mytypes.h"
 #include "list.h"
 
diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h
index e021cf84..adbf73ce 100644
--- a/PuReMD-GPU/src/cuda_helpers.h
+++ b/PuReMD-GPU/src/cuda_helpers.h
@@ -21,9 +21,11 @@
 #ifndef __CUDA_HELPERS__
 #define __CUDA_HELPERS__
 
+
 #include "mytypes.h"
 
-DEVICE inline int cuda_strcmp (char *a, char *b, int len)
+
+DEVICE static inline int cuda_strcmp(char *a, char *b, int len)
 {
     char *src, *dst;
 
@@ -32,20 +34,26 @@ DEVICE inline int cuda_strcmp (char *a, char *b, int len)
 
     for (int i = 0; i < len; i++)
     {
-
         if (*dst == '\0')
+        {
             return 0;
+        }
 
-        if (*src != *dst)  return 1;
+        if (*src != *dst)
+        {
+            return 1;
+        }
 
-        src ++;
-        dst ++;
+        src++;
+        dst++;
     }
 
     return 0;
 }
 
-DEVICE inline real atomicAdd(real* address, real val)
+
+#if __CUDA_ARCH__ < 600
+DEVICE static inline real atomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull =
         (unsigned long long int*)address;
@@ -59,19 +67,23 @@ DEVICE inline real atomicAdd(real* address, real val)
     while (assumed != old);
     return __longlong_as_double(old);
 }
+#endif
+
 
-DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
+DEVICE static inline void atomic_rvecAdd( rvec ret, rvec v )
 {
-    atomicAdd ( &ret[0], v[0] );
-    atomicAdd ( &ret[1], v[1] );
-    atomicAdd ( &ret[2], v[2] );
+    atomicAdd( (double*)&ret[0], (double)v[0] );
+    atomicAdd( (double*)&ret[1], (double)v[1] );
+    atomicAdd( (double*)&ret[2], (double)v[2] );
 }
 
-DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
+
+DEVICE static inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
 {
-    atomicAdd ( &ret[0], c * v[0] );
-    atomicAdd ( &ret[1], c * v[1] );
-    atomicAdd ( &ret[2], c * v[2] );
+    atomicAdd( (double*)&ret[0], (double)(c * v[0]) );
+    atomicAdd( (double*)&ret[1], (double)(c * v[1]) );
+    atomicAdd( (double*)&ret[2], (double)(c * v[2]) );
 }
 
+
 #endif
diff --git a/PuReMD-GPU/src/cuda_init.cu b/PuReMD-GPU/src/cuda_init.cu
index 09515038..6bad5e22 100644
--- a/PuReMD-GPU/src/cuda_init.cu
+++ b/PuReMD-GPU/src/cuda_init.cu
@@ -18,15 +18,15 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-
-
-
 #include "cuda_init.h"
+
 #include "cuda_utils.h"
 #include "cuda_copy.h"
+
 #include "vector.h"
 #include "reset_utils.h"
 
+
 void Cuda_Init_System ( reax_system *system)
 {
     cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS );    
@@ -55,17 +55,20 @@ void Cuda_Init_System ( reax_system *system)
     system->reaxprm.d_gp.vdw_type = 0;
 }
 
+
 void Cuda_Init_Control (control_params *control)
 {
     cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS );
     copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
 }
 
+
 void Cuda_Init_Simulation_Data (simulation_data *data)
 {
     cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA );
 }
 
+
 GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N)
 {
     int index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -80,6 +83,7 @@ GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N)
     nbrs_cp[index][2] = -1;
 }
 
+
 void Cuda_Init_Grid (grid *host, grid *dev)
 {
     int total = host->ncell[0] * host->ncell[1] * host->ncell[2];
@@ -112,6 +116,7 @@ void Cuda_Init_Grid (grid *host, grid *dev)
     cudaCheckError ();
 }
 
+
 GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms, 
         static_storage workspace, int N)
 {
@@ -127,6 +132,7 @@ GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms
     workspace.b[i+N] = -1.0;
 }
 
+
 GLOBAL void Init_Map_Serials (int *input, int N)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -135,6 +141,7 @@ GLOBAL void Init_Map_Serials (int *input, int N)
     input[i] = -1;
 }
 
+
 void Cuda_Init_Workspace_System (reax_system *system, static_storage *workspace )
 {
     int blocks, block_size = BLOCK_SIZE;
@@ -262,6 +269,7 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control,
     Cuda_Reset_Workspace( system, workspace );
 }
 
+
 void Cuda_Init_Workspace_Device ( static_storage *workspace )
 {
     workspace->realloc.estimate_nbrs = -1;
@@ -273,6 +281,7 @@ void Cuda_Init_Workspace_Device ( static_storage *workspace )
     workspace->realloc.gcell_atoms = -1;
 }
 
+
 void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N)
 {
     cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX );
@@ -284,6 +293,7 @@ void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N)
 
 }
 
+
 void Cuda_Init_Scratch ()
 {
     cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH );
diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu
index 2c632c05..fc8b132a 100644
--- a/PuReMD-GPU/src/cuda_utils.cu
+++ b/PuReMD-GPU/src/cuda_utils.cu
@@ -18,11 +18,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-
-
-
 #include "cuda_utils.h"
-#include "mytypes.h"
+
 
 void cuda_malloc (void **ptr, int size, int memset, int err_code) {
 
diff --git a/PuReMD-GPU/src/cuda_utils.h b/PuReMD-GPU/src/cuda_utils.h
index ba793e40..363b7c14 100644
--- a/PuReMD-GPU/src/cuda_utils.h
+++ b/PuReMD-GPU/src/cuda_utils.h
@@ -21,14 +21,14 @@
 #ifndef __CUDA_UTILS_H_
 #define __CUDA_UTILS_H_
 
-#include "cuda.h"
-#include "cublas_v2.h"
-#include "cusparse_v2.h"
-#include "stdlib.h"
-#include "stdio.h"
+#include "mytypes.h"
+
+#include <stdlib.h>
+#include <stdio.h>
 
 #define IDX2C(i,j,ld) (((j)*(ld))+(i))
 
+
 static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta)
 {
     cublasSscal (handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/four_body_interactions.cu
index d7bf757e..af134bab 100644
--- a/PuReMD-GPU/src/four_body_interactions.cu
+++ b/PuReMD-GPU/src/four_body_interactions.cu
@@ -993,33 +993,23 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) {
                                     /* dcos_theta_ijk */
                                     //PERFORMANCE IMPACT
-                                    atomic_rvecScaledAdd (pbond_ij->i_f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                    atomic_rvecScaledAdd( pbond_jk->k_f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_di );
+                                    atomic_rvecScaledAdd( pbond_ij->i_f, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                    atomic_rvecScaledAdd( pbond_jk->k_f, CEtors7 + CEconj4, p_ijk->dcos_di );
 
 
                                     /* dcos_theta_jkl */
                                     //PERFORMANCE IMPACT
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_di );
-                                    atomic_rvecScaledAdd( pbond_jk->i_f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                    atomic_rvecScaledAdd( pbond_kl->k_f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors8 + CEconj5, p_jkl->dcos_di );
+                                    atomic_rvecScaledAdd( pbond_jk->i_f, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    atomic_rvecScaledAdd( pbond_kl->k_f, CEtors8 + CEconj5, p_jkl->dcos_dk );
 
                                     /* dcos_omega */
                                     //PERFORMANCE IMPACT
-                                    atomic_rvecScaledAdd( pbond_ij->i_f, 
-                                            CEtors9 + CEconj6, dcos_omega_di );
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors9 + CEconj6, dcos_omega_dj );
-                                    atomic_rvecScaledAdd( pbond_jk->i_f, 
-                                            CEtors9 + CEconj6, dcos_omega_dk );
-                                    atomic_rvecScaledAdd( pbond_kl->k_f, 
-                                            CEtors9 + CEconj6, dcos_omega_dl );
+                                    atomic_rvecScaledAdd( pbond_ij->i_f, CEtors9 + CEconj6, dcos_omega_di );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors9 + CEconj6, dcos_omega_dj );
+                                    atomic_rvecScaledAdd( pbond_jk->i_f, CEtors9 + CEconj6, dcos_omega_dk );
+                                    atomic_rvecScaledAdd( pbond_kl->k_f, CEtors9 + CEconj6, dcos_omega_dl );
                                 }
                                 else {
                                     ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
@@ -1033,8 +1023,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                     //rvec_Add (sh_press [threadIdx.x], ext_press);
 
                                     //PERFORMANCE IMPACT
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors7 + CEconj4, p_ijk->dcos_dj );
 
                                     rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
                                     //PERFORMANCE IMPACT
@@ -1047,8 +1036,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
 
                                     /* dcos_theta_jkl */
                                     //PERFORMANCE IMPACT
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_di );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors8 + CEconj5, p_jkl->dcos_di );
 
                                     rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
                                     //PERFORMANCE IMPACT
diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu
index e1912d3c..87b84a76 100644
--- a/PuReMD-GPU/src/init_md.cu
+++ b/PuReMD-GPU/src/init_md.cu
@@ -967,7 +967,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
             fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
 #endif
 
-            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) {
+            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS), TYP_HOST ) {
                 fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
                 exit( INIT_ERR );
             }
@@ -1036,7 +1036,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
 #endif
 
             /* 3bodies list */
-            if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) {
+            if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES), TYP_HOST) {
                 fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
                 exit( INIT_ERR );
             }
@@ -1046,12 +1046,12 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
                     num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
 #endif
 #ifdef TEST_FORCES
-            if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) {
+            if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA, TYP_HOST )) {
                 fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
                 exit( INIT_ERR );
             }
 
-            if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
+            if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO, TYP_HOST ) ) {
                 fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
                 exit( INIT_ERR );
             }
diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h
index f341c2e2..66456acb 100644
--- a/PuReMD-GPU/src/list.h
+++ b/PuReMD-GPU/src/list.h
@@ -23,8 +23,8 @@
 
 #include "mytypes.h"
 
-HOST char Make_List( int, int, int, list* , int proc = TYP_HOST);
-HOST void Delete_List( list* , int proc = TYP_HOST);
+HOST char Make_List( int, int, int, list* , int );
+HOST void Delete_List( list* , int );
 
 
 inline HOST_DEVICE int Num_Entries(int i, list* l)
diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h
index c7d42ee9..91b9438d 100644
--- a/PuReMD-GPU/src/mytypes.h
+++ b/PuReMD-GPU/src/mytypes.h
@@ -18,8 +18,34 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __MYTYPES_H_
-#define __MYTYPES_H_
+#if !(defined(__MYTYPES_H_) || defined(__CUDA_MYTYPES_H_))
+
+#ifdef __CUDACC__
+  #ifndef __CUDA_MYTYPES_H_
+    #define __CUDA_MYTYPES_H_
+    #define HOST __host__
+    #define DEVICE __device__
+    #define GLOBAL __global__
+    #define HOST_DEVICE __host__ __device__
+
+    #include <cuda.h>
+    #include <cublas_v2.h>
+    #include <cusparse_v2.h>
+  #endif
+#else
+  #ifndef __MYTYPES_H_
+    #define __MYTYPES_H_
+    #define HOST
+    #define DEVICE
+    #define GLOBAL
+    #define HOST_DEVICE
+  #endif
+#endif
+
+#if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_))
+  #define __CONFIG_H_
+  #include "config.h"
+#endif
 
 #include "math.h"
 //#include "random.h"
@@ -30,29 +56,12 @@
 #include "time.h"
 #include "zlib.h"
 
-
 //#define DEBUG_FOCUS
 //#define TEST_FORCES
 //#define TEST_ENERGY
 //#define REORDER_ATOMS  // turns on nbrgen opt by re-ordering atoms
 //#define LGJ
 
-#ifdef __USE_GPU__
-
-#include "cublas_v2.h"
-#include "cusparse_v2.h"
-
-#define HOST __host__
-#define DEVICE __device__
-#define GLOBAL __global__
-#define HOST_DEVICE __host__ __device__
-#else
-#define HOST
-#define DEVICE
-#define GLOBAL
-#define HOST_DEVICE
-#endif
-
 #define EXP    exp
 #define SQRT   sqrt
 #define POW    pow
@@ -140,10 +149,10 @@
 #define RES_GRID_MARK       0x03
 #define RES_GRID_START      0x04
 #define RES_GRID_END        0x05
-#define     RES_GRID_NBRS       0x06
-#define     RES_GRID_NBRS_CP    0x07
+#define RES_GRID_NBRS       0x06
+#define RES_GRID_NBRS_CP    0x07
 
-#define     RES_SYSTEM_ATOMS            0x10
+#define RES_SYSTEM_ATOMS            0x10
 #define RES_SYSTEM_SIMULATION_BOX   0x11
 
 #define RES_REAX_INT_SBP    0x20
@@ -154,58 +163,58 @@
 
 #define RES_SIMULATION_DATA 0x30
 
-#define      RES_STORAGE                        0x401
-#define      RES_STORAGE_HBOND_INDEX        0x402
-#define      RES_STORAGE_TOTAL_BOND_ORDER   0x403
-#define      RES_STORAGE_DELTAP             0x404
-#define      RES_STORAGE_DELTAP_BOC         0x404
-#define      RES_STORAGE_DDELTAP_SELF       0x405
-#define      RES_STORAGE_DELTA              0x406
-#define      RES_STORAGE_DELTA_LP           0x407
-#define      RES_STORAGE_DELTA_LP_TEMP      0x408
-#define      RES_STORAGE_DDELTA_LP          0x409
-#define      RES_STORAGE_DDELTA_LP_TEMP 0x40A
-#define      RES_STORAGE_DELTA_E                0x40B
-#define      RES_STORAGE_DELTA_BOC          0x40C
-#define      RES_STORAGE_NL                 0x40D
-#define      RES_STORAGE_NLP_TEMP           0x40E
-#define      RES_STORAGE_CLP                    0x40F
-#define      RES_STORAGE_CDDELTA                0x410
-#define      RES_STORAGE_VLPEX              0x411
-#define      RES_STORAGE_DROPTOL                0x412
-#define      RES_STORAGE_W                      0x413
-#define      RES_STORAGE_HDIA_INV           0x414
-#define      RES_STORAGE_B                      0x415
-#define      RES_STORAGE_B_S                    0x416
-#define      RES_STORAGE_B_T                    0x417
-#define      RES_STORAGE_B_PRC              0x418
-#define      RES_STORAGE_B_PRM              0x419
-#define      RES_STORAGE_S_T                    0x41A
-#define      RES_STORAGE_S                      0x41B
-#define      RES_STORAGE_T                      0x41C
-#define      RES_STORAGE_Y                      0x41D
-#define      RES_STORAGE_Z                      0x41E
-#define      RES_STORAGE_G                      0x41F
-#define      RES_STORAGE_HS                 0x420
-#define      RES_STORAGE_HC                 0x421
-#define      RES_STORAGE_RN                 0x422
-#define      RES_STORAGE_V                  0x423
-#define      RES_STORAGE_H                      0x424
-#define      RES_STORAGE_R                      0x425
-#define      RES_STORAGE_D                      0x426
-#define      RES_STORAGE_Q                      0x427
-#define      RES_STORAGE_P                      0x428
-#define      RES_STORAGE_A                      0x429
-#define      RES_STORAGE_F_OLD              0x42A
-#define      RES_STORAGE_V_CONST                0x42B
-#define      RES_STORAGE_MARK                   0x42C
-#define      RES_STORAGE_OLD_MARK           0x42D
-#define      RES_STORAGE_X_OLD              0x42E
-#define      RES_STORAGE_NLP                    0x42F
-#define      RES_STORAGE_MAP_SERIALS        0x430
-#define     RES_STORAGE_RESTRICTED          0x431
-#define      RES_STORAGE_RESTRICTED_LIST    0x432
-#define      RES_STORAGE_ORIG_ID                0x433
+#define RES_STORAGE                    0x401
+#define RES_STORAGE_HBOND_INDEX        0x402
+#define RES_STORAGE_TOTAL_BOND_ORDER   0x403
+#define RES_STORAGE_DELTAP             0x404
+#define RES_STORAGE_DELTAP_BOC         0x404
+#define RES_STORAGE_DDELTAP_SELF       0x405
+#define RES_STORAGE_DELTA              0x406
+#define RES_STORAGE_DELTA_LP           0x407
+#define RES_STORAGE_DELTA_LP_TEMP      0x408
+#define RES_STORAGE_DDELTA_LP          0x409
+#define RES_STORAGE_DDELTA_LP_TEMP 0x40A
+#define RES_STORAGE_DELTA_E                0x40B
+#define RES_STORAGE_DELTA_BOC          0x40C
+#define RES_STORAGE_NL                 0x40D
+#define RES_STORAGE_NLP_TEMP           0x40E
+#define RES_STORAGE_CLP                    0x40F
+#define RES_STORAGE_CDDELTA                0x410
+#define RES_STORAGE_VLPEX              0x411
+#define RES_STORAGE_DROPTOL                0x412
+#define RES_STORAGE_W                      0x413
+#define RES_STORAGE_HDIA_INV           0x414
+#define RES_STORAGE_B                      0x415
+#define RES_STORAGE_B_S                    0x416
+#define RES_STORAGE_B_T                    0x417
+#define RES_STORAGE_B_PRC              0x418
+#define RES_STORAGE_B_PRM              0x419
+#define RES_STORAGE_S_T                    0x41A
+#define RES_STORAGE_S                      0x41B
+#define RES_STORAGE_T                      0x41C
+#define RES_STORAGE_Y                      0x41D
+#define RES_STORAGE_Z                      0x41E
+#define RES_STORAGE_G                      0x41F
+#define RES_STORAGE_HS                 0x420
+#define RES_STORAGE_HC                 0x421
+#define RES_STORAGE_RN                 0x422
+#define RES_STORAGE_V                  0x423
+#define RES_STORAGE_H                      0x424
+#define RES_STORAGE_R                      0x425
+#define RES_STORAGE_D                      0x426
+#define RES_STORAGE_Q                      0x427
+#define RES_STORAGE_P                      0x428
+#define RES_STORAGE_A                      0x429
+#define RES_STORAGE_F_OLD              0x42A
+#define RES_STORAGE_V_CONST                0x42B
+#define RES_STORAGE_MARK                   0x42C
+#define RES_STORAGE_OLD_MARK           0x42D
+#define RES_STORAGE_X_OLD              0x42E
+#define RES_STORAGE_NLP                    0x42F
+#define RES_STORAGE_MAP_SERIALS        0x430
+#define RES_STORAGE_RESTRICTED          0x431
+#define RES_STORAGE_RESTRICTED_LIST    0x432
+#define RES_STORAGE_ORIG_ID                0x433
 
 #define RES_CONTROL_PARAMS  0x50
 
@@ -224,7 +233,6 @@
 
 #define RES_SCRATCH                     0x90
 
-
 #define LIST_INDEX                      0x00
 #define  LIST_END_INDEX                 0x01
 #define LIST_FAR_NEIGHBOR_DATA      0x10
@@ -288,7 +296,6 @@
 #define MATVEC_THREADS_PER_ROW              32
 
 
-
 enum {TYP_HOST, TYP_DEVICE};
 
 typedef double real;
@@ -309,7 +316,6 @@ enum {WRITE_ASCII, WRITE_BINARY, RF_N};
 enum {XYZ, PDB, BGF, ASCII_RESTART, BINARY_RESTART, GF_N};
 
 
-
 /* Global params mapping */
 /*
 l[0]  = p_boc1
@@ -352,7 +358,6 @@ l[36] = N/A
 l[37] = version number
 l[38] = p_coa3
 */
-
 typedef struct
 {
     int n_global;
@@ -361,7 +366,6 @@ typedef struct
 } global_parameters;
 
 
-
 typedef struct
 {
     /* Line one in field file */
@@ -405,7 +409,6 @@ typedef struct
 } single_body_parameters;
 
 
-
 /* Two Body Parameters */
 typedef struct
 {
@@ -435,7 +438,6 @@ typedef struct
 } two_body_parameters;
 
 
-
 /* 3-body parameters */
 typedef struct
 {
@@ -458,7 +460,6 @@ typedef struct
 } three_body_header;
 
 
-
 /* hydrogen-bond parameters */
 typedef struct
 {
@@ -466,7 +467,6 @@ typedef struct
 } hbond_parameters;
 
 
-
 /* 4-body parameters */
 typedef struct
 {
@@ -560,7 +560,6 @@ typedef struct
     int   *end;
     ivec  *nbrs;
     rvec  *nbrs_cp;
-
 } grid;
 
 
@@ -768,8 +767,6 @@ typedef struct
     reax_timing timing;
     //CUDA
     reax_timing d_timing;
-
-
     void *d_simulation_data;
 } simulation_data;
 
@@ -837,6 +834,7 @@ typedef struct
     rvec dBO, dBOpi, dBOpi2;
 } dbond_data;
 
+
 typedef struct
 {
     real BO, BO_s, BO_pi, BO_pi2;
@@ -847,6 +845,7 @@ typedef struct
     rvec dBOp, dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
 } bond_order_data;
 
+
 typedef struct
 {
     int nbr;
@@ -886,6 +885,7 @@ typedef struct
     real val;
 } sparse_matrix_entry;
 
+
 typedef struct
 {
     int n, m;
@@ -914,6 +914,7 @@ typedef struct
     int gcell_atoms;
 } reallocate_data;
 
+
 typedef struct
 {
     /* bond order related storage */
@@ -999,7 +1000,6 @@ typedef struct
 } list;
 
 
-
 typedef struct
 {
     FILE *trj;
@@ -1070,12 +1070,12 @@ typedef struct
 } LR_data;
 
 
-
 typedef struct
 {
     real a, b, c, d;
 } cubic_spline_coef;
 
+
 typedef struct
 {
     real xmin, xmax;
@@ -1126,8 +1126,7 @@ typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
         int*);
 
 
-// CUDA structures
-//
+/* CUDA structures */
 extern list *dev_lists;
 extern static_storage *dev_workspace;
 extern LR_lookup_table *d_LR;
@@ -1138,7 +1137,7 @@ extern void *scratch;
 extern int BLOCKS, BLOCKS_POW_2, BLOCK_SIZE;
 extern int MATVEC_BLOCKS;
 
-#ifdef __USE_GPU__
+#ifdef __CUDACC__
 extern cublasStatus_t cublasStatus;
 extern cublasHandle_t cublasHandle;
 
@@ -1148,5 +1147,4 @@ extern cusparseMatDescr_t matdescriptor;
 #endif
 
 
-
 #endif
diff --git a/PuReMD-GPU/src/reduction.cu b/PuReMD-GPU/src/reduction.cu
index 48fb5efc..99eead52 100644
--- a/PuReMD-GPU/src/reduction.cu
+++ b/PuReMD-GPU/src/reduction.cu
@@ -19,8 +19,8 @@
   ----------------------------------------------------------------------*/
 
 #include "reduction.h"
+
 #include "vector.h"
-#include "mytypes.h"
 
 
 GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const size_t n)
diff --git a/PuReMD-GPU/src/single_body_interactions.h b/PuReMD-GPU/src/single_body_interactions.h
index dd266797..b31d9431 100644
--- a/PuReMD-GPU/src/single_body_interactions.h
+++ b/PuReMD-GPU/src/single_body_interactions.h
@@ -21,7 +21,8 @@
 #ifndef __SINGLE_BODY_INTERACTIONS_H_
 #define __SINGLE_BODY_INTERACTIONS_H_
 
-#include <mytypes.h>
+#include "mytypes.h"
+
 
 void LonePair_OverUnder_Coordination_Energy( reax_system*, control_params*,
         simulation_data*, static_storage*,
diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/system_props.cu
index 3ec39134..e0bb2626 100644
--- a/PuReMD-GPU/src/system_props.cu
+++ b/PuReMD-GPU/src/system_props.cu
@@ -100,7 +100,7 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
     //cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0);
     cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH );
 
-    Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+    k_Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
         (system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N);
     cudaThreadSynchronize ();
     cudaCheckError ();
@@ -133,7 +133,7 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
 }
 
 
-GLOBAL void Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
+GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
 {
     extern __shared__ real sdata[];
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h
index d287992f..d152c254 100644
--- a/PuReMD-GPU/src/system_props.h
+++ b/PuReMD-GPU/src/system_props.h
@@ -21,7 +21,7 @@
 #ifndef __SYSTEM_PROP_H_
 #define __SYSTEM_PROP_H_
 
-#include <mytypes.h>
+#include "mytypes.h"
 
 real Get_Time( );
 
@@ -43,7 +43,7 @@ void Compute_Pressure( reax_system*, simulation_data*, static_storage* );
 void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* );
 
 void prep_dev_system (reax_system *system);
-GLOBAL void Compute_Total_Mass (single_body_parameters *, reax_atom *, real *, size_t );
+GLOBAL void k_Compute_Total_Mass (single_body_parameters *, reax_atom *, real *, size_t );
 //GLOBAL void Compute_Kinetic_Energy (single_body_parameters *, reax_atom *, unsigned int , simulation_data *, real *);
 
 
diff --git a/PuReMD-GPU/src/two_body_interactions.h b/PuReMD-GPU/src/two_body_interactions.h
index 41483222..efc19927 100644
--- a/PuReMD-GPU/src/two_body_interactions.h
+++ b/PuReMD-GPU/src/two_body_interactions.h
@@ -21,9 +21,10 @@
 #ifndef __TWO_BODY_INTERACTIONS_H_
 #define __TWO_BODY_INTERACTIONS_H_
 
-#include <mytypes.h>
+#include "mytypes.h"
 #include "index_utils.h"
 
+
 void Bond_Energy( reax_system*, control_params*, simulation_data*,
                   static_storage*, list**, output_controls* );
 void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
-- 
GitLab