From 3de10e335f390b8af4c48c5a1516f2969bc59fd1 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Tue, 14 Jun 2016 12:55:11 -0400
Subject: [PATCH] Autotools: add support for compiling shared sources between
 NVCC and CC trajectories.

---
 puremd_rc_1003/PG-PuReMD/Makefile.am          |  45 +++---
 puremd_rc_1003/PG-PuReMD/configure.ac         |  46 +++----
 puremd_rc_1003/PG-PuReMD/src/allocate.c       |   8 ++
 puremd_rc_1003/PG-PuReMD/src/center_mass.h    |   2 +-
 .../PG-PuReMD/src/cuda_bond_orders.h          |   2 +-
 puremd_rc_1003/PG-PuReMD/src/cuda_bonds.cu    |   2 +-
 puremd_rc_1003/PG-PuReMD/src/cuda_forces.cu   |   2 +-
 .../PG-PuReMD/src/cuda_hydrogen_bonds.h       |   2 +-
 .../PG-PuReMD/src/cuda_integrate.cu           |   2 +-
 .../PG-PuReMD/src/cuda_linear_solvers.cu      |   2 +-
 puremd_rc_1003/PG-PuReMD/src/cuda_lookup.cu   |   2 +-
 .../PG-PuReMD/src/cuda_multi_body.cu          |   2 +-
 .../PG-PuReMD/src/cuda_neighbors.cu           |   2 +-
 .../PG-PuReMD/src/cuda_nonbonded.cu           |   2 +-
 .../PG-PuReMD/src/cuda_post_evolve.cu         |   2 +-
 puremd_rc_1003/PG-PuReMD/src/cuda_qEq.cu      |   2 +-
 .../PG-PuReMD/src/cuda_reax_constants.h       |  97 -------------
 puremd_rc_1003/PG-PuReMD/src/cuda_shuffle.h   |   2 +-
 .../PG-PuReMD/src/cuda_torsion_angles.h       |   2 +-
 puremd_rc_1003/PG-PuReMD/src/cuda_utils.h     |   2 +-
 .../PG-PuReMD/src/cuda_valence_angles.h       |   2 +-
 puremd_rc_1003/PG-PuReMD/src/dev_list.h       |   2 +-
 puremd_rc_1003/PG-PuReMD/src/dual_matvec.h    |   2 +-
 puremd_rc_1003/PG-PuReMD/src/forces.c         |  12 ++
 puremd_rc_1003/PG-PuReMD/src/grid.c           |   4 +
 puremd_rc_1003/PG-PuReMD/src/index_utils.h    |   1 -
 puremd_rc_1003/PG-PuReMD/src/init_md.c        |  20 ++-
 puremd_rc_1003/PG-PuReMD/src/integrate.c      |   5 +-
 puremd_rc_1003/PG-PuReMD/src/linear_solvers.c |  10 ++
 puremd_rc_1003/PG-PuReMD/src/lookup.c         |   7 +-
 puremd_rc_1003/PG-PuReMD/src/matvec.h         |   2 +-
 puremd_rc_1003/PG-PuReMD/src/parallelreax.c   |  74 ++++++++--
 puremd_rc_1003/PG-PuReMD/src/qEq.c            |   8 ++
 puremd_rc_1003/PG-PuReMD/src/reax_types.h     | 130 +++++++++++++++---
 puremd_rc_1003/PG-PuReMD/src/reduction.h      |   2 +-
 puremd_rc_1003/PG-PuReMD/src/reset_tools.c    |   8 +-
 puremd_rc_1003/PG-PuReMD/src/system_props.c   |  28 ++--
 puremd_rc_1003/PG-PuReMD/src/traj.c           |   9 +-
 puremd_rc_1003/PG-PuReMD/src/validation.cu    |   2 +-
 puremd_rc_1003/PG-PuReMD/src/vector.h         |   2 -
 40 files changed, 337 insertions(+), 221 deletions(-)
 delete mode 100644 puremd_rc_1003/PG-PuReMD/src/cuda_reax_constants.h

diff --git a/puremd_rc_1003/PG-PuReMD/Makefile.am b/puremd_rc_1003/PG-PuReMD/Makefile.am
index f39c038a..9c51de81 100644
--- a/puremd_rc_1003/PG-PuReMD/Makefile.am
+++ b/puremd_rc_1003/PG-PuReMD/Makefile.am
@@ -1,42 +1,36 @@
-if BUILD_CUDA
+if USE_CUDA
 SUFFIXES = .cu
 include cuda.am
 endif
 
-AM_CFLAGS = -Wall -O3 -funroll-loops -fstrict-aliasing
-AM_CPPFLAGS = -I$(srcdir)
-AM_LDFLAGS  = 
-
-if BUILD_CUDA
-AM_CFLAGS += $(CUDA_CFLAGS)
-AM_CPPFLAGS += $(CUDA_CFLAGS)
+AM_CFLAGS = -Wall -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)
+AM_CPPFLAGS =
+AM_LDFLAGS = $(MPI_LDFLAGS)
 
+if USE_CUDA
 # default CUDA nvcc flags
 #   Note: cc 13 for Tesla
 #   Note: cc 20 for Fermi
 #   Note: cc 30 for Kepler K10
 #   Note: cc 35 for Kepler K20
-NVCCFLAGS +=  -D__USE_GPU__ -use_fast_math 
-NVCCFLAGS += -gencode arch=compute_35,code=sm_35 --compiler-options "-O3 -funroll-loops -fstrict-aliasing" -O3
-NVCCFLAGS += --ptxas-options -v
-NVCCFLAGS += --compiler-options -I$(srcdir) 
-NVCCFLAGS += --compiler-options -fno-strict-aliasing
-NVCCFLAGS += --compiler-options -Wno-unused-function
-NVCCFLAGS += --compiler-options -Wno-unused-parameter
-NVCCFLAGS += --compiler-options "$(MPI_CFLAGS)"
+NVCCFLAGS += -use_fast_math 
+NVCCFLAGS += -gencode arch=compute_35,code=sm_35
+NVCCFLAGS += --compiler-options "$(DEFS) -O3 -funroll-loops -fstrict-aliasing $(MPI_CFLAGS)"
+#NVCCFLAGS += --ptxas-options -v
 endif
 
 
 bin_PROGRAMS = src/pg-puremd
 src_pg_puremd_SOURCES = src/allocate.c src/basic_comm.c src/ffield.c src/grid.c src/list.c \
 	src/lookup.c src/io_tools.c src/reset_tools.c src/restart.c src/random.c \
-	src/tool_box.c src/traj.c src/vector.c src/analyze.c src/box.c src/system_props.c \
+	src/tool_box.c src/traj.c src/analyze.c src/box.c src/system_props.c \
 	src/control.c src/comm_tools.c src/geo_tools.c src/linear_solvers.c src/neighbors.c \
 	src/qEq.c src/bond_orders.c src/multi_body.c src/bonds.c src/valence_angles.c \
 	src/hydrogen_bonds.c src/torsion_angles.c src/nonbonded.c src/forces.c \
 	src/integrate.c src/init_md.c src/parallelreax.c
+src_pg_puremd_LDADD = src/vector.o
 
-if BUILD_CUDA
+if USE_CUDA
 src_pg_puremd_SOURCES += src/cuda_utils.cu src/dev_alloc.cu src/cuda_environment.cu \
       src/dev_system_props.cu src/reduction.cu src/center_mass.cu \
       src/cuda_copy.cu src/cuda_reset_tools.cu src/dev_list.cu \
@@ -49,12 +43,21 @@ src_pg_puremd_SOURCES += src/cuda_utils.cu src/dev_alloc.cu src/cuda_environment
 
 # dummy source to cause C linking
 nodist_EXTRA_src_pg_puremd_SOURCES = src/dummy.c
+
+src_vector.o:
+	$(AM_V_NVCC)$(NVCC) $(NVCCFLAGS) -maxrregcount=$(MAX_REG_COUNT) -o src/vector.o -c src/vector.c
+else
+src_vector.o:
+	$(AM_V_CC)$(CC) $(DEFS) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CCFLAGS) \
+	-maxrregcount=$(MAX_REG_COUNT) -o src/vector.o -c src/vector.c
 endif
 
 
-src_pg_puremd_CFLAGS = $(AM_CFLAGS) $(MPI_CFLAGS)
-src_pg_puremd_LDFLAGS = $(AM_LDFLAGS) $(MPI_LDFLAGS)
+src_pg_puremd_CFLAGS = $(AM_CFLAGS) $(CFLAGS)
+src_pg_puremd_CPPFLAGS = $(AM_CPPFLAGS) $(CPPFLAGS)
+src_pg_puremd_LDFLAGS = $(AM_LDFLAGS) $(LDFLAGS)
 
-if BUILD_CUDA
+if USE_CUDA
+src_pg_puremd_CFLAGS += $(CUDA_CFLAGS)
 src_pg_puremd_LDFLAGS += $(CUDA_LIBS)
 endif
diff --git a/puremd_rc_1003/PG-PuReMD/configure.ac b/puremd_rc_1003/PG-PuReMD/configure.ac
index 5a121aac..6f07b964 100644
--- a/puremd_rc_1003/PG-PuReMD/configure.ac
+++ b/puremd_rc_1003/PG-PuReMD/configure.ac
@@ -47,29 +47,6 @@ AC_FUNC_REALLOC
 AC_FUNC_STRTOD
 AC_CHECK_FUNCS([gettimeofday memset])
 
-# Check for CUDA support.
-if test "x$BUILD_GPU" = "xyes"; then
-	CONFIGURE_HEADLINE([ CUDA support ])
-	AX_CUDA
-        NVCCFLAGS=
-	if test "BUILD_DEBUG" = "true"
-	then
-		NVCCFLAGS+=" -g -G"
-	fi
-
-	AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
-fi
-AM_CONDITIONAL([BUILD_CUDA], [test "x${BUILD_GPU}" = "xyes"])
-
-if test "BUILD_PROF" = "true"
-then
-	if test "x$BUILD_GPU" = "xyes"; then
-		NVCCFLAGS+=" --compiler-options ${gprof_flags}"
-	else
-		CFLAGS+=" --compiler-options ${gprof_flags}"
-	fi
-fi
-
 # Check for MPI support.
 CONFIGURE_HEADLINE([ MPI compiler ])
 ACX_MPI([], [AC_MSG_ERROR([could not find mpi library])])
@@ -132,6 +109,29 @@ fi
 AC_SUBST(MPI_CFLAGS)
 AC_SUBST(MPI_LDFLAGS)
 
+# Check for CUDA support.
+if test "x$BUILD_GPU" = "xyes"; then
+	CONFIGURE_HEADLINE([ CUDA support ])
+	AX_CUDA
+        NVCCFLAGS=
+	if test "BUILD_DEBUG" = "true"
+	then
+		NVCCFLAGS+=" -g -G"
+	fi
+	AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
+else
+	AM_CONDITIONAL(USE_CUDA, test "x" = "xyes")
+fi
+
+if test "BUILD_PROF" = "true"
+then
+	if test "x$BUILD_GPU" = "xyes"; then
+		NVCCFLAGS+=" --compiler-options ${gprof_flags}"
+	else
+		CFLAGS+=" ${gprof_flags}"
+	fi
+fi
+
 AC_CONFIG_FILES([Makefile])
 
 AC_OUTPUT
diff --git a/puremd_rc_1003/PG-PuReMD/src/allocate.c b/puremd_rc_1003/PG-PuReMD/src/allocate.c
index a87a259c..5b07ecd8 100644
--- a/puremd_rc_1003/PG-PuReMD/src/allocate.c
+++ b/puremd_rc_1003/PG-PuReMD/src/allocate.c
@@ -390,6 +390,7 @@ void Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs )
   }
 }
 
+#ifdef HAVE_CUDA
 void Cuda_Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs )
 {
   Dev_Delete_List( far_nbrs);
@@ -398,6 +399,7 @@ void Cuda_Reallocate_Neighbor_List( reax_list *far_nbrs, int n, int num_intrs )
     MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
   }
 }
+#endif
 
 
 /*
@@ -480,6 +482,7 @@ int Reallocate_HBonds_List( reax_system *system, reax_list *hbonds )
   return total_hbonds;
 }
 
+#ifdef HAVE_CUDA
 int Cuda_Reallocate_HBonds_List( int n, int num_intrs, reax_list *hbonds )
 {
   Dev_Delete_List( hbonds);
@@ -489,6 +492,7 @@ int Cuda_Reallocate_HBonds_List( int n, int num_intrs, reax_list *hbonds )
   }
   return SUCCESS;
 }
+#endif
 
 int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
 			   int *total_bonds, int *est_3body )
@@ -513,6 +517,7 @@ int Reallocate_Bonds_List( reax_system *system, reax_list *bonds,
   return SUCCESS;
 }
 
+#ifdef HAVE_CUDA
 int Cuda_Reallocate_Bonds_List( int n, int num_intrs, reax_list *bonds)
 {
   Dev_Delete_List( bonds);
@@ -523,6 +528,7 @@ int Cuda_Reallocate_Bonds_List( int n, int num_intrs, reax_list *bonds)
 
   return SUCCESS;
 }
+#endif
 
 
 /*************       grid        *************/
@@ -1053,6 +1059,7 @@ void ReAllocate( reax_system *system, control_params *control,
 #endif
 }
 
+#ifdef HAVE_CUDA
 void Cuda_ReAllocate( reax_system *system, control_params *control, 
 		 simulation_data *data, storage *workspace, reax_list **lists, 
 		 mpi_datatypes *mpi_data )
@@ -1370,3 +1377,4 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
   MPI_Barrier( MPI_COMM_WORLD );
 #endif
 }
+#endif
diff --git a/puremd_rc_1003/PG-PuReMD/src/center_mass.h b/puremd_rc_1003/PG-PuReMD/src/center_mass.h
index 8a40d55c..cf0ef160 100644
--- a/puremd_rc_1003/PG-PuReMD/src/center_mass.h
+++ b/puremd_rc_1003/PG-PuReMD/src/center_mass.h
@@ -3,7 +3,7 @@
 #define __CENTER_MASS_H__
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *,
                             rvec *res_xcm, 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_bond_orders.h b/puremd_rc_1003/PG-PuReMD/src/cuda_bond_orders.h
index cabca8c9..fbda3b76 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_bond_orders.h
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_bond_orders.h
@@ -3,7 +3,7 @@
 #define __CUDA_BOND_ORDERS_H__
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 #include "vector.h"
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_bonds.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_bonds.cu
index 20b9d7ea..e3c7bdec 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_bonds.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_bonds.cu
@@ -21,7 +21,7 @@
 
 #include "reax_types.h"
 #include "index_utils.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "dev_list.h"
 
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_forces.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_forces.cu
index 7c47cb6f..4244a3b5 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_forces.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_forces.cu
@@ -1,7 +1,7 @@
 
 #include "cuda_forces.h"
 
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "reax_types.h"
 #include "dev_list.h"
 #include "list.h"
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_hydrogen_bonds.h b/puremd_rc_1003/PG-PuReMD/src/cuda_hydrogen_bonds.h
index c8fcdfb4..8ec471be 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_hydrogen_bonds.h
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_hydrogen_bonds.h
@@ -23,7 +23,7 @@
 #define __HBONDS_H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *,
 														storage ,
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_integrate.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_integrate.cu
index 92bffd71..6bcdee88 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_integrate.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_integrate.cu
@@ -1,6 +1,6 @@
 
 #include "cuda_integrate.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 #include "vector.h"
 #include "cuda_utils.h"
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_linear_solvers.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_linear_solvers.cu
index c0803f91..0a2a1535 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_linear_solvers.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_linear_solvers.cu
@@ -21,7 +21,7 @@
 
 #include "cuda_linear_solvers.h"
 
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "cuda_utils.h"
 #include "reduction.h"
 #include "dual_matvec.h"
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_lookup.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_lookup.cu
index 887ec249..e3a6d1e6 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_lookup.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_lookup.cu
@@ -2,7 +2,7 @@
 #include "cuda_lookup.h"
 #include "index_utils.h"
 #include "cuda_utils.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 
 void copy_LR_table_to_device (reax_system *system, control_params *control, int *aggregated)
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_multi_body.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_multi_body.cu
index 0e6c48d3..174130aa 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_multi_body.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_multi_body.cu
@@ -19,7 +19,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "cuda_multi_body.h"
 #include "index_utils.h"
 #include "cuda_helpers.h"
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_neighbors.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_neighbors.cu
index b0d2a967..4bf1d9fb 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_neighbors.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_neighbors.cu
@@ -24,7 +24,7 @@
 #include "vector.h"
 
 #include "index_utils.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "cuda_utils.h"
 #include "tool_box.h"
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_nonbonded.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_nonbonded.cu
index 9d0cda8f..c9948dfb 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_nonbonded.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_nonbonded.cu
@@ -20,7 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "cuda_nonbonded.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "index_utils.h"
 #include "dev_list.h"
 #include "vector.h"
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_post_evolve.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_post_evolve.cu
index ea7d8b7d..79a26fd8 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_post_evolve.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_post_evolve.cu
@@ -1,6 +1,6 @@
 
 #include "cuda_post_evolve.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "vector.h"
 #include "cuda_utils.h"
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_qEq.cu b/puremd_rc_1003/PG-PuReMD/src/cuda_qEq.cu
index d5a939bf..d4fed53c 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_qEq.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_qEq.cu
@@ -21,7 +21,7 @@
 
 #include "cuda_qEq.h"
 
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "reduction.h"
 #include "cuda_utils.h"
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_reax_constants.h b/puremd_rc_1003/PG-PuReMD/src/cuda_reax_constants.h
deleted file mode 100644
index 2532170f..00000000
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_reax_constants.h
+++ /dev/null
@@ -1,97 +0,0 @@
-
-#ifndef	__CUDA_REAX_CONSTANTS_H__
-#define  __CUDA_REAX_CONSTANTS_H__
-
-#include "cuda.h"
-
-#ifdef __USE_GPU__
-#define CUDA_HOST __host__
-#define CUDA_DEVICE __device__
-#define CUDA_GLOBAL __global__
-#define CUDA_HOST_DEVICE __host__ __device__
-#else
-#define CUDA_HOST
-#define CUDA_DEVICE
-#define CUDA_GLOBAL
-#define CUDA_HOST_DEVICE
-#endif
-
-//BLOCK SIZES for kernels
-//
-#define				HB_SYM_BLOCK_SIZE					64
-#define				HB_KER_SYM_THREADS_PER_ATOM			16
-#define				HB_POST_PROC_BLOCK_SIZE				256
-#define				HB_POST_PROC_KER_THREADS_PER_ATOM	32
-
-#if defined( __INIT_BLOCK_SIZE__)
-#define				DEF_BLOCK_SIZE						__INIT_BLOCK_SIZE__    /* all utility functions and all */
-#define				CUDA_BLOCK_SIZE						__INIT_BLOCK_SIZE__    	/* init forces */
-#define				ST_BLOCK_SIZE						__INIT_BLOCK_SIZE__    
-#else
-#define				DEF_BLOCK_SIZE						256						/* all utility functions and all */
-#define				CUDA_BLOCK_SIZE						256						/* init forces */
-#define				ST_BLOCK_SIZE						256	
-#endif
-
-
-#if defined( __NBRS_THREADS_PER_ATOM__ )
-#define				NB_KER_THREADS_PER_ATOM				__NBRS_THREADS_PER_ATOM__
-#else
-#define				NB_KER_THREADS_PER_ATOM				16
-#endif
-
-#if defined( __NBRS_BLOCK_SIZE__)
-#define				NBRS_BLOCK_SIZE						__NBRS_BLOCK_SIZE__
-#else
-#define				NBRS_BLOCK_SIZE						256
-#endif
-
-
-#if defined( __HB_THREADS_PER_ATOM__)
-#define				HB_KER_THREADS_PER_ATOM				__HB_THREADS_PER_ATOM__
-#else
-#define				HB_KER_THREADS_PER_ATOM				32
-#endif
-
-#if defined(__HB_BLOCK_SIZE__)
-#define				HB_BLOCK_SIZE					__HB_BLOCK_SIZE__
-#else
-#define				HB_BLOCK_SIZE						256
-#endif
-
-
-#if defined( __VDW_THREADS_PER_ATOM__ )
-#define				VDW_KER_THREADS_PER_ATOM			__VDW_THREADS_PER_ATOM__
-#else
-#define				VDW_KER_THREADS_PER_ATOM			32
-#endif
-
-#if defined( __VDW_BLOCK_SIZE__)
-#define				VDW_BLOCK_SIZE						__VDW_BLOCK_SIZE__
-#else
-#define				VDW_BLOCK_SIZE						256
-#endif
-
-
-#if defined( __MATVEC_THREADS_PER_ROW__ )
-#define 			MATVEC_KER_THREADS_PER_ROW		__MATVEC_THREADS_PER_ROW__
-#else
-#define 			MATVEC_KER_THREADS_PER_ROW		32
-#endif
-
-
-#if defined( __MATVEC_BLOCK_SIZE__)
-#define				MATVEC_BLOCK_SIZE					__MATVEC_BLOCK_SIZE__
-#else
-#define				MATVEC_BLOCK_SIZE					512
-#endif
-
-
-
-
-
-
-//Validation
-#define				GPU_TOLERANCE				1e-5
-
-#endif
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_shuffle.h b/puremd_rc_1003/PG-PuReMD/src/cuda_shuffle.h
index d000864f..e0302f13 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_shuffle.h
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_shuffle.h
@@ -23,7 +23,7 @@
 #define __CUDA_SHUFFLE_H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 #ifdef __cplusplus
 extern "C"  {
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_torsion_angles.h b/puremd_rc_1003/PG-PuReMD/src/cuda_torsion_angles.h
index aa074756..bb1542f8 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_torsion_angles.h
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_torsion_angles.h
@@ -23,7 +23,7 @@
 #define __TORSION_ANGLES_H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *, 
                                        global_parameters , 
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_utils.h b/puremd_rc_1003/PG-PuReMD/src/cuda_utils.h
index 86975a3e..c2536b39 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_utils.h
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_utils.h
@@ -6,7 +6,7 @@
 #include "stdlib.h"
 #include "stdio.h"
 
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 #ifdef __cplusplus
 extern "C"  {
diff --git a/puremd_rc_1003/PG-PuReMD/src/cuda_valence_angles.h b/puremd_rc_1003/PG-PuReMD/src/cuda_valence_angles.h
index 56a93103..fadfc398 100644
--- a/puremd_rc_1003/PG-PuReMD/src/cuda_valence_angles.h
+++ b/puremd_rc_1003/PG-PuReMD/src/cuda_valence_angles.h
@@ -23,7 +23,7 @@
 #define __VALENCE_ANGLES_H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "vector.h"
 
 CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *, 
diff --git a/puremd_rc_1003/PG-PuReMD/src/dev_list.h b/puremd_rc_1003/PG-PuReMD/src/dev_list.h
index 58523367..2d5c75c5 100644
--- a/puremd_rc_1003/PG-PuReMD/src/dev_list.h
+++ b/puremd_rc_1003/PG-PuReMD/src/dev_list.h
@@ -23,7 +23,7 @@
 #define __LIST_H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/puremd_rc_1003/PG-PuReMD/src/dual_matvec.h b/puremd_rc_1003/PG-PuReMD/src/dual_matvec.h
index 8298a972..c5b93948 100644
--- a/puremd_rc_1003/PG-PuReMD/src/dual_matvec.h
+++ b/puremd_rc_1003/PG-PuReMD/src/dual_matvec.h
@@ -4,7 +4,7 @@
 #define __DUAL_MATVEC__H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 CUDA_GLOBAL void k_dual_matvec (sparse_matrix , rvec2 *, rvec2 *, int );
 CUDA_GLOBAL void k_dual_matvec_csr(sparse_matrix , rvec2 *, rvec2 *, int );
diff --git a/puremd_rc_1003/PG-PuReMD/src/forces.c b/puremd_rc_1003/PG-PuReMD/src/forces.c
index 887a1470..d0f720a5 100644
--- a/puremd_rc_1003/PG-PuReMD/src/forces.c
+++ b/puremd_rc_1003/PG-PuReMD/src/forces.c
@@ -20,13 +20,16 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #include "index_utils.h"
+#ifdef HAVE_CUDA
 #include "cuda_forces.h"
 
 #include "cuda_linear_solvers.h"
 #include "cuda_neighbors.h"
 //#include "cuda_bond_orders.h"
 #include "validation.h"
+#endif
 
 #if defined(PURE_REAX)
 #include "forces.h"
@@ -63,8 +66,10 @@
 
 interaction_function Interaction_Functions[NUM_INTRS];
 
+#ifdef HAVE_CUDA
 void Cuda_Total_Forces (reax_system *, control_params *, simulation_data *, storage *); 
 void Cuda_Total_Forces_PURE (reax_system *, storage *); 
+#endif
 
 void Dummy_Interaction( reax_system *system, control_params *control, 
 			simulation_data *data, storage *workspace, 
@@ -198,6 +203,8 @@ void Compute_Total_Force( reax_system *system, control_params *control,
 #endif
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_Compute_Total_Force( reax_system *system, control_params *control, 
 			  simulation_data *data, storage *workspace, 
 			  reax_list **lists, mpi_datatypes *mpi_data )
@@ -225,6 +232,7 @@ void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
 #endif
 	
 }
+#endif
 
 
 void Validate_Lists( reax_system *system, storage *workspace, reax_list **lists,
@@ -1057,6 +1065,8 @@ void Compute_Forces( reax_system *system, control_params *control,
 #endif
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_Compute_Forces( reax_system *system, control_params *control, 
 		     simulation_data *data, storage *workspace, 
 		     reax_list **lists, output_controls *out_control, 
@@ -1158,6 +1168,8 @@ void Cuda_Compute_Forces( reax_system *system, control_params *control,
 #endif
 
 }
+#endif
+
 
 int validate_device (reax_system *system, simulation_data *data, storage *workspace, reax_list **lists )
 {
diff --git a/puremd_rc_1003/PG-PuReMD/src/grid.c b/puremd_rc_1003/PG-PuReMD/src/grid.c
index a25f9aec..1c756dcf 100644
--- a/puremd_rc_1003/PG-PuReMD/src/grid.c
+++ b/puremd_rc_1003/PG-PuReMD/src/grid.c
@@ -520,11 +520,15 @@ void Bin_My_Atoms( reax_system *system, reallocate_data *realloc )
   //fprintf (stderr, "*********** grid max_atoms: %d \n", g->max_atoms );
   if( max_atoms >= g->max_atoms * DANGER_ZONE ) {
     realloc->gcell_atoms = MAX( max_atoms*SAFE_ZONE, MIN_GCELL_POPL );
+#ifdef HAVE_CUDA
     dev_workspace->realloc.gcell_atoms = MAX( max_atoms*SAFE_ZONE, MIN_GCELL_POPL );
+#endif
   }
   else  {
   	realloc->gcell_atoms = -1;
+#ifdef HAVE_CUDA
   	dev_workspace->realloc.gcell_atoms = -1;
+#endif
   }
 }
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/index_utils.h b/puremd_rc_1003/PG-PuReMD/src/index_utils.h
index 532473b2..b4d0d23b 100644
--- a/puremd_rc_1003/PG-PuReMD/src/index_utils.h
+++ b/puremd_rc_1003/PG-PuReMD/src/index_utils.h
@@ -1,7 +1,6 @@
 #ifndef __INDEX_UTILS_H_
 #define __INDEX_UTILS_H_
 
-#include "cuda_reax_constants.h"
 #include "reax_types.h"
 
 
diff --git a/puremd_rc_1003/PG-PuReMD/src/init_md.c b/puremd_rc_1003/PG-PuReMD/src/init_md.c
index edd7bc6e..5324b4fa 100644
--- a/puremd_rc_1003/PG-PuReMD/src/init_md.c
+++ b/puremd_rc_1003/PG-PuReMD/src/init_md.c
@@ -21,10 +21,12 @@
 
 #include "reax_types.h"
 
+#ifdef HAVE_CUDA
 #include "dev_alloc.h"
 #include "dev_list.h"
 #include "cuda_copy.h"
 #include "validation.h"
+#endif
 
 #if defined(PURE_REAX)
 #include "init_md.h"
@@ -187,6 +189,7 @@ int Init_System( reax_system *system, control_params *control,
 }
 
 
+#ifdef HAVE_CUDA
 int Cuda_Init_System( reax_system *system, control_params *control, 
 		 simulation_data *data, storage *workspace, 
 		 mpi_datatypes *mpi_data, char *msg )
@@ -265,10 +268,7 @@ int Cuda_Init_System( reax_system *system, control_params *control,
   
   return SUCCESS;
 }
-
-
-
-
+#endif
 
 
 /************************ initialize simulation data ************************/
@@ -363,6 +363,8 @@ int Init_Simulation_Data( reax_system *system, control_params *control,
   return SUCCESS;
 }
 
+
+#ifdef HAVE_CUDA
 int Cuda_Init_Simulation_Data( reax_system *system, control_params *control, 
 			  simulation_data *data, char *msg )
 {
@@ -444,6 +446,7 @@ int Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
 #endif
   return SUCCESS;
 }
+#endif
 
 
 #elif defined(LAMMPS_REAX)
@@ -543,6 +546,8 @@ int Init_Workspace( reax_system *system, control_params *control,
   return SUCCESS;
 }
 
+
+#ifdef HAVE_CUDA
 int Cuda_Init_Workspace( reax_system *system, control_params *control, 
 		    storage *workspace, char *msg )
 { 
@@ -561,6 +566,7 @@ int Cuda_Init_Workspace( reax_system *system, control_params *control,
 
   return SUCCESS;
 }
+#endif
 
 
 /************** setup communication data structures  **************/
@@ -805,6 +811,8 @@ int  Init_Lists( reax_system *system, control_params *control,
   return SUCCESS;
 }
 
+
+#ifdef HAVE_CUDA
 int  Cuda_Init_Lists( reax_system *system, control_params *control, 
 		 simulation_data *data, storage *workspace, reax_list **lists, 
 		 mpi_datatypes *mpi_data, char *msg )
@@ -1003,6 +1011,7 @@ int  Cuda_Init_Lists( reax_system *system, control_params *control,
 
   return SUCCESS;
 }
+#endif
 
 
 #if defined(PURE_REAX)
@@ -1144,7 +1153,7 @@ void Pure_Initialize( reax_system *system, control_params *control,
 }
 
 
-
+#ifdef HAVE_CUDA
 void Cuda_Initialize( reax_system *system, control_params *control, 
 		 simulation_data *data, storage *workspace, 
 		 reax_list **lists, output_controls *out_control, 
@@ -1246,6 +1255,7 @@ void Cuda_Initialize( reax_system *system, control_params *control,
 	 fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
 #endif
 }
+#endif
 
 
 #elif defined(LAMMPS_REAX)
diff --git a/puremd_rc_1003/PG-PuReMD/src/integrate.c b/puremd_rc_1003/PG-PuReMD/src/integrate.c
index 084439d4..21b5d555 100644
--- a/puremd_rc_1003/PG-PuReMD/src/integrate.c
+++ b/puremd_rc_1003/PG-PuReMD/src/integrate.c
@@ -32,10 +32,12 @@
 #include "tool_box.h"
 #include "vector.h"
 
+#ifdef HAVE_CUDA
 #include "cuda_integrate.h"
 #include "cuda_copy.h"
 
 #include "cuda_neighbors.h"
+#endif
 
 
 void Velocity_Verlet_NVE( reax_system* system, control_params* control, 
@@ -272,6 +274,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
 }
 
 
+#ifdef HAVE_CUDA
 void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, 
 				    control_params* control, 
 				    simulation_data *data,
@@ -449,7 +452,7 @@ void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system,
   MPI_Barrier( MPI_COMM_WORLD );
 #endif
 }
-
+#endif
 
 
 /* uses Berendsen-type coupling for both T and P. 
diff --git a/puremd_rc_1003/PG-PuReMD/src/linear_solvers.c b/puremd_rc_1003/PG-PuReMD/src/linear_solvers.c
index e3fa2209..65c3398e 100644
--- a/puremd_rc_1003/PG-PuReMD/src/linear_solvers.c
+++ b/puremd_rc_1003/PG-PuReMD/src/linear_solvers.c
@@ -25,7 +25,9 @@
 #include "tool_box.h"
 #include "vector.h"
 
+#ifdef HAVE_CUDA
 #include "validation.h"
+#endif
 
 #if defined(CG_PERFORMANCE)
 real t_start, t_elapsed, matvec_time, dot_time;
@@ -88,9 +90,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
   }
 #endif
 
+#ifdef HAVE_CUDA
 	check_zeros_host (x, system->N, "x");
+#endif
   Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
+#ifdef HAVE_CUDA
 	check_zeros_host (x, system->N, "x");
+#endif
 
   dual_Sparse_MatVec( H, x, workspace->q2, N );
 
@@ -235,6 +241,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
 
 
+#ifdef HAVE_CUDA
 int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, 
 	     rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout, simulation_data *data )
 {
@@ -522,6 +529,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
 
   return (i+1) + matvecs;
 }
+#endif
 
 
 void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
@@ -614,6 +622,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
 }
 
 
+#ifdef HAVE_CUDA
 int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, 
 	real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
@@ -720,6 +729,7 @@ int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H,
   
   return i;
 }
+#endif
 
 
 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, 
diff --git a/puremd_rc_1003/PG-PuReMD/src/lookup.c b/puremd_rc_1003/PG-PuReMD/src/lookup.c
index 029f0a20..40393bf3 100644
--- a/puremd_rc_1003/PG-PuReMD/src/lookup.c
+++ b/puremd_rc_1003/PG-PuReMD/src/lookup.c
@@ -20,9 +20,12 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #include "index_utils.h"
 
+#ifdef HAVE_CUDA
 #include "cuda_lookup.h"
+#endif
 
 #if defined(PURE_REAX)
 #include "lookup.h"
@@ -294,14 +297,14 @@ int Init_Lookup_Tables( reax_system *system, control_params *control,
   free(fele);
   free(fCEclmb);
 
-  //////////////////////////////////////////////
+#ifdef HAVE_CUDA
   //copy the LR_Table to the device here.
-
   t_start = Get_Time ();
   copy_LR_table_to_device (system, control, aggregated);
   t_end = Get_Timing_Info ( t_start );
 
   fprintf (stderr, "Device copy of LR Lookup table: %f \n", t_end );
+#endif
 
   return 1;
 }
diff --git a/puremd_rc_1003/PG-PuReMD/src/matvec.h b/puremd_rc_1003/PG-PuReMD/src/matvec.h
index b904b6cc..680abea7 100644
--- a/puremd_rc_1003/PG-PuReMD/src/matvec.h
+++ b/puremd_rc_1003/PG-PuReMD/src/matvec.h
@@ -4,7 +4,7 @@
 #define __MATVEC__H_
 
 #include "reax_types.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 CUDA_GLOBAL void k_matvec (sparse_matrix , real *, real *, int );
 CUDA_GLOBAL void k_matvec_csr(sparse_matrix , real *, real *, int );
diff --git a/puremd_rc_1003/PG-PuReMD/src/parallelreax.c b/puremd_rc_1003/PG-PuReMD/src/parallelreax.c
index 4c73c9e6..e7c700b2 100644
--- a/puremd_rc_1003/PG-PuReMD/src/parallelreax.c
+++ b/puremd_rc_1003/PG-PuReMD/src/parallelreax.c
@@ -37,10 +37,12 @@
 #include "traj.h"
 #include "vector.h"
 
+#ifdef HAVE_CUDA
 #include "cuda_environment.h"
 #include "cuda_post_evolve.h"
 
 #include "validation.h"
+#endif
 
 evolve_function Evolve;
 evolve_function Cuda_Evolve;
@@ -121,6 +123,8 @@ void Post_Evolve( reax_system* system, control_params* control,
   Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_Post_Evolve( reax_system* system, control_params* control, 
 		  simulation_data* data, storage* workspace, 
 		  reax_list** lists, output_controls *out_control, 
@@ -138,7 +142,10 @@ void Cuda_Post_Evolve( reax_system* system, control_params* control,
   /* compute kinetic energy of the system */
   Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 }
+#endif
 
+
+#ifdef HAVE_CUDA
 void init_blocks (reax_system *system)
 {
 	compute_blocks (&BLOCKS, &BLOCK_SIZE, system->n);
@@ -154,6 +161,7 @@ void init_blocks (reax_system *system)
 	//#endif
 
 }
+#endif
 
 
 int main( int argc, char* argv[] )
@@ -170,9 +178,10 @@ int main( int argc, char* argv[] )
   real t_begin, t_end;
 
   /* Remove this debug information later */
-  #if defined(__CUDA_DEBUG_LOG__)
+#ifdef HAVE_CUDA
+#if defined(__CUDA_DEBUG_LOG__)
   fprintf (stderr, " Size of LR Lookup table %d \n", sizeof (LR_lookup_table) );
-  #endif
+#endif
 
 #if defined( __SM_35__)
   fprintf (stderr, " nbrs block size: %d \n", NBRS_BLOCK_SIZE);
@@ -189,8 +198,7 @@ int main( int argc, char* argv[] )
 
   fprintf (stderr, " General block size: %d \n",  DEF_BLOCK_SIZE);
 #endif
-  
-
+#endif
 
   /* allocated main datastructures */
   system = (reax_system *) smalloc( sizeof(reax_system), "system" );
@@ -213,6 +221,7 @@ int main( int argc, char* argv[] )
     smalloc( sizeof(output_controls), "out_control" );
   mpi_data = (mpi_datatypes *) smalloc( sizeof(mpi_datatypes), "mpi_data" );
 
+#ifdef HAVE_CUDA
   /* allocate the cuda auxiliary data structures */
   dev_workspace = (storage *) smalloc( sizeof(storage), "dev_workspace" );
 
@@ -221,6 +230,7 @@ int main( int argc, char* argv[] )
     dev_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
     dev_lists[i]->allocated = 0;
   }
+#endif
 
   /* Initialize member variables */
   system->init_thblist = FALSE;
@@ -233,6 +243,7 @@ int main( int argc, char* argv[] )
   system->wsize = control->nprocs;
   system->global_offset = (int *)scalloc(system->wsize+1,sizeof(int),"global_offset");
 
+#ifdef HAVE_CUDA
   /* setup the CUDA Device for this process can be on the same machine
 	* or on a different machine, for now use the rank to compute the device
 	* This will only work on a single machine with 2 GPUs*/
@@ -241,6 +252,7 @@ int main( int argc, char* argv[] )
   //Cleanup_Cuda_Environment ();
   print_device_mem_usage ();
   //fprintf( stderr, "p%d: Total number of GPUs on this node -- %d\n", system->my_rank, my_device_id);
+#endif
 
   /* read system description files */
   Read_System( argv[1], argv[2], argv[3], system, control, 
@@ -250,51 +262,64 @@ int main( int argc, char* argv[] )
   MPI_Barrier( MPI_COMM_WORLD );
 #endif
 
+#ifdef HAVE_CUDA
   /* init the blocks sizes for cuda kernels */
   init_blocks (system);
+#endif
 
   /* measure total simulation time after input is read */
   if( system->my_rank == MASTER_NODE )
     t_start = Get_Time( );
 
-  /* initialize datastructures */
-  //Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
 
+  /* initialize datastructures */
+#ifdef HAVE_CUDA
   Cuda_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
 #if defined(__CUDA_DEBUG__)
   Pure_Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
 #endif
+#else
+  Initialize( system, control, data, workspace, lists, out_control, mpi_data ); 
+#endif
 
+#ifdef HAVE_CUDA
 print_device_mem_usage ();
 
   /* init the blocks sizes for cuda kernels */
   init_blocks (system);
+#endif
 
 #if defined(DEBUG)
   fprintf( stderr, "p%d: initializated data structures\n", system->my_rank );
   MPI_Barrier( MPI_COMM_WORLD );
 #endif
   //END OF FIRST STEP
-  //
 
   // compute f_0 
   Comm_Atoms( system, control, data, workspace, lists, mpi_data, 1 );
+#ifdef HAVE_CUDA
   Sync_Atoms ( system );
   Sync_Grid (&system->my_grid, &system->d_my_grid);
   init_blocks (system);
-  #if defined(__CUDA_DENUG_LOG__)
+#if defined(__CUDA_DENUG_LOG__)
   fprintf( stderr, "p%d: Comm_Atoms synchronized \n", system->my_rank );
-  #endif
+#endif
+#endif
 
 	//Second step
+#ifdef HAVE_CUDA
   Cuda_Reset ( system, control, data, workspace, lists );
 #if defined(__CUDA_DEBUG__)
   Reset( system, control, data, workspace, lists );
 #endif
    //fprintf( stderr, "p%d: Cuda_Reset done...\n", system->my_rank );
+#else
+  Reset( system, control, data, workspace, lists );
+#endif
 
 
 	//Third Step
+#ifdef HAVE_CUDA
   Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
 #if defined(__CUDA_DEBUG__)
   Generate_Neighbor_Lists( system, data, workspace, lists );
@@ -302,9 +327,13 @@ print_device_mem_usage ();
   #if defined(__CUDA_DENUG_LOG__)
   fprintf (stderr, "p%d: Cuda_Generate_Neighbor_Lists done...\n", system->my_rank );
   #endif
+#else
+  Generate_Neighbor_Lists( system, data, workspace, lists );
+#endif
 
 
 	//Fourth Step
+#ifdef HAVE_CUDA
 #if defined(__CUDA_DEBUG__)
 	fprintf (stderr, " Host Compute Forces begin.... \n");
   Compute_Forces( system, control, data, workspace,  
@@ -315,8 +344,13 @@ print_device_mem_usage ();
   #if defined(__CUDA_DENUG_LOG__)
   fprintf (stderr, "p%d: Cuda_Compute_Forces done...\n", system->my_rank );
   #endif
+#else
+  Compute_Forces( system, control, data, workspace,  
+		  lists, out_control, mpi_data );
+#endif
 
 
+#ifdef HAVE_CUDA
 #if defined (__CUDA_DEBUG__)
   Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
 #endif
@@ -324,7 +358,11 @@ print_device_mem_usage ();
   #if defined(__CUDA_DENUG_LOG__)
   fprintf (stderr, "p%d: Cuda_Compute_Kinetic_Energy done ... \n", system->my_rank);
   #endif
+#else
+  Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+#endif
 
+#ifdef HAVE_CUDA
 #if defined(__CUDA_DEBUG__)
 	validate_device (system, data, workspace, lists);
 #endif
@@ -333,10 +371,16 @@ print_device_mem_usage ();
   Output_Results( system, control, data, lists, out_control, mpi_data );
   //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
 #endif
+#else
+  Output_Results( system, control, data, lists, out_control, mpi_data );
+  //fprintf (stderr, "p%d: Output_Results done ... \n", system->my_rank);
+#endif
 
+#ifdef HAVE_CUDA
 #if defined(DEBUG)
   fprintf( stderr, "p%d: computed forces at t0\n", system->my_rank );
   MPI_Barrier( MPI_COMM_WORLD );
+#endif
 #endif
 
   // start the simulation 
@@ -346,6 +390,7 @@ print_device_mem_usage ();
 
 		//t_begin = Get_Time ();
     
+#ifdef HAVE_CUDA
 #if defined(__CUDA_DEBUG__)
     Evolve( system, control, data, workspace, lists, out_control, mpi_data );
 #endif
@@ -362,11 +407,18 @@ print_device_mem_usage ();
 
 	//t_end = Get_Timing_Info (t_begin);
 	 //fprintf (stderr, " Post Evolve time: %f \n", t_end);
+#else
+    Evolve( system, control, data, workspace, lists, out_control, mpi_data );
+    Post_Evolve(system, control, data, workspace, lists, out_control, mpi_data);
+#endif
 
-
+#ifdef HAVE_CUDA
 #if !defined(__CUDA_DEBUG__)
   Output_Results( system, control, data, lists, out_control, mpi_data );
 #endif
+#else
+  Output_Results( system, control, data, lists, out_control, mpi_data );
+#endif
 
     //Analysis(system, control, data, workspace, lists, out_control, mpi_data);
 
@@ -385,9 +437,11 @@ print_device_mem_usage ();
 
   }
 
+#ifdef HAVE_CUDA
   //vaildate the results in debug mode
 #if defined(__CUDA_DEBUG__)
 	validate_device (system, data, workspace, lists);
+#endif
 #endif
   
   /* end of the simulation, write total simulation time */ 
diff --git a/puremd_rc_1003/PG-PuReMD/src/qEq.c b/puremd_rc_1003/PG-PuReMD/src/qEq.c
index d7324c9b..47d06c5a 100644
--- a/puremd_rc_1003/PG-PuReMD/src/qEq.c
+++ b/puremd_rc_1003/PG-PuReMD/src/qEq.c
@@ -26,10 +26,12 @@
 #include "linear_solvers.h"
 #include "tool_box.h"
 
+#ifdef HAVE_CUDA
 #include "cuda_qEq.h"
 #include "cuda_linear_solvers.h"
 
 #include "validation.h"
+#endif
 
 int compare_matrix_entry(const void *v1, const void *v2)
 {
@@ -347,6 +349,8 @@ void Calculate_Charges( reax_system *system, storage *workspace,
   free(q);
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_Calculate_Charges( reax_system *system, storage *workspace, 
 			mpi_datatypes *mpi_data )
 {
@@ -375,6 +379,7 @@ void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
 
   cuda_charges_updateq ( system, q );
 }
+#endif
 
 
 void QEq( reax_system *system, control_params *control, simulation_data *data, 
@@ -432,6 +437,8 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
 #endif
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, 
      storage *workspace, output_controls *out_control, 
      mpi_datatypes *mpi_data )
@@ -479,3 +486,4 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data *da
   }
 #endif
 }
+#endif
diff --git a/puremd_rc_1003/PG-PuReMD/src/reax_types.h b/puremd_rc_1003/PG-PuReMD/src/reax_types.h
index 55e789bd..d6a99c22 100644
--- a/puremd_rc_1003/PG-PuReMD/src/reax_types.h
+++ b/puremd_rc_1003/PG-PuReMD/src/reax_types.h
@@ -19,18 +19,48 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#if !(defined(__REAX_TYPES_H_) || defined(__CUDA_REAX_TYPES_H_))
+
+#ifdef __CUDACC__
+
+#ifndef __CUDA_REAX_TYPES_H_
+#define __CUDA_REAX_TYPES_H_
+#define CUDA_HOST __host__
+#define CUDA_DEVICE __device__
+#define CUDA_GLOBAL __global__
+#define CUDA_HOST_DEVICE __host__ __device__
+#endif
+
+#else
+
 #ifndef __REAX_TYPES_H_
 #define __REAX_TYPES_H_
+#define CUDA_HOST
+#define CUDA_DEVICE
+#define CUDA_GLOBAL
+#define CUDA_HOST_DEVICE
+#endif
+
+#endif
 
-#include "ctype.h"
-#include "math.h"
-#include "mpi.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
-#include "sys/time.h"
-#include "time.h"
-#include "zlib.h"
+#if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_))
+#define __CONFIG_H_
+#include "config.h"
+#endif
+
+#include <ctype.h>
+#include <math.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+#include <zlib.h>
+
+#ifdef HAVE_CUDA
+#include <cuda.h>
+#endif
 
 #if defined(__IBMC__)
 #define inline __inline__
@@ -131,11 +161,81 @@
 #define RESTART 30
 
 /**************** RESOURCE CONSTANTS **********************/
+#ifdef HAVE_CUDA
 //#define			CUDA_BLOCK_SIZE				256
 #define			SCRATCH_SIZE					(1024 * 1024 * 20)
 #define			HOST_SCRATCH_SIZE				(1024 * 1024 * 20)
 #define			RES_SCRATCH						0x90
 
+/* BLOCK SIZES for kernels */
+#define				HB_SYM_BLOCK_SIZE					64
+#define				HB_KER_SYM_THREADS_PER_ATOM			16
+#define				HB_POST_PROC_BLOCK_SIZE				256
+#define				HB_POST_PROC_KER_THREADS_PER_ATOM	32
+
+#if defined( __INIT_BLOCK_SIZE__)
+#define				DEF_BLOCK_SIZE						__INIT_BLOCK_SIZE__    /* all utility functions and all */
+#define				CUDA_BLOCK_SIZE						__INIT_BLOCK_SIZE__    	/* init forces */
+#define				ST_BLOCK_SIZE						__INIT_BLOCK_SIZE__    
+#else
+#define				DEF_BLOCK_SIZE						256						/* all utility functions and all */
+#define				CUDA_BLOCK_SIZE						256						/* init forces */
+#define				ST_BLOCK_SIZE						256	
+#endif
+
+#if defined( __NBRS_THREADS_PER_ATOM__ )
+#define				NB_KER_THREADS_PER_ATOM				__NBRS_THREADS_PER_ATOM__
+#else
+#define				NB_KER_THREADS_PER_ATOM				16
+#endif
+
+#if defined( __NBRS_BLOCK_SIZE__)
+#define				NBRS_BLOCK_SIZE						__NBRS_BLOCK_SIZE__
+#else
+#define				NBRS_BLOCK_SIZE						256
+#endif
+
+#if defined( __HB_THREADS_PER_ATOM__)
+#define				HB_KER_THREADS_PER_ATOM				__HB_THREADS_PER_ATOM__
+#else
+#define				HB_KER_THREADS_PER_ATOM				32
+#endif
+
+#if defined(__HB_BLOCK_SIZE__)
+#define				HB_BLOCK_SIZE					__HB_BLOCK_SIZE__
+#else
+#define				HB_BLOCK_SIZE						256
+#endif
+
+#if defined( __VDW_THREADS_PER_ATOM__ )
+#define				VDW_KER_THREADS_PER_ATOM			__VDW_THREADS_PER_ATOM__
+#else
+#define				VDW_KER_THREADS_PER_ATOM			32
+#endif
+
+#if defined( __VDW_BLOCK_SIZE__)
+#define				VDW_BLOCK_SIZE						__VDW_BLOCK_SIZE__
+#else
+#define				VDW_BLOCK_SIZE						256
+#endif
+
+#if defined( __MATVEC_THREADS_PER_ROW__ )
+#define 			MATVEC_KER_THREADS_PER_ROW		__MATVEC_THREADS_PER_ROW__
+#else
+#define 			MATVEC_KER_THREADS_PER_ROW		32
+#endif
+
+#if defined( __MATVEC_BLOCK_SIZE__)
+#define				MATVEC_BLOCK_SIZE					__MATVEC_BLOCK_SIZE__
+#else
+#define				MATVEC_BLOCK_SIZE					512
+#endif
+
+//Validation
+#define				GPU_TOLERANCE				1e-5
+
+#endif
+
 
 
 /******************* ENUMERATIONS *************************/
@@ -622,14 +722,13 @@ typedef struct
   reax_atom       *my_atoms;
   reax_atom       *d_my_atoms;
 
-  //CUDA
+/*CUDA-specific*/
   int					max_sparse_entries;
   int 				init_thblist;
   int					num_thbodies;
 
   int					max_bonds;
   int					max_hbonds;
-
 } reax_system;
 
 
@@ -846,7 +945,7 @@ typedef struct {
   int scl;
   far_neighbor_data *ptr;
 
-  //CUDA
+/*CUDA-specific*/
   int sym_index;
   rvec hb_f;
 } hbond_data;
@@ -882,7 +981,7 @@ typedef struct {
   rvec dvec;
   bond_order_data bo_data;
 
-  //CUDA
+/*CUDA-specific*/
   real ae_CdDelta;
 
   real va_CdDelta;
@@ -895,7 +994,6 @@ typedef struct {
   rvec hb_f;
 
   rvec tf_f;
-
 } bond_data;
 
 
@@ -1162,9 +1260,7 @@ typedef void (*unpacker) ( reax_system*, int, void*, int, neighbor_proc*, int );
 typedef void (*dist_packer) (void*, mpi_out_data*);
 typedef void (*coll_unpacker) (void*, void*, mpi_out_data*);
 
-////////////////////////////////
-//CUDA SPECIFIC DECLARATIONS
-////////////////////////////////
+/*CUDA-specific*/
 extern reax_list **dev_lists;
 extern storage *dev_workspace;
 extern storage *dev_storage;
diff --git a/puremd_rc_1003/PG-PuReMD/src/reduction.h b/puremd_rc_1003/PG-PuReMD/src/reduction.h
index 490bb725..5291703b 100644
--- a/puremd_rc_1003/PG-PuReMD/src/reduction.h
+++ b/puremd_rc_1003/PG-PuReMD/src/reduction.h
@@ -2,7 +2,7 @@
 #ifndef __REDUCTION_H__
 #define __REDUCTION_H__
 
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 #include "reax_types.h"
 
 #define  INITIAL  0
diff --git a/puremd_rc_1003/PG-PuReMD/src/reset_tools.c b/puremd_rc_1003/PG-PuReMD/src/reset_tools.c
index 9803c6f1..b31776e9 100644
--- a/puremd_rc_1003/PG-PuReMD/src/reset_tools.c
+++ b/puremd_rc_1003/PG-PuReMD/src/reset_tools.c
@@ -20,8 +20,13 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+
 #include "index_utils.h"
+
+#ifdef HAVE_CUDA
 #include "cuda_reset_tools.h"
+#endif
+
 #if defined(PURE_REAX)
 #include "reset_tools.h"
 #include "list.h"
@@ -261,7 +266,7 @@ void Reset( reax_system *system, control_params *control,
 }
 
 
-
+#ifdef HAVE_CUDA
 void Cuda_Reset( reax_system *system, control_params *control, 
 	    simulation_data *data, storage *workspace, reax_list **lists )
 {
@@ -281,3 +286,4 @@ void Cuda_Reset( reax_system *system, control_params *control,
   MPI_Barrier( MPI_COMM_WORLD );
 #endif
 }
+#endif
diff --git a/puremd_rc_1003/PG-PuReMD/src/system_props.c b/puremd_rc_1003/PG-PuReMD/src/system_props.c
index f8e717f3..def8a879 100644
--- a/puremd_rc_1003/PG-PuReMD/src/system_props.c
+++ b/puremd_rc_1003/PG-PuReMD/src/system_props.c
@@ -20,7 +20,9 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
+#ifdef HAVE_CUDA
 #include "dev_system_props.h"
+#endif
 
 #if defined(PURE_REAX)
 #include "system_props.h"
@@ -80,6 +82,8 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
     data->therm.T = ALMOST_ZERO;
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data, 
 			     MPI_Comm comm )
 {
@@ -100,7 +104,7 @@ void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
   if( fabs(data->therm.T) < ALMOST_ZERO ) 
     data->therm.T = ALMOST_ZERO;
 }
-
+#endif
 
 
 void Compute_System_Energy( reax_system *system, simulation_data *data, 
@@ -111,8 +115,10 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
 	//TODO remove this is an UGLY fix
   my_en [13] = data->my_en.e_kin;
 
+#ifdef HAVE_CUDA
   //Cuda Wrapper here
   dev_sync_simulation_data ( data );
+#endif
 
   my_en[0] = data->my_en.e_bond;
   my_en[1] = data->my_en.e_ov;
@@ -182,6 +188,8 @@ void Compute_Total_Mass( reax_system *system, simulation_data *data,
   data->inv_M = 1. / data->M;
 }
 
+
+#ifdef HAVE_CUDA
 void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data, 
 			 MPI_Comm comm  )
 {
@@ -195,6 +203,7 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
 
   data->inv_M = 1. / data->M;
 }
+#endif
 
 
 void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
@@ -312,6 +321,7 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 }
 
 
+#ifdef HAVE_CUDA
 void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
 			     mpi_datatypes *mpi_data, MPI_Comm comm )
 {
@@ -406,21 +416,7 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 	   data->avcm[0], data->avcm[1], data->avcm[2] );
 #endif
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+#endif
 
 
 /* IMPORTANT: This function assumes that current kinetic energy
diff --git a/puremd_rc_1003/PG-PuReMD/src/traj.c b/puremd_rc_1003/PG-PuReMD/src/traj.c
index d31ac1f4..fd982773 100644
--- a/puremd_rc_1003/PG-PuReMD/src/traj.c
+++ b/puremd_rc_1003/PG-PuReMD/src/traj.c
@@ -933,24 +933,27 @@ int Append_Frame( reax_system *system, control_params *control,
 #endif
   Write_Frame_Header( system, control, data, out_control, mpi_data );
 
-  //CUDA Wrappers here
   if( out_control->write_atoms ) {
     //Sync atoms here
+#ifdef HAVE_CUDA
 	 Output_Sync_Atoms ( system );
+#endif
     Write_Atoms( system, control, out_control, mpi_data );
   }
 
-  //CUDA Wrappers here
   if( out_control->write_bonds ) {
     //sync bonds here
+#ifdef HAVE_CUDA
 	 Output_Sync_Lists ((*lists + BONDS), (*dev_lists + BONDS), TYP_BOND);
+#endif
     Write_Bonds( system, control, (*lists + BONDS), out_control, mpi_data );
   }
 
-  //CUDA Wrappers here
   if( out_control->write_angles ) {
     //sync three body interactions here
+#ifdef HAVE_CUDA
 	 Output_Sync_Lists ((*lists + THREE_BODIES), (*dev_lists + THREE_BODIES), TYP_THREE_BODY);
+#endif
     Write_Angles( system, control, (*lists + BONDS), (*lists + THREE_BODIES), 
 		  out_control, mpi_data );
   }
diff --git a/puremd_rc_1003/PG-PuReMD/src/validation.cu b/puremd_rc_1003/PG-PuReMD/src/validation.cu
index 58bf11ab..694c5db3 100644
--- a/puremd_rc_1003/PG-PuReMD/src/validation.cu
+++ b/puremd_rc_1003/PG-PuReMD/src/validation.cu
@@ -2,7 +2,7 @@
 #include "validation.h"
 #include "cuda_utils.h"
 #include "list.h"
-#include "cuda_reax_constants.h"
+#include "reax_types.h"
 
 #include "index_utils.h"
 #include "vector.h"
diff --git a/puremd_rc_1003/PG-PuReMD/src/vector.h b/puremd_rc_1003/PG-PuReMD/src/vector.h
index 12878979..bf08deba 100644
--- a/puremd_rc_1003/PG-PuReMD/src/vector.h
+++ b/puremd_rc_1003/PG-PuReMD/src/vector.h
@@ -23,8 +23,6 @@
 #define __VECTOR_H_
 
 #include "reax_types.h"
-
-#include "cuda_reax_constants.h"
 #include "random.h"
 
 #ifdef __cplusplus
-- 
GitLab