diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index b0c1c871fffddc45440ed85ef84c9e5e90efaa20..3b051035263dfedbade53966fa45ee9d62da7775 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -34,25 +34,25 @@ include_HEADERS = src/reax_types.h src/index_utils.h \
 	src/integrate.h src/init_md.h
 
 if USE_CUDA
-bin_pg_puremd_SOURCES += src/cuda_utils.cu src/cuda_allocate.cu src/cuda_environment.cu \
-      src/cuda_system_props.cu src/cuda_reduction.cu src/center_mass.cu \
-      src/cuda_copy.cu src/cuda_reset_tools.cu src/cuda_list.cu \
-      src/cuda_neighbors.cu src/cuda_bond_orders.cu src/cuda_bonds.cu \
-      src/cuda_multi_body.cu src/cuda_valence_angles.cu \
-      src/cuda_torsion_angles.cu src/cuda_hydrogen_bonds.cu src/cuda_forces.cu \
-      src/cuda_charges.cu src/cuda_lin_alg.cu \
-      src/cuda_nonbonded.cu src/cuda_integrate.cu src/cuda_post_evolve.cu \
-      src/cuda_init_md.cu src/cuda_validation.cu src/cuda_lookup.cu
-include_HEADERS += src/cuda_helpers.h src/cuda_shuffle.h \
-      src/cuda_utils.h src/cuda_allocate.h src/cuda_environment.h \
-      src/cuda_system_props.h src/cuda_reduction.h src/center_mass.h \
-      src/cuda_copy.h src/cuda_reset_tools.h src/cuda_list.h \
-      src/cuda_neighbors.h src/cuda_bond_orders.h src/cuda_bonds.h \
-      src/cuda_multi_body.h src/cuda_valence_angles.h \
-      src/cuda_torsion_angles.h src/cuda_hydrogen_bonds.h src/cuda_forces.h \
-      src/cuda_charges.h src/cuda_lin_alg.h \
-      src/cuda_nonbonded.h src/cuda_integrate.h src/cuda_post_evolve.h \
-      src/cuda_init_md.h src/cuda_validation.h src/cuda_lookup.h
+bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \
+      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu \
+      src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_list.cu \
+      src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \
+      src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \
+      src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
+      src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \
+      src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \
+      src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
+include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
+      src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
+      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h \
+      src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_list.h \
+      src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \
+      src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \
+      src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \
+      src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \
+      src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \
+      src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h
 
 # dummy source to cause C linking
 nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c
diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 1d85b8f980d1480a42b0d045223168d9931595af..54614694e37590dce079f9edb6c47a150c7baebe 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -20,7 +20,6 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
 
 #if defined(PURE_REAX)
   #include "allocate.h"
@@ -36,6 +35,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 /* allocate space for my_atoms
    important: we cannot know the exact number of atoms that will fall into a
diff --git a/PG-PuReMD/src/allocate.h b/PG-PuReMD/src/allocate.h
index 5fd27315d2327da2cffcf4f3eed8e9e127685c2f..a28764532a4ca6a6951705aed4af999f93483e02 100644
--- a/PG-PuReMD/src/allocate.h
+++ b/PG-PuReMD/src/allocate.h
@@ -24,11 +24,11 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 int PreAllocate_Space( reax_system*, control_params*, storage* );
 
 void Allocate_System( reax_system*, int, int, char* );
@@ -53,9 +53,9 @@ void Deallocate_MPI_Buffers( mpi_datatypes * );
 void ReAllocate( reax_system*, control_params*, simulation_data*, storage*,
         reax_list**, mpi_datatypes* );
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/analyze.c b/PG-PuReMD/src/analyze.c
index 283d7e470f3b0cce6b72d507f6855c9200507212..0f47ba4836065ac4eba00d214a3d9410ff862e11 100644
--- a/PG-PuReMD/src/analyze.c
+++ b/PG-PuReMD/src/analyze.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "analyze.h"
+
 #include "box.h"
 #include "list.h"
 #include "vector.h"
diff --git a/PG-PuReMD/src/analyze.h b/PG-PuReMD/src/analyze.h
index e470334136d710a20843f3920f469591ca7a1892..a772dcb2fb0152359ba9778a2592cba191ed07e8 100644
--- a/PG-PuReMD/src/analyze.h
+++ b/PG-PuReMD/src/analyze.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Analysis( reax_system*, control_params*, simulation_data*, storage*,
-               reax_list**, output_controls*, mpi_datatypes* );
+        reax_list**, output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
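Several headers in this patch (analyze.h above, and bond_orders.h, bonds.h, box.h, charges.h, comm_tools.h, control.h below) gain #ifdef __cplusplus / extern "C" guards so their prototypes keep unmangled C linkage when the headers are included from the nvcc-compiled C++ translation units now living under src/cuda/. A minimal sketch of the pattern, using a hypothetical header and function name rather than anything from PG-PuReMD:

/* example_api.h -- illustrative only; mirrors the guard pattern added throughout this patch */
#ifndef __EXAMPLE_API_H_
#define __EXAMPLE_API_H_

#include "reax_types.h"

#ifdef __cplusplus
extern "C" {
#endif

/* declared with C linkage so that .c callers and nvcc-compiled .cu callers
 * resolve to the same (unmangled) symbol at link time */
void Example_Compute( reax_system*, control_params* );

#ifdef __cplusplus
}
#endif

#endif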
diff --git a/PG-PuReMD/src/basic_comm.h b/PG-PuReMD/src/basic_comm.h
index e1effc50db3ed53d33e7da83013cccc2cdbcf8bc..4d8f1c34deb5c29988e87320747f54252d409425 100644
--- a/PG-PuReMD/src/basic_comm.h
+++ b/PG-PuReMD/src/basic_comm.h
@@ -24,33 +24,43 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void real_packer( void*, mpi_out_data* );
+
 void rvec_packer( void*, mpi_out_data* );
+
 void rvec2_packer( void*, mpi_out_data* );
+
 void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer);
 
 void real_unpacker( void*, void*, mpi_out_data* );
+
 void rvec_unpacker( void*, void*, mpi_out_data* );
+
 void rvec2_unpacker( void*, void*, mpi_out_data* );
+
 void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype,
-           int, coll_unpacker );
+        int, coll_unpacker );
 
 real Parallel_Norm( real*, int, MPI_Comm );
+
 real Parallel_Dot( real*, real*, int, MPI_Comm );
+
 real Parallel_Vector_Acc( real*, int, MPI_Comm );
 
+#if defined(TEST_FORCES)
+void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
+
+void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
+#endif
 
 #ifdef __cplusplus
 }
 #endif
 
-#if defined(TEST_FORCES)
-void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
-void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
-#endif
 
 #endif
diff --git a/PG-PuReMD/src/bond_orders.c b/PG-PuReMD/src/bond_orders.c
index 4e023e976a65cf24c78f968085ff3dcc4d132425..da23e0025e58da12db09944e9004f1358cb2d1ed 100644
--- a/PG-PuReMD/src/bond_orders.c
+++ b/PG-PuReMD/src/bond_orders.c
@@ -31,6 +31,7 @@
   #include "reax_list.h"
   #include "reax_vector.h"
 #endif
+
 #include "index_utils.h"
 
 
diff --git a/PG-PuReMD/src/bond_orders.h b/PG-PuReMD/src/bond_orders.h
index 1975e20b6320a003b08527fae665dbd0bbc3c2e4..8cfa2e18715abd0997a1cabf82e044a8a3213bbb 100644
--- a/PG-PuReMD/src/bond_orders.h
+++ b/PG-PuReMD/src/bond_orders.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 typedef struct
 {
     real C1dbo, C2dbo, C3dbo;
@@ -32,28 +33,45 @@ typedef struct
     real C1dDelta, C2dDelta, C3dDelta;
 } dbond_coefficients;
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Get_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Add_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO_to_Forces( reax_system*, reax_list**, int, int, real );
+
 void Add_dBOpinpi2_to_Forces( reax_system*, reax_list**,
-                              int, int, real, real );
+        int, int, real, real );
 
 void Add_dDelta( reax_system*, reax_list**, int, real, rvec* );
+
 void Add_dDelta_to_Forces( reax_system *, reax_list**, int, real );
 #endif
 
 void Add_dBond_to_Forces( int, int, storage*, reax_list** );
-void Add_dBond_to_Forces_NPT( int, int, simulation_data*,
-                              storage*, reax_list** );
-int BOp(storage*, reax_list*, real, int, int, far_neighbor_data*,
-        single_body_parameters*, single_body_parameters*, two_body_parameters*);
+
+void Add_dBond_to_Forces_NPT( int, int, simulation_data*, storage*, reax_list** );
+
+int BOp( storage*, reax_list*, real, int, int, far_neighbor_data*,
+        single_body_parameters*, single_body_parameters*, two_body_parameters* );
+
 void BO( reax_system*, control_params*, simulation_data*,
-         storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/bonds.c b/PG-PuReMD/src/bonds.c
index 9c2839eb63e2d722d531914bf6d02136f505d29e..8fb160ecbb0862ba5c7571b89d8106ec5d811678 100644
--- a/PG-PuReMD/src/bonds.c
+++ b/PG-PuReMD/src/bonds.c
@@ -20,25 +20,27 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "bonds.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "bonds.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_bonds.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_bonds.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Bonds( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace, reax_list **lists,
-            output_controls *out_control )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control )
 {
     int i, j, pj, natoms;
     int start_i, end_i;
diff --git a/PG-PuReMD/src/bonds.h b/PG-PuReMD/src/bonds.h
index 2aa3c1f93731d35eab6210471c463053c809767f..89090386a8f2044591f2324de56fbb2d4c23c051 100644
--- a/PG-PuReMD/src/bonds.h
+++ b/PG-PuReMD/src/bonds.h
@@ -24,6 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Bonds( reax_system*, control_params*, simulation_data*,
-            storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/box.c b/PG-PuReMD/src/box.c
index 86ebd6eb20ca2f84f49de5dcd37eb8ed13670b1a..525f24e5cc43f1b50f7bcdaee912b634e2ecfc71 100644
--- a/PG-PuReMD/src/box.c
+++ b/PG-PuReMD/src/box.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "box.h"
+
 #include "comm_tools.h"
 #include "io_tools.h"
 #include "system_props.h"
diff --git a/PG-PuReMD/src/box.h b/PG-PuReMD/src/box.h
index 841e367993662e1ad6f5eaa90c0f6d4659dd756d..00e51d063a1126c662f717df938787178ff77549 100644
--- a/PG-PuReMD/src/box.h
+++ b/PG-PuReMD/src/box.h
@@ -24,30 +24,51 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* initializes simulation boxes */
 void Setup_Big_Box( real, real, real, real, real, real, simulation_box* );
+
 void Init_Box( rtensor, simulation_box* );
+
 //void Setup_My_Box( reax_system*, control_params* );
+
 //void Setup_My_Ext_Box( reax_system*, control_params* );
+
 void Setup_Environment( reax_system*, control_params*, mpi_datatypes* );
 
 /* scales simulation box for NPT ensembles */
 void Scale_Box( reax_system*, control_params*,
-                simulation_data*, mpi_datatypes* );
+        simulation_data*, mpi_datatypes* );
 
 /* applies transformation to/from Cartesian/ Triclinic coordinates */
 /* use -1 flag for Cartesian -> Triclinic and +1 for otherway */
-// void Transform( rvec, simulation_box*, char, rvec );
-// void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
-// void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
-// int Get_Nbr_Box( simulation_box*, int, int, int );
-// rvec Get_Nbr_Box_Press( simulation_box*, int, int, int );
-// void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );
+//void Transform( rvec, simulation_box*, char, rvec );
+
+//void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
+
+//void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
+
+//int Get_Nbr_Box( simulation_box*, int, int, int );
+
+//rvec Get_Nbr_Box_Press( simulation_box*, int, int, int );
+
+//void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );
 
 /* these functions assume that the coordinates are in triclinic system
    this function returns cartesian norm but triclinic distance vector */
 //real Sq_Distance_on_T3( rvec, rvec, simulation_box*, rvec );
+
 //void Inc_on_T3( rvec, rvec, simulation_box* );
+
 //real Metric_Product( rvec, rvec, simulation_box* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/center_mass.cu b/PG-PuReMD/src/center_mass.cu
deleted file mode 100644
index 725cafbb7c79e6fc8cb7dcf275c478114a2a09f1..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/center_mass.cu
+++ /dev/null
@@ -1,551 +0,0 @@
-#include "center_mass.h"
-#include "vector.h"
-#include "cuda_shuffle.h"
-
-CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm, 
-        rvec *res_vcm, 
-        rvec *res_amcm, 
-        size_t n)
-{
-    extern __shared__ rvec xcm[];
-    extern __shared__ rvec vcm[];
-    extern __shared__ rvec amcm[];
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    //unsigned int xcm_id = threadIdx.x;
-    unsigned int vcm_id = blockDim.x;
-    unsigned int amcm_id = 2 *(blockDim.x);
-
-    unsigned int index = 0;
-    rvec tmp;
-    real m;
-
-    rvec_MakeZero (xcm [threadIdx.x]);
-    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
-    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
-    rvec_MakeZero (tmp);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
-        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
-        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
-    }
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (xcm [threadIdx.x], xcm[index]);
-            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
-            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
-        } 
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0)){
-        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
-        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
-        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
-    }
-}
-
-#if defined( __SM_35__)
-CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm,
-        size_t n)
-{
-    extern __shared__ rvec my_xcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int xcm_id = threadIdx.x;
-    unsigned int index = 0;
-    rvec xcm;
-    real m;
-
-    rvec_MakeZero (xcm);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (xcm , m, atoms [i].x);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        xcm[0] += shfl( xcm[0], z);
-        xcm[1] += shfl( xcm[1], z);
-        xcm[2] += shfl( xcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
-    __syncthreads ();
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0))
-        rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
-}
-
-CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_vcm,
-        size_t n)
-{
-    extern __shared__ rvec my_vcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    rvec vcm;
-    real m;
-
-    rvec_MakeZero (vcm);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (vcm , m, atoms [i].v);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        vcm[0] += shfl( vcm[0], z);
-        vcm[1] += shfl( vcm[1], z);
-        vcm[2] += shfl( vcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
-    __syncthreads ();
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0))
-        rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
-}
-
-CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_amcm,
-        size_t n)
-{
-    extern __shared__ rvec my_amcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    rvec amcm;
-    real m;
-    rvec tmp;
-
-    rvec_MakeZero (amcm);
-    rvec_MakeZero( tmp );
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-        rvec_ScaledAdd (amcm, m, tmp);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        amcm[0] += shfl( amcm[0], z);
-        amcm[1] += shfl( amcm[1], z);
-        amcm[2] += shfl( amcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
-    __syncthreads ();
-
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0)){
-        rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
-    }
-}
-
-#endif
-
-
-CUDA_GLOBAL void center_of_mass (rvec *xcm, 
-        rvec *vcm, 
-        rvec *amcm, 
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
-{
-    extern __shared__ rvec sh_xcm[];
-    extern __shared__ rvec sh_vcm[];
-    extern __shared__ rvec sh_amcm[];
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    unsigned int xcm_id = threadIdx.x;
-    unsigned int vcm_id = blockDim.x;
-    unsigned int amcm_id = 2 * (blockDim.x);
-
-    unsigned int index = 0;
-    rvec t_xcm, t_vcm, t_amcm;
-
-    rvec_MakeZero (t_xcm);
-    rvec_MakeZero (t_vcm);
-    rvec_MakeZero (t_amcm);
-
-    if (i < n){
-        rvec_Copy ( t_xcm, xcm[threadIdx.x]);
-        rvec_Copy ( t_vcm, vcm[threadIdx.x]);
-        rvec_Copy ( t_amcm, amcm[threadIdx.x]);
-    }
-
-    rvec_Copy (sh_xcm[xcm_id], t_xcm);
-    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
-    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
-
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-        if (threadIdx.x < offset) {
-            index = threadIdx.x + offset;
-            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
-            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
-            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
-        } 
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0){
-        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
-        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
-        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, 
-        reax_atom *atoms,
-        real *results, 
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int xz_i = 2 * blockDim.x;
-    unsigned int yy_i = 3 * blockDim.x;
-    unsigned int yz_i = 4 * blockDim.x;
-    unsigned int zz_i = 5 * blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xx[ xx_i ] = diff[0] * diff[0] * m;
-        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
-        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
-        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
-        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
-        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            xx[ threadIdx.x ] += xx[ index ];
-            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 ] = xx [ 0 ];
-        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
-        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
-        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
-        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
-        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n)
-{
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int xz_i = 2 * blockDim.x;
-    unsigned int yy_i = 3 * blockDim.x;
-    unsigned int yz_i = 4 * blockDim.x;
-    unsigned int zz_i = 5 * blockDim.x;
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-    if (i < n)
-    {
-        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
-        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
-        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
-        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
-        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
-        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset )
-        {
-            index = threadIdx.x + offset;
-            xx [ threadIdx.x ] += xx [ index ];
-            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0)
-    {
-        output[0] = xx[0];
-        output[1] = xy[xy_i];
-        output[2] = xz[xz_i];
-        output[3] = xz[yy_i];
-        output[4] = xz[yz_i];
-        output[5] = xz[zz_i];
-    }
-}
-
-#if defined( __SM_35__)
-
-CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_xx[];
-    extern __shared__ real my_results_xy[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real xx = 0;
-    real xy = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xx = diff[0] * diff[0] * m;
-        xy = diff[0] * diff[1] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        xx += shfl( xx, z);
-        xy += shfl( xy, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_xx[threadIdx.x >> 5] = xx;    
-        my_results_xy[threadIdx.x >> 5] = xy;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
-            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
-        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_xz[];
-    extern __shared__ real my_results_yy[];
-
-    unsigned int yy_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real xz = 0;
-    real yy = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xz = diff[0] * diff[2] * m;
-        yy = diff[1] * diff[1] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        xz += shfl( xz, z);
-        yy += shfl( yy, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_xz[threadIdx.x >> 5] = xz;    
-        my_results_yy[threadIdx.x >> 5] = yy;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
-            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
-        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_yz[];
-    extern __shared__ real my_results_zz[];
-
-    unsigned int zz_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real yz = 0;
-    real zz = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        yz = diff[1] * diff[2] * m;
-        zz = diff[2] * diff[2] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        yz += shfl( yz, z);
-        zz += shfl( zz, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_yz[threadIdx.x >> 5] = yz;    
-        my_results_zz[threadIdx.x >> 5] = zz;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
-            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
-        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
-    }
-}
-
-#endif
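The deleted center_mass.cu (dropped from the build in the Makefile.am hunk above) uses two standard CUDA reduction idioms: a shared-memory tree reduction over the block, and, under __SM_35__, a warp-level shuffle reduction followed by one shared-memory slot per warp. Below is a minimal, self-contained sketch of the same two-stage pattern for a scalar sum; it is illustrative only, uses __shfl_down_sync (CUDA 9+) instead of the project's shfl() wrapper from cuda_shuffle.h, and assumes blockDim.x is a multiple of warpSize and at most 1024.

/* Hedged sketch of the two-stage reduction idiom from the deleted kernels:
 * warp shuffles first, then one shared-memory slot per warp. */
#include <cuda_runtime.h>

__global__ void block_sum( const double *input, double *block_results, size_t n )
{
    __shared__ double warp_sums[32];    /* one partial sum per warp */
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    double x = (i < n) ? input[i] : 0.0;

    /* stage 1: reduce within each warp using register shuffles */
    for ( int offset = warpSize / 2; offset > 0; offset >>= 1 )
    {
        x += __shfl_down_sync( 0xffffffff, x, offset );
    }

    /* lane 0 of each warp publishes its partial sum */
    if ( threadIdx.x % warpSize == 0 )
    {
        warp_sums[threadIdx.x / warpSize] = x;
    }
    __syncthreads( );

    /* stage 2: the first warp reduces the per-warp sums */
    if ( threadIdx.x < warpSize )
    {
        x = (threadIdx.x < blockDim.x / warpSize) ? warp_sums[threadIdx.x] : 0.0;
        for ( int offset = warpSize / 2; offset > 0; offset >>= 1 )
        {
            x += __shfl_down_sync( 0xffffffff, x, offset );
        }
        if ( threadIdx.x == 0 )
        {
            block_results[blockIdx.x] = x;
        }
    }
}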
diff --git a/PG-PuReMD/src/center_mass.h b/PG-PuReMD/src/center_mass.h
deleted file mode 100644
index 113971ad3f467b6077783b497e8cf170e63d5318..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/center_mass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-#ifndef __CENTER_MASS_H__
-#define __CENTER_MASS_H__
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *,
-                                        rvec *res_xcm,
-                                        rvec *res_vcm,
-                                        rvec *res_amcm,
-                                        size_t n);
-
-#if defined(__SM_35__)
-CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *, reax_atom *,
-        rvec *res_xcm,
-        size_t n);
-CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *, reax_atom *,
-        rvec *res_vcm,
-        size_t n);
-CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *, reax_atom *,
-        rvec *res_amcm,
-        size_t n);
-#endif
-
-
-CUDA_GLOBAL void center_of_mass (rvec *xcm,
-                                 rvec *vcm,
-                                 rvec *amcm,
-                                 rvec *res_xcm,
-                                 rvec *res_vcm,
-                                 rvec *res_amcm,
-                                 size_t n);
-
-CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp,
-                                      reax_atom *atoms,
-                                      real *results,
-                                      real xcm0, real xcm1, real xcm2,
-                                      size_t n);
-
-CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n);
-
-#if defined(__SM_35__)
-CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index 6d695f566b60ecbcb9f7f58d7f88371b12a07852..8f53b65d4b49a95fc68f4acf7698802e650137df 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "charges.h"
 
 #include "allocate.h"
@@ -27,12 +29,6 @@
 #include "lin_alg.h"
 #include "tool_box.h"
 
-#ifdef HAVE_CUDA
-  #include "cuda_charges.h"
-  #include "cuda_lin_alg.h"
-  #include "cuda_validation.h"
-#endif
-
 
 int compare_matrix_entry(const void *v1, const void *v2)
 {
@@ -406,46 +402,6 @@ void Calculate_Charges( reax_system *system, storage *workspace,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
-        mpi_datatypes *mpi_data )
-{
-    int i, scale;
-    real u;//, s_sum, t_sum;
-    rvec2 my_sum, all_sum;
-    reax_atom *atom;
-    real *q;
-
-    my_sum[0] = 0.0;
-    my_sum[1] = 0.0;
-    scale = sizeof(real) / sizeof(void);
-    q = (real *) host_scratch;
-    memset( q, 0, system->N * sizeof (real));
-
-    cuda_charges_x( system, my_sum );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n",
-            my_sum[0], my_sum[1] );
-#endif
-
-    MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
-
-    u = all_sum[0] / all_sum[1];
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "Device: u: %f \n", u );
-#endif
-
-    cuda_charges_st( system, workspace, q, u );
-
-    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
-
-    cuda_charges_updateq( system, q );
-}
-#endif
-
-
 void QEq( reax_system *system, control_params *control, simulation_data *data,
         storage *workspace, output_controls *out_control,
         mpi_datatypes *mpi_data )
@@ -504,57 +460,3 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     }
 #endif
 }
-
-
-#ifdef HAVE_CUDA
-void Cuda_QEq( reax_system *system, control_params *control, simulation_data
-        *data, storage *workspace, output_controls *out_control, mpi_datatypes
-        *mpi_data )
-{
-    int s_matvecs, t_matvecs;
-
-    Cuda_Init_MatVec( system, workspace );
-
-    //if (data->step > 0) {
-    //    compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
-    //    compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
-    // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-    // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-    //}
-
-//#ifdef __CUDA_DEBUG__
-//  Init_MatVec( system, data, control, workspace, mpi_data );
-//#endif
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
-    //Print_Linear_System( system, control, workspace, data->step );
-#endif
-
-    //MATRIX CHANGES
-    s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
-            dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
-            out_control->log, data);
-    t_matvecs = 0;
-    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
-#endif
-
-    Cuda_Calculate_Charges( system, workspace, mpi_data );
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
-    //Print_Charges( system );
-#endif
-
-#if defined(LOG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        data->timing.s_matvecs += s_matvecs;
-        data->timing.t_matvecs += t_matvecs;
-    }
-#endif
-}
-#endif
diff --git a/PG-PuReMD/src/charges.h b/PG-PuReMD/src/charges.h
index faad0d09f3b01f0c76a20a4975edb4d39f513fdf..08af5641406e9cb7d57d260701dbf6df702e4e47 100644
--- a/PG-PuReMD/src/charges.h
+++ b/PG-PuReMD/src/charges.h
@@ -25,11 +25,16 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void QEq( reax_system*, control_params*, simulation_data*,
         storage*, output_controls*, mpi_datatypes* );
 
-void Cuda_QEq( reax_system*, control_params*, simulation_data*,
-        storage*, output_controls*, mpi_datatypes* );
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c
index 5a832affd9758dbfc2f74001b5cbfebf1a2ad86c..a8d46fcb5f3b01061616254ea97cca5d215797b4 100644
--- a/PG-PuReMD/src/comm_tools.c
+++ b/PG-PuReMD/src/comm_tools.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "comm_tools.h"
 
 #include "grid.h"
diff --git a/PG-PuReMD/src/comm_tools.h b/PG-PuReMD/src/comm_tools.h
index 3b0b645f885b4bc1b68882fc042a4487dbc2791d..a0e8d7e5428f193c96ecf15debc544baf48f20ef 100644
--- a/PG-PuReMD/src/comm_tools.h
+++ b/PG-PuReMD/src/comm_tools.h
@@ -24,21 +24,36 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Check_MPI_Error( int, const char * );
+
 void Setup_Comm( reax_system*, control_params*, mpi_datatypes* );
+
 void Update_Comm( reax_system* );
 
 void Sort_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+
 void Estimate_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+
 void Unpack_Exchange_Message( reax_system*, int, void*, int,
-                              neighbor_proc*, int );
+        neighbor_proc*, int );
+
 void Unpack_Estimate_Message( reax_system*, int, void*, int,
-                              neighbor_proc*, int );
+        neighbor_proc*, int );
 
 int SendRecv( reax_system*, mpi_datatypes*_data, MPI_Datatype, int*,
-              message_sorter, unpacker, int );
+        message_sorter, unpacker, int );
 
 void Comm_Atoms( reax_system*, control_params*, simulation_data*, storage*,
-                 reax_list**, mpi_datatypes*, int );
+        reax_list**, mpi_datatypes*, int );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/control.h b/PG-PuReMD/src/control.h
index c6c6ce6c739181243b436a614b29fd28be07dfb5..24cf045176f7a3d82fdb7c71255debbe7b712326 100644
--- a/PG-PuReMD/src/control.h
+++ b/PG-PuReMD/src/control.h
@@ -24,6 +24,16 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 char Read_Control_File( char*, control_params*, output_controls* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_allocate.cu
rename to PG-PuReMD/src/cuda/cuda_allocate.cu
index 7970e9f6b79520b4258c7ab282767ddcf0dfe04e..5c722e56d816a420a93f3ac03c1994f58a1d53c5 100644
--- a/PG-PuReMD/src/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -6,10 +6,10 @@
 #include "cuda_neighbors.h"
 #include "cuda_utils.h"
 
-#include "allocate.h"
-#include "index_utils.h"
-#include "tool_box.h"
-#include "vector.h"
+#include "../allocate.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 extern "C"
 {
diff --git a/PG-PuReMD/src/cuda_allocate.h b/PG-PuReMD/src/cuda/cuda_allocate.h
similarity index 96%
rename from PG-PuReMD/src/cuda_allocate.h
rename to PG-PuReMD/src/cuda/cuda_allocate.h
index 571240464af7b90c639b4dffb744bfd133f4bbb2..0d78d93264f21ac37fc2f7b3a715bdd75c90de78 100644
--- a/PG-PuReMD/src/cuda_allocate.h
+++ b/PG-PuReMD/src/cuda/cuda_allocate.h
@@ -1,7 +1,7 @@
 #ifndef __CUDA_ALLOCATE_H_
 #define __CUDA_ALLOCATE_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 #ifdef __cplusplus
 extern "C"  {
diff --git a/PG-PuReMD/src/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_bond_orders.cu
rename to PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 6e4344aa8f7c83497259f65b02f5112c2fa6443d..bb478a3af0f6e1848ea0db2e7a057ea0e87cd0f3 100644
--- a/PG-PuReMD/src/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -2,11 +2,12 @@
 #include "cuda_bond_orders.h"
 
 #include "cuda_list.h"
-#include "index_utils.h"
-#include "bond_orders.h"
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 
+#include "../index_utils.h"
+#include "../bond_orders.h"
+
 
 CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms, 
         single_body_parameters *sbp, storage p_workspace, int N )
diff --git a/PG-PuReMD/src/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h
similarity index 98%
rename from PG-PuReMD/src/cuda_bond_orders.h
rename to PG-PuReMD/src/cuda/cuda_bond_orders.h
index 8be3a5926615b23a57eec630b010688a46fa2eff..a957b11bbd2a083dd03e4601341ab10af2f60af1 100644
--- a/PG-PuReMD/src/cuda_bond_orders.h
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h
@@ -2,10 +2,9 @@
 #ifndef __CUDA_BOND_ORDERS_H__
 #define __CUDA_BOND_ORDERS_H__
 
-#include "reax_types.h"
-#include "reax_types.h"
+#include "../reax_types.h"
 
-#include "vector.h"
+#include "../vector.h"
 
 extern "C" {
 
diff --git a/PG-PuReMD/src/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
similarity index 98%
rename from PG-PuReMD/src/cuda_bonds.cu
rename to PG-PuReMD/src/cuda/cuda_bonds.cu
index 81f3444b8d436491af38f31af0d0efa3adfcba4d..e3592630d41938714414843b3ce20a9257f8a6c4 100644
--- a/PG-PuReMD/src/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -19,13 +19,12 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-
-#include "index_utils.h"
-#include "reax_types.h"
+#include "cuda_bonds.h"
 
 #include "cuda_list.h"
 
+#include "../index_utils.h"
+
 
 CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
diff --git a/PG-PuReMD/src/cuda_bonds.h b/PG-PuReMD/src/cuda/cuda_bonds.h
similarity index 69%
rename from PG-PuReMD/src/cuda_bonds.h
rename to PG-PuReMD/src/cuda/cuda_bonds.h
index d8a7d273c80559a89f5fc5ba63637806add8ea72..fd9126bee432f48b13de67f7680468907ff484ea 100644
--- a/PG-PuReMD/src/cuda_bonds.h
+++ b/PG-PuReMD/src/cuda/cuda_bonds.h
@@ -22,16 +22,12 @@
 #ifndef __CUDA_BONDS_H_
 #define __CUDA_BONDS_H_
 
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Bonds(    reax_atom *,
-                                global_parameters ,
-                                single_body_parameters *,
-                                two_body_parameters *,
-                                storage ,
-                                reax_list ,
-                                int , int ,
-                                real *
-                           );
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Bonds( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *, storage,
+        reax_list, int, int, real * );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
similarity index 68%
rename from PG-PuReMD/src/cuda_charges.cu
rename to PG-PuReMD/src/cuda/cuda_charges.cu
index c841095298ed5ac09af85c86f92a45d4e0244b21..ada6bf2f75c7960e1b00e820bf8387cbd98199d2 100644
--- a/PG-PuReMD/src/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -21,12 +21,13 @@
 
 #include "cuda_charges.h"
 
-#include "reax_types.h"
+#include "cuda_lin_alg.h"
 #include "cuda_reduction.h"
 #include "cuda_utils.h"
-
 #include "cuda_validation.h"
 
+#include "../basic_comm.h"
+
 
 CUDA_GLOBAL void k_init_matvec( reax_atom *my_atoms, single_body_parameters
         *sbp, storage p_workspace, int n  )
@@ -204,3 +205,93 @@ void cuda_charges_updateq( reax_system *system, real *q )
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
+
+
+void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
+        mpi_datatypes *mpi_data )
+{
+    int i, scale;
+    real u;//, s_sum, t_sum;
+    rvec2 my_sum, all_sum;
+    reax_atom *atom;
+    real *q;
+
+    my_sum[0] = 0.0;
+    my_sum[1] = 0.0;
+    scale = sizeof(real) / sizeof(void);
+    q = (real *) host_scratch;
+    memset( q, 0, system->N * sizeof (real));
+
+    cuda_charges_x( system, my_sum );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n",
+            my_sum[0], my_sum[1] );
+#endif
+
+    MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+
+    u = all_sum[0] / all_sum[1];
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Device: u: %f \n", u );
+#endif
+
+    cuda_charges_st( system, workspace, q, u );
+
+    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
+
+    cuda_charges_updateq( system, q );
+}
+
+
+void Cuda_QEq( reax_system *system, control_params *control, simulation_data
+        *data, storage *workspace, output_controls *out_control, mpi_datatypes
+        *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    //if (data->step > 0) {
+    //    compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
+    //    compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
+    // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
+    // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
+    //}
+
+//#ifdef __CUDA_DEBUG__
+//  Init_MatVec( system, data, control, workspace, mpi_data );
+//#endif
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
+    //Print_Linear_System( system, control, workspace, data->step );
+#endif
+
+    //MATRIX CHANGES
+    s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
+            dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
+            out_control->log, data);
+    t_matvecs = 0;
+    //fprintf (stderr, "Device: First CG completed with iterations: %d \n", s_matvecs);
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
+#endif
+
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
+    //Print_Charges( system );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
+#endif
+}
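Cuda_Calculate_Charges above reduces a two-component sum across ranks with MPI_Allreduce and forms the correction u = all_sum[0] / all_sum[1] before calling cuda_charges_st. Under the usual QEq two-vector splitting (an assumption here; the body of cuda_charges_st is not part of this hunk), that corresponds to the per-atom update q_i = s_i - u * t_i with u = (sum of s) / (sum of t). A plain-C sketch of that update, with illustrative array names rather than the PuReMD data structures:

/* Hedged sketch: the per-atom charge update implied by u = sum(s) / sum(t),
 * assuming the standard QEq splitting q_i = s_i - u * t_i.
 * The arrays s, t, q stand in for fields of the PuReMD workspace/atom structs. */
#include <stddef.h>

static void apply_charge_correction( const double *s, const double *t,
        double *q, double u, size_t n )
{
    size_t i;

    for ( i = 0; i < n; ++i )
    {
        q[i] = s[i] - u * t[i];
    }
}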
diff --git a/PG-PuReMD/src/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h
similarity index 89%
rename from PG-PuReMD/src/cuda_charges.h
rename to PG-PuReMD/src/cuda/cuda_charges.h
index 2d4213895e39ce9c43b18fc3e1d57ea49cfc8cfd..d1922a48b83a6df1b27a14efb38e63d153f89111 100644
--- a/PG-PuReMD/src/cuda_charges.h
+++ b/PG-PuReMD/src/cuda/cuda_charges.h
@@ -22,7 +22,7 @@
 #ifndef __CUDA_CHARGES_H_
 #define __CUDA_CHARGES_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,6 +37,9 @@ void cuda_charges_st( reax_system *, storage *, real *, real );
 
 void cuda_charges_updateq( reax_system *, real * );
 
+void Cuda_QEq( reax_system*, control_params*, simulation_data*,
+        storage*, output_controls*, mpi_datatypes* );
+
 
 #ifdef __cplusplus
 }
diff --git a/PG-PuReMD/src/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_copy.cu
rename to PG-PuReMD/src/cuda/cuda_copy.cu
index a3bfca304d2620a3da96c844cfda16368cd8d714..42055875624ce003a3f2b4ba6661d3bc1e08eb8e 100644
--- a/PG-PuReMD/src/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -2,7 +2,8 @@
 #include "cuda_copy.h"
 
 #include "cuda_utils.h"
-#include "vector.h"
+
+#include "../vector.h"
 
 
 /* Copy grid info from host to device */
diff --git a/PG-PuReMD/src/cuda_copy.h b/PG-PuReMD/src/cuda/cuda_copy.h
similarity index 93%
rename from PG-PuReMD/src/cuda_copy.h
rename to PG-PuReMD/src/cuda/cuda_copy.h
index 51c4314c8fd22152dc4aff0efa821498fd70ae0f..72bf992c581950a95ea3d2fa1c7684a6f4fe1c06 100644
--- a/PG-PuReMD/src/cuda_copy.h
+++ b/PG-PuReMD/src/cuda/cuda_copy.h
@@ -1,24 +1,30 @@
 #ifndef __CUDA_COPY_H_
 #define __CUDA_COPY_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
 void Sync_Atoms( reax_system * );
+
 void Sync_Grid( grid *, grid * );
+
 void Sync_System( reax_system * );
 
 void Prep_Device_For_Output( reax_system *, simulation_data * );
+
 void Output_Sync_Lists( reax_list *host, reax_list *device, int type );
+
 void Output_Sync_Atoms( reax_system * );
-void Output_Sync_Simulation_Data( simulation_data *, simulation_data * );
 
+void Output_Sync_Simulation_Data( simulation_data *, simulation_data * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu
similarity index 100%
rename from PG-PuReMD/src/cuda_environment.cu
rename to PG-PuReMD/src/cuda/cuda_environment.cu
diff --git a/PG-PuReMD/src/cuda_environment.h b/PG-PuReMD/src/cuda/cuda_environment.h
similarity index 56%
rename from PG-PuReMD/src/cuda_environment.h
rename to PG-PuReMD/src/cuda/cuda_environment.h
index f8ae3cd0024b6585e32cf34c46d889d84693b806..1cbcc92c5d29bfa28649bfd9e9815ba332d4cbe8 100644
--- a/PG-PuReMD/src/cuda_environment.h
+++ b/PG-PuReMD/src/cuda/cuda_environment.h
@@ -2,15 +2,19 @@
 #ifndef __CUDA_ENVIRONMENT_H__
 #define __CUDA_ENVIRONMENT_H__
 
+#include "../reax_types.h"
+
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-void Setup_Cuda_Environment (int, int, int);
-void Cleanup_Cuda_Environment ();
+void Setup_Cuda_Environment( int, int, int );
+void Cleanup_Cuda_Environment( );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
similarity index 89%
rename from PG-PuReMD/src/cuda_forces.cu
rename to PG-PuReMD/src/cuda/cuda_forces.cu
index 831a569488ba486aeb02a15580ef4b594911f021..a790b1a8c7fd570723c27c985c458886f0dcb478 100644
--- a/PG-PuReMD/src/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1,24 +1,27 @@
 
 #include "cuda_forces.h"
 
-#include "reax_types.h"
-#include "cuda_list.h"
-#include "cuda_utils.h"
+#include "cuda_bonds.h"
+#include "cuda_bond_orders.h"
+#include "cuda_charges.h"
 #include "cuda_helpers.h"
+#include "cuda_hydrogen_bonds.h"
+#include "cuda_lin_alg.h"
+#include "cuda_list.h"
+#include "cuda_multi_body.h"
 #include "cuda_neighbors.h"
-#include "cuda_bond_orders.h"
+#include "cuda_nonbonded.h"
 #include "cuda_reduction.h"
-#include "cuda_bonds.h"
-#include "cuda_multi_body.h"
-#include "cuda_valence_angles.h"
 #include "cuda_torsion_angles.h"
-#include "cuda_hydrogen_bonds.h"
-#include "tool_box.h"
-#include "cuda_nonbonded.h"
+#include "cuda_utils.h"
+#include "cuda_valence_angles.h"
+#include "cuda_validation.h"
 
-#include "index_utils.h"
-#include "vector.h"
-#include "forces.h"
+#include "../basic_comm.h"
+#include "../forces.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
 CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
@@ -1719,3 +1722,201 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     Cuda_NonBonded_Energy( system, control, workspace, data,
             lists, out_control, (control->tabulate == 0) ? false: true );
 }
+
+
+void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, mpi_datatypes *mpi_data )
+{
+    rvec *f;
+
+    f = (rvec *) host_scratch;
+    memset( f, 0, sizeof(rvec) * system->N );
+
+    Cuda_Total_Forces( system, control, data, workspace );
+
+#if defined(PURE_REAX)
+    /* now all forces are computed to their partially-final values
+     * based on the neighbor information each processor has had.
+     * the final value of the force on each atom needs to be computed by
+     * adding up all partially-final pieces */
+
+    //MVAPICH2
+    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N ,
+            cudaMemcpyDeviceToHost, "total_force:f:get" );
+
+    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
+          sizeof(rvec) / sizeof(void), rvec_unpacker );
+
+    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N,
+            cudaMemcpyHostToDevice, "total_force:f:put" );
+
+    Cuda_Total_Forces_PURE( system, dev_workspace );
+#endif
+
+}
+
+
+int Cuda_Compute_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int charge_flag, retVal;
+
+#if defined(LOG_PERFORMANCE)
+    real t_start = 0;
+
+    //MPI_Barrier( MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        t_start = Get_Time( );
+    }
+#endif
+
+    retVal = SUCCESS;
+
+    /********* init forces ************/
+    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
+    {
+        charge_flag = TRUE;
+    }
+    else
+    {
+        charge_flag = FALSE;
+    }
+
+    if ( charge_flag == TRUE )
+    {
+        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
+
+//        int i;
+//        static reax_list **temp_lists;
+//       
+//        if ( data->step == 0 )
+//        {
+//            temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" );
+//            for ( i = 0; i < LIST_N; ++i )
+//            {
+//                temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
+//                temp_lists[i]->allocated = FALSE;
+//            }
+//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
+//                    TYP_BOND, *temp_lists + BONDS );
+//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
+//                    TYP_HBOND, *temp_lists + HBONDS );
+//        }
+//        else
+//        {
+//            Delete_List( *temp_lists + BONDS );
+//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
+//                    TYP_BOND, *temp_lists + BONDS );
+//            Delete_List( *temp_lists + HBONDS );
+//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
+//                    TYP_HBOND, *temp_lists + HBONDS );
+//
+//        }
+//        Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND );
+//        Print_Bonds( system, temp_lists, control );
+//        Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND );
+//        Print_HBonds( system, temp_lists, control, data->step );
+//        Print_HBond_Indices( system, temp_lists, control, data->step );
+//        exit( 0 );
+    }
+    else
+    {
+        retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
+    }
+
+    if ( retVal == SUCCESS )
+    {
+        //validate_sparse_matrix( system, workspace );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.init_forces) );
+        }
+#endif
+
+        /********* bonded interactions ************/
+        retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: completed bonded\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    }
+
+    if ( retVal == SUCCESS )
+    {
+    /**************** charges ************************/
+#if defined(PURE_REAX)
+        if ( charge_flag == TRUE )
+        {
+            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+        }
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.qEq) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+#endif //PURE_REAX
+
+        /********* nonbonded interactions ************/
+        Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.nonb) );
+        }
+#endif
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
+                system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        /*********** total force ***************/
+        Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
+#endif
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: total forces computed\n",
+                system->my_rank, data->step );
+        //Print_Total_Force( system, data, workspace );
+        MPI_Barrier( MPI_COMM_WORLD );
+
+#endif
+
+//        Print_Forces( system );
+    }
+
+    return retVal;
+}
diff --git a/PG-PuReMD/src/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h
similarity index 86%
rename from PG-PuReMD/src/cuda_forces.h
rename to PG-PuReMD/src/cuda/cuda_forces.h
index 9dc3da66a5b5d2e08170455b6c43226b2080a345..4abdb52f6ba52703c1876d927aed490d87aef0b5 100644
--- a/PG-PuReMD/src/cuda_forces.h
+++ b/PG-PuReMD/src/cuda/cuda_forces.h
@@ -2,7 +2,7 @@
 #ifndef __CUDA_FORCES_H__
 #define __CUDA_FORCES_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
@@ -32,6 +32,9 @@ void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *,
         simulation_data *, storage *, reax_list **, output_controls *,
         mpi_datatypes * );
 
+int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
 void Print_Forces( reax_system * );
 
 
diff --git a/PG-PuReMD/src/cuda_helpers.h b/PG-PuReMD/src/cuda/cuda_helpers.h
similarity index 97%
rename from PG-PuReMD/src/cuda_helpers.h
rename to PG-PuReMD/src/cuda/cuda_helpers.h
index a4943a5f891c94776404247e405ab197b8258912..b14f45b331b5ff6bfcee4d3ac2a9c8df626b775b 100644
--- a/PG-PuReMD/src/cuda_helpers.h
+++ b/PG-PuReMD/src/cuda/cuda_helpers.h
@@ -1,7 +1,7 @@
 #ifndef __CUDA_HELPERS__
 #define __CUDA_HELPERS__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 CUDA_DEVICE static inline int cuda_strcmp( char * a,
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_hydrogen_bonds.cu
rename to PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index 95eda0816c73f9c1261c96fbc3bdb72fab104621..18cdbb57beef84783c9ec928532cc006ae93a5ea 100644
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -21,16 +21,14 @@
 
 #include "cuda_hydrogen_bonds.h"
 
-#include "reax_types.h"
-#include "index_utils.h"
-
 #include "cuda_valence_angles.h"
 #include "cuda_helpers.h"
 #include "cuda_list.h"
-#include "vector.h"
-
 #include "cuda_shuffle.h"
 
+#include "../index_utils.h"
+#include "../vector.h"
+
 
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameters *sbp, 
         hbond_parameters *d_hbp, global_parameters gp, control_params *control, 
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
new file mode 100644
index 0000000000000000000000000000000000000000..606196b4efb68171b6039a85c7a8ac5b2767bc87
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
@@ -0,0 +1,48 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_HBONDS_H_
+#define __CUDA_HBONDS_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *,
+        storage, reax_list );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *,
+        storage, reax_list, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        global_parameters, control_params *, storage ,
+        reax_list, reax_list, int, int, real *, rvec * );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        global_parameters , control_params *, storage,
+        reax_list, reax_list, int, int, real *, rvec * );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fb1ac0df5abdccc5c021ee594caf4c34fe3dcd22
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -0,0 +1,412 @@
+
+#include "cuda_init_md.h"
+
+#include "cuda_allocate.h"
+#include "cuda_list.h"
+#include "cuda_copy.h"
+#include "cuda_forces.h"
+#include "cuda_integrate.h"
+#include "cuda_neighbors.h"
+#include "cuda_reset_tools.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+#include "cuda_validation.h"
+
+#if defined(PURE_REAX)
+  #include "../box.h"
+  #include "../comm_tools.h"
+  #include "../grid.h"
+  #include "../init_md.h"
+  #include "../integrate.h"
+  #include "../io_tools.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+  #include "../lookup.h"
+#ifdef __cplusplus
+}
+#endif
+  #include "../random.h"
+  #include "../reset_tools.h"
+  #include "../tool_box.h"
+  #include "../vector.h"
+#elif defined(LAMMPS_REAX)
+  #include "../reax_box.h"
+  #include "../reax_comm_tools.h"
+  #include "../reax_grid.h"
+  #include "../reax_init_md.h"
+  #include "../reax_integrate.h"
+  #include "../reax_io_tools.h"
+  #include "../reax_list.h"
+  #include "../reax_lookup.h"
+  #include "../reax_random.h"
+  #include "../reax_reset_tools.h"
+  #include "../reax_tool_box.h"
+  #include "../reax_vector.h"
+#endif
+
+
+void Cuda_Init_ScratchArea( )
+{
+    cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" );
+
+    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" );
+}
+
+
+int Cuda_Init_System( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, char *msg )
+{
+    int i, ret;
+    reax_atom *atom;
+    int nrecv[MAX_NBRS];
+
+    Setup_New_Grid( system, control, MPI_COMM_WORLD );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d GRID:\n", system->my_rank );
+    Print_Grid( &(system->my_grid), stderr );
+#endif
+
+    Bin_My_Atoms( system, &(workspace->realloc) );
+    Reorder_My_Atoms( system, workspace );
+
+    /* estimate N and total capacity */
+    for ( i = 0; i < MAX_NBRS; ++i )
+    {
+        nrecv[i] = 0;
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    system->max_recved = 0;
+    system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
+            Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
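+    /* over-allocate atom storage (SAFE_ZONE factor with a MIN_CAP floor) to leave headroom for later growth */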
+    system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
+    Bin_Boundary_Atoms( system );
+
+    /* Sync atoms here to continue the computation */
+    dev_alloc_system( system );
+    Sync_System( system );
+
+    /* estimate numH and Hcap */
+    Cuda_Reset_Atoms( system, control );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: n=%d local_cap=%d\n",
+             system->my_rank, system->n, system->local_cap );
+    fprintf( stderr, "p%d: N=%d total_cap=%d\n",
+             system->my_rank, system->N, system->total_cap );
+    fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
+             system->my_rank, system->numH, system->Hcap );
+#endif
+
+    Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
+
+    Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
+
+//    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
+//    {
+//        return FAILURE;
+//    }
+
+    /* initialize velocities so that desired init T can be attained */
+    if ( !control->restart || (control->restart && control->random_vel) )
+    {
+        Generate_Initial_Velocities( system, control->T_init );
+    }
+
+    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+    return SUCCESS;
+}
+
+
+void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, char *msg )
+{
+    dev_alloc_simulation_data( data );
+
+    Reset_Simulation_Data( data );
+
+    if ( !control->restart )
+    {
+        data->step = data->prev_steps = 0;
+    }
+
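+    /* pick the integrator routine (Cuda_Evolve) and degrees of freedom (N_f) for the requested ensemble */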
+    switch ( control->ensemble )
+    {
+    case NVE:
+        data->N_f = 3 * system->bigN;
+        Cuda_Evolve = Velocity_Verlet_NVE;
+        control->virial = 0;
+        break;
+
+    case bNVT:
+        data->N_f = 3 * system->bigN + 1;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
+        control->virial = 0;
+        break;
+
+    case nhNVT:
+        fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
+        data->N_f = 3 * system->bigN + 1;
+        Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+        control->virial = 0;
+        if ( !control->restart || (control->restart && control->random_vel) )
+        {
+            data->therm.G_xi = control->Tau_T *
+                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->therm.v_xi_old = 0;
+            data->therm.xi = 0;
+        }
+        break;
+
+    case sNPT: /* Semi-Isotropic NPT */
+        data->N_f = 3 * system->bigN + 4;
+        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+        if ( !control->restart )
+        {
+            Reset_Pressures( data );
+        }
+        break;
+
+    case iNPT: /* Isotropic NPT */
+        data->N_f = 3 * system->bigN + 2;
+        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+        if ( !control->restart )
+        {
+            Reset_Pressures( data );
+        }
+        break;
+
+    case NPT: /* Anisotropic NPT */
+        data->N_f = 3 * system->bigN + 9;
+        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+
+        fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
+        break;
+
+    default:
+        fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
+    }
+
+    /* initialize the timer(s) */
+    MPI_Barrier( MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.start = Get_Time( );
+
+#if defined(LOG_PERFORMANCE)
+        Reset_Timing( &data->timing );
+#endif
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
+#endif
+}
+
+
+void Cuda_Init_Workspace( reax_system *system, control_params *control,
+        storage *workspace, char *msg )
+{
+    dev_alloc_workspace( system, control, dev_workspace,
+            system->local_cap, system->total_cap, msg );
+
+    memset( &(workspace->realloc), 0, sizeof(reallocate_data) );
+    Cuda_Reset_Workspace( system, workspace );
+
+    /* Initialize the Taper function */
+    Init_Taper( control, dev_workspace );
+}
+
+
+int Cuda_Init_Lists( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
+{
+    int ret;
+   
+    /* ignore returned error, as system->d_max_far_nbrs was not valid */
+    ret = Cuda_Estimate_Neighbors( system, data->step );
+
+    Dev_Make_List( system->total_cap, system->total_far_nbrs,
+            TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
+            system->my_rank, system->total_far_nbrs,
+            (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
+    fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap );
+#endif
+
+    Cuda_Init_Neighbor_Indices( system );
+
+    Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
+
+    /* estimate storage for bonds and hbonds */
+    Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
+
+    /* estimate storage for charge sparse matrix */
+//    Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists );
+
+    dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
+
+    Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
+
+    //MATRIX CHANGES
+    //workspace->L = NULL;
+    //workspace->U = NULL;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n",
+            system->my_rank, system->total_cm_entries, dev_workspace->H.m );
+    fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
+            system->my_rank, Htop,
+            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+#endif
+
+    if ( control->hbond_cut > 0.0 &&  system->numH > 0 )
+    {
+        Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
+//        Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS );
+
+        Cuda_Init_HBond_Indices( system );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
+                system->my_rank, system->total_hbonds,
+                (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+#endif
+    }
+
+    /* bonds list */
+    Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
+//    Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS );
+
+    Cuda_Init_Bond_Indices( system );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
+            system->my_rank, system->total_bonds,
+            (int)(system->total_bonds * sizeof(bond_data) / (1024 * 1024)) );
+#endif
+
+    /* 3bodies list: since a more accurate estimate of the num.
+     * of three body interactions requires that bond orders have
+     * been computed, delay estimation until the force computation */
+
+    return SUCCESS;
+}
+
+
+void Cuda_Initialize( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    char msg[MAX_STR];
+    real t_start, t_end;
+
+    /* HOST/DEVICE SCRATCH */
+    Cuda_Init_ScratchArea( );
+
+    /* MPI_DATATYPES */
+    if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
+                 system->my_rank );
+        fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+    /* SYSTEM */
+    if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+    /* GRID */
+    dev_alloc_grid( system );
+    Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+    //validate_grid( system );
+
+    /* SIMULATION_DATA */
+    Cuda_Init_Simulation_Data( system, control, data, msg );
+
+    /* WORKSPACE */
+    Cuda_Init_Workspace( system, control, workspace, msg );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
+#endif
+
+    //Sync the taper here from host to device.
+
+    /* CONTROL */
+    dev_alloc_control( control );
+
+    /* LISTS */
+    if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
+            FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
+#endif
+
+    /* OUTPUT Files */
+    if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: could not open output files! terminating...\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: output files opened\n", system->my_rank );
+#endif
+
+    /* Lookup Tables */
+    if ( control->tabulate )
+    {
+        if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE )
+        {
+            fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+            fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
+                     system->my_rank );
+            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+        }
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
+#endif
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
+#endif
+}
+
+
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.h b/PG-PuReMD/src/cuda/cuda_init_md.h
new file mode 100644
index 0000000000000000000000000000000000000000..328674a5961fdc905b1b3f57ea085ab1d60e17af
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_init_md.h
@@ -0,0 +1,22 @@
+
+#ifndef __CUDA_INIT_MD_H__
+#define __CUDA_INIT_MD_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Cuda_Init_ScratchArea( );
+
+void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dcb972921f735d419e6528a390ed990699c57d2c
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -0,0 +1,249 @@
+
+#include "cuda_integrate.h"
+
+#include "cuda_allocate.h"
+#include "cuda_forces.h"
+#include "cuda_integrate.h"
+#include "cuda_copy.h"
+#include "cuda_neighbors.h"
+#include "cuda_reset_tools.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+
+#include "../comm_tools.h"
+#include "../grid.h"
+#include "../vector.h"
+
+
+CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms, 
+        single_body_parameters *sbp, real dt, int n )
+{
+    real inv_m;
+    rvec dx;
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* velocity verlet, 1st part */
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute x(t + dt) */
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    rvec_Add( atom->x, dx );
+    /* Compute v(t + dt/2) */
+    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+}
+
+
+void bNVT_update_velocity_part1( reax_system *system, real dt )
+{
+    int blocks;
+
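+    /* one thread per local atom; round the block count up so all n atoms are covered */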
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms, 
+        single_body_parameters *sbp, real dt, int n )
+{
+    reax_atom *atom;
+    real inv_m;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* velocity verlet, 2nd part */
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute v(t + dt) */
+    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+}
+
+
+void bNVT_update_velocity_part2( reax_system *system, real dt )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n )
+{
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* scale the velocities at t+dt */
+    atom = &(my_atoms[i]);
+    rvec_Scale( atom->v, lambda, atom->v );
+}
+
+
+void bNVT_scale_velocities( reax_system *system, real lambda )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, lambda, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int i, steps, renbr, ret;
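+    /* these flags persist across calls so that a step returning FAILURE can be
+     * re-entered without repeating the phases that already completed */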
+    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    real inv_m, dt, lambda;
+    rvec dx;
+    reax_atom *atom;
+    int *bond_top, *hb_top;
+    int Htop, num_3body;
+    int total_hbonds, count, total_bonds;
+    int bond_cap, cap_3body;
+    real t_over_start, t_over_elapsed;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    if ( verlet_part1_done == FALSE )
+    {
+        /* velocity verlet, 1st part */
+        bNVT_update_velocity_part1( system, dt );
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* synch the Grid to the Device here */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+
+#if defined(__CUDA_DEBUG_LOG__)
+        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
+                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
+#endif
+    }
+    
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if ( renbr )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time ();
+#endif
+
+        if ( estimate_nbrs_done == 0 )
+        {
+            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
+            ret = Cuda_Estimate_Neighbors( system, data->step );
+            estimate_nbrs_done = 1;
+        }
+
+        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        {
+            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            estimate_nbrs_done = 2;
+    
+#if defined(DEBUG)
+            t_over_elapsed  = Get_Timing_Info( t_over_start );
+            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                    system->my_rank, data->step, t_over_elapsed );
+#endif
+        }
+    }
+
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        /* velocity verlet, 2nd part */
+        bNVT_update_velocity_part2( system, dt );
+
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        /* temperature scaler */
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
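+        /* Berendsen weak-coupling scale factor, clamped to [MIN_dT, MAX_dT] */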
+        lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+        if ( lambda < MIN_dT )
+        {
+            lambda = MIN_dT;
+        }
+        else if (lambda > MAX_dT )
+        {
+            lambda = MAX_dT;
+        }
+        lambda = SQRT( lambda );
+
+        /* scale the velocities at t+dt */
+        bNVT_scale_velocities( system, lambda );
+
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: scaled velocities\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        verlet_part1_done = FALSE;
+        estimate_nbrs_done = 0;
+    }
+
+    return ret;
+}
diff --git a/PG-PuReMD/src/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_integrate.h
similarity index 86%
rename from PG-PuReMD/src/cuda_integrate.h
rename to PG-PuReMD/src/cuda/cuda_integrate.h
index b71e14e330ff4c9662ec94f4c077febfc4f269ce..2797b3e33397685fc1cfdc05dedead43fb98fe74 100644
--- a/PG-PuReMD/src/cuda_integrate.h
+++ b/PG-PuReMD/src/cuda/cuda_integrate.h
@@ -22,18 +22,26 @@
 #ifndef __CUDA_INTEGRATE_H_
 #define __CUDA_INTEGRATE_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void bNVT_update_velocity_part1( reax_system *, real );
+
 void bNVT_update_velocity_part2( reax_system *, real );
+
 void bNVT_scale_velocities( reax_system *, real );
 
+int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dc7a2fc344e8c82d2758f89052b4cad34fb578ba
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -0,0 +1,1113 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_lin_alg.h"
+
+#include "cuda_shuffle.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+
+#include "../basic_comm.h"
+
+
+//one thread per row
+CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results,
+        int rows )
+{
+    int i, col;
+    real results_row;
+    real val;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= rows )
+    {
+        return;
+    }
+
+    results_row = 0;
+
+    for (int c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
+
+        results_row += val * vec[col];
+    }
+
+    results[i] = results_row;
+}
+
+
+//32 thread warp per matrix row.
+//invoked as follows
+// <<< system->N, 32 >>>
+//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
+CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results,
+        int num_rows )
+{
+#if defined(__SM_35__)
+    real vals;
+    int x;
+#else
+    extern __shared__ real vals[];
+#endif
+    int jj;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    int row = warp_id;
+    
+#if defined(__SM_35__)
+    vals = 0;
+#else
+    vals[threadIdx.x] = 0;
+#endif
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        // compute running sum per thread
+        for ( jj = row_start + lane; jj < row_end;
+                jj += MATVEC_KER_THREADS_PER_ROW )
+#if defined(__SM_35__)
+        {
+            vals += H.entries[jj].val * vec[ H.entries[jj].j ];
+        }
+    }
+#else
+        {
+            vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ];
+        }
+    }
+
+    __syncthreads( );
+#endif
+
+    // parallel reduction in shared memory
+    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+#if defined(__SM_35__)
+    for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
+    {
+        vals += shfl( vals, x );
+    }
+
+    if (lane == 0 && row < num_rows)
+    {
+        results[row] = vals;
+    }
+#else
+    if (lane < 16)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 16];
+    }
+    __syncthreads( );
+    if (lane < 8)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 8];
+    }
+    __syncthreads( );
+    if (lane < 4)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 4];
+    }
+    __syncthreads( );
+    if (lane < 2)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 2];
+    }
+    __syncthreads( );
+    if (lane < 1)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 1];
+    }
+    __syncthreads( );
+
+    // first thread writes the result
+    if (lane == 0 && row < num_rows)
+    {
+        results[row] = vals[threadIdx.x];
+    }
+#endif
+}
+
+
+//one thread per row
+CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results,
+        int rows )
+{
+    int i, c, col;
+    rvec2 results_row;
+    real val;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= rows)
+    {
+        return;
+    }
+
+    results_row[0] = 0.0;
+    results_row[1] = 0.0;
+
+    for (c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
+
+        results_row[0] += val * vec [col][0];
+        results_row[1] += val * vec [col][1];
+    }
+
+    results[i][0] = results_row[0];
+    results[i][1] = results_row[1];
+}
+
+
+//32 thread warp per matrix row.
+//invoked as follows
+// <<< system->N, 32 >>>
+//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
+CUDA_GLOBAL void  k_dual_matvec_csr( sparse_matrix H, rvec2 *vec,
+        rvec2 *results, int num_rows )
+{
+#if defined(__SM_35__)
+    rvec2 rvals;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    int row = warp_id;
+
+    rvals[0] = 0;
+    rvals[1] = 0;
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
+        {
+            rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
+
+    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2)
+    {
+        rvals[0] += shfl( rvals[0], s);
+        rvals[1] += shfl( rvals[1], s);
+    }
+
+    if (lane == 0 && row < num_rows)
+    {
+        results[row][0] = rvals[0];
+        results[row][1] = rvals[1];
+    }
+
+#else
+    extern __shared__ rvec2 rvals[];
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / 32;
+    int lane = thread_id & (32 - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    //int row = warp_id;
+    int row = warp_id;
+
+    rvals[threadIdx.x][0] = 0;
+    rvals[threadIdx.x][1] = 0;
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        // compute running sum per thread
+        for(int jj = row_start + lane; jj < row_end; jj += 32)
+        {
+            rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
+
+    __syncthreads( );
+
+    // parallel reduction in shared memory
+    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+    if (lane < 16)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; 
+    }
+    __syncthreads( );
+    if (lane < 8)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; 
+    }
+    __syncthreads( );
+    if (lane < 4)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; 
+    }
+    __syncthreads( );
+    if (lane < 2)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; 
+    }
+    __syncthreads( );
+    if (lane < 1)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; 
+    }
+    __syncthreads( );
+
+    // first thread writes the result
+    if (lane == 0 && row < num_rows)
+    {
+        results[row][0] = rvals[threadIdx.x][0];
+        results[row][1] = rvals[threadIdx.x][1];
+    }
+
+#endif
+}
+
+
+void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count )
+{
+    //res = ax + by
+    //use the cublas here
+    int blocks;
+
+    blocks = (count / DEF_BLOCK_SIZE) + 
+        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
+        ( res, a, x, b, y, count );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
+{
+    //res = a*b - vector multiplication
+    //use the cublas here.
+    int blocks;
+
+    blocks = (count / DEF_BLOCK_SIZE) + 
+        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
+        ( res, a, b, count );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
+{
+    storage *workspace;
+    int j;
+   
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( j >= n )
+    {
+        return;
+    }
+
+    workspace = &( p_workspace );
+
+    //for( j = 0; j < system->n; ++j ) {
+    // residual 
+    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
+    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
+
+    // apply diagonal pre-conditioner
+    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
+    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
+    //}
+}
+
+
+void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        (*workspace, b, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, 
+        real alpha_0, real alpha_1, int n, rvec2 *my_dot )
+{
+    storage *workspace;
+    rvec2 alpha;
+    int j;
+   
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( j >= n )
+    {
+        return;
+    }
+
+    workspace = &( p_workspace );
+    alpha[0] = alpha_0;
+    alpha[1] = alpha_1;
+    my_dot[j][0] = my_dot[j][1] = 0.0;
+
+    //for( j = 0; j < system->n; ++j ) {
+    // update x 
+    x[j][0] += alpha[0] * workspace->d2[j][0];
+    x[j][1] += alpha[1] * workspace->d2[j][1];      
+
+    // update residual 
+    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
+    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
+
+    // apply diagonal pre-conditioner 
+    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+
+    // dot product: r.p 
+    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
+    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
+    //}
+}
+
+
+void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
+        int n, rvec2 result )
+{
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    cuda_memset( tmp, 0, sizeof(rvec2) * ( 2 * n + 1),
+            "cuda_dualcg_preconditioner" );
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        (*workspace, x, alpha[0], alpha[1], n, tmp);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    //Reduction to calculate my_dot
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( tmp, tmp + n, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        ( tmp + n, tmp + 2*n, blocks);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
+            cudaMemcpyDeviceToHost, "my_dot" );
+}
+
+
+void Cuda_Norm( rvec2 *arr, int n, rvec2 result )
+{
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        (arr, tmp, n, INITIAL);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
+}
+
+
+void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n )
+{
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( a, b, tmp, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> 
+    //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * BLOCKS_POW_2 >>> 
+        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_dot" );
+}
+
+
+void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
+        ( x, a, b[0], b[1], c, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= n)
+    {
+        return;
+    }
+
+    dst[i] = src[i][index];
+}
+
+
+void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= n)
+    {
+        return;
+    }
+
+    dst[i][index] = src[i];
+}
+
+
+void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
+    cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );
+
+    //One thread per row implementation
+    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+    //        (*H, a, b, n);
+    //cudaThreadSynchronize ();
+    //cudaCheckError ();
+
+    //One warp per row implementation
+#if defined(__SM_35__)
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
+#else
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
+                      sizeof(rvec2) * MATVEC_BLOCK_SIZE >>>
+#endif
+            ( *H, a, b, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
+    cuda_memset( b, 0, sizeof(real) * size, "matvec:result" );
+
+    //one thread per row implementation
+    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+    //        (*H, a, b, n);
+    //cudaThreadSynchronize ();
+    //cudaCheckError ();
+
+#if defined(__SM_35__)
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
+#else
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
+                 sizeof(real) * MATVEC_BLOCK_SIZE>>>
+#endif
+                     (*H, a, b, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
+        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
+        simulation_data *data )
+{
+    int  i, j, n, N, matvecs, scale;
+    rvec2 tmp, alpha, beta;
+    rvec2 my_sum, norm_sqr, b_norm, my_dot;
+    rvec2 sig_old, sig_new;
+    MPI_Comm comm;
+    rvec2 *spad = (rvec2 *) host_scratch;
+    int a;
+
+    n = system->n;
+    N = system->N;
+    comm = mpi_data->world;
+    matvecs = 0;
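+    /* note: sizeof(void) is 1 under the GNU C extension honored by gcc/nvcc */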
+    scale = sizeof(rvec2) / sizeof(void);
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        matvecs = 0;
+        t_start = matvec_time = dot_time = 0;
+        t_start = Get_Time( );
+    }
+#endif
+
+    //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
+//#endif
+
+//  check_zeros_device( x, system->N, "x" );
+
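+    /* stage x through host scratch for the MPI exchange (Dist), then copy it back to the device */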
+    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
+    Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
+
+//  check_zeros_device( x, system->N, "x" );
+
+//  compare_rvec2 (workspace->x, x, N, "x");
+//  if (data->step > 0) {
+//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
+//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
+//
+//      exit (0);
+//  }
+
+
+//#ifdef __CUDA_DEBUG__
+//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
+//#endif
+    //originally we were using only H->n which was system->n (init_md.c)
+    //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
+    
+    Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
+
+//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+//  if (data->step > 0) exit (0);
+
+    // tryQEq
+    //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
+//#endif
+    
+    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+            cudaMemcpyDeviceToHost, "CG:q2:get" );
+    Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+            cudaMemcpyHostToDevice,"CG:q2:put" );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &matvec_time );
+    }
+#endif
+
+//#ifdef __CUDA_DEBUG__
+//  for( j = 0; j < system->n; ++j ) {
+//    // residual
+//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
+//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
+//    // apply diagonal pre-conditioner
+//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+//  }
+//#endif
+    
+    Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
+
+//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
+//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
+
+    /* norm of b */
+//#ifdef __CUDA_DEBUG__
+//  my_sum[0] = my_sum[1] = 0;
+//  for( j = 0; j < n; ++j ) {
+//    my_sum[0] += SQR( workspace->b[j][0] );
+//    my_sum[1] += SQR( workspace->b[j][1] );
+//  }
+//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
+//#endif
+
+    my_sum[0] = my_sum[1] = 0;
+    Cuda_Norm (b, n, my_sum);
+
+//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
+
+    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
+    b_norm[0] = SQRT( norm_sqr[0] );
+    b_norm[1] = SQRT( norm_sqr[1] );
+    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
+
+    /* dot product: r.d */
+//#ifdef __CUDA_DEBUG__
+//  my_dot[0] = my_dot[1] = 0;
+//  for( j = 0; j < n; ++j ) {
+//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
+//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
+//  }
+//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+    my_dot[0] = my_dot[1] = 0;
+    Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
+
+// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+    
+    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
+    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &dot_time );
+    }
+#endif
+
+    for ( i = 1; i < 300; ++i )
+    {
+        //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
+//#endif
+        
+        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cg:d2:get" );
+        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyHostToDevice, "cg:d2:put" );
+
+        //print_device_rvec2 (dev_workspace->d2, N);
+
+//#ifdef __CUDA_DEBUG__
+//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
+//#endif
+        
+        Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
+                system->total_cap );
+
+        /*
+        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
+        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
+        print_sparse_matrix_host (&workspace->H);
+        fprintf (stderr, " ******* HOST Vector ***************\n");
+        print_host_rvec2 (workspace->d2, system->N);
+        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
+        print_sparse_matrix (&dev_workspace->H);
+        fprintf (stderr, " ******* Device Vector ***************\n");
+        print_device_rvec2 (dev_workspace->d2, system->N);
+        */
+        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+        // tryQEq
+        // MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
+//#endif
+
+        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cg:q2:get" );
+        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
+        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyHostToDevice, "cg:q2:put" );
+
+//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &matvec_time );
+        }
+#endif
+
+        /* dot product: d.q */
+//#ifdef __CUDA_DEBUG__
+//    my_dot[0] = my_dot[1] = 0;
+//    for( j = 0; j < n; ++j ) {
+//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
+//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
+//    }
+//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+        my_dot[0] = my_dot[1] = 0;
+        Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
+        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
+        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
+
+        alpha[0] = sig_new[0] / tmp[0];
+        alpha[1] = sig_new[1] / tmp[1];
+        my_dot[0] = my_dot[1] = 0;
+
+//#ifdef __CUDA_DEBUG__
+//    for( j = 0; j < system->n; ++j ) {
+//      // update x
+//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
+//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
+//      // update residual
+//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
+//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
+//      // apply diagonal pre-conditioner
+//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+//      // dot product: r.p
+//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
+//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
+//    }
+//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+        my_dot[0] = my_dot[1] = 0;
+        Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
+
+        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
+//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
+//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
+
+        sig_old[0] = sig_new[0];
+        sig_old[1] = sig_new[1];
+        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
+        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &dot_time );
+        }
+#endif
+
+        if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        beta[0] = sig_new[0] / sig_old[0];
+        beta[1] = sig_new[1] / sig_old[1];
+
+//#ifdef __CUDA_DEBUG__
+//    for( j = 0; j < system->n; ++j ) {
+//      // d = p + beta * d
+//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
+//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
+//    }
+//#endif
+
+        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
+                dev_workspace->d2, system->n );
+
+//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
+    }
+
+
+    if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
+    {
+        //for( j = 0; j < n; ++j )
+        //  workspace->t[j] = workspace->x[j][1];
+        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
+
+        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
+
+        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
+        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
+
+        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
+                mpi_data, fout );
+
+        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
+        //for( j = 0; j < n; ++j )
+        //  workspace->x[j][1] = workspace->t[j];
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
+    }
+    else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
+    {
+        //for( j = 0; j < n; ++j )
+        //  workspace->s[j] = workspace->x[j][0];
+
+        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
+
+        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
+        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
+
+        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
+
+        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
+                mpi_data, fout );
+
+        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
+        //for( j = 0; j < system->n; ++j )
+        //  workspace->x[j][0] = workspace->s[j];
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
+    }
+
+    if ( i >= 300 )
+    {
+        fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
+                system->my_rank, i );
+        fprintf( stderr, "    [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] );
+        fprintf( stderr, "    [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] );
+    }
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
+                i + 1, matvecs, matvec_time, dot_time );
+    }
+#endif
+
+    return (i + 1) + matvecs;
+}
+
+
+int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
+        *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+{
+    int  i, j, scale;
+    real tmp, alpha, beta, b_norm;
+    real sig_old, sig_new, sig0;
+    real *spad = (real *) host_scratch;
+
+    scale = sizeof(real) / sizeof(void);
+
+    /* x is on the device */
+    //MVAPICH2
+    memset( spad, 0, sizeof(real) * system->total_cap );
+    copy_host_device( spad, x, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
+    Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
+
+    //MVAPICH2
+    copy_host_device( spad, x, sizeof(real) * system->total_cap,
+            cudaMemcpyHostToDevice, "cuda_cg:x:put" );
+    Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
+
+    // tryQEq
+    // MVAPICH2
+    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+    Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+
+    //MVAPICH2
+    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+            cudaMemcpyHostToDevice, "cuda_cg:q:put" );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &matvec_time );
+    }
+#endif
+
+    Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
+            system->n );
+    //for( j = 0; j < system->n; ++j )
+    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
+            dev_workspace->Hdia_inv, system->n );
+
+    //TODO do the parallel_norm on the device for the local sum
+    copy_host_device( spad, b, sizeof(real) * system->n,
+            cudaMemcpyDeviceToHost, "cuda_cg:b:get" );
+    b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
+
+    //TODO do the parallel dot on the device for the local sum
+    copy_host_device( spad, dev_workspace->r, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
+    copy_host_device( spad + system->total_cap, dev_workspace->d, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+    sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
+            mpi_data->world );
+
+    sig0 = sig_new;
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &dot_time );
+    }
+#endif
+
+    for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
+    {
+        //MVAPICH2
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+        Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
+                cudaMemcpyHostToDevice, "cuda_cg:d:put" );
+
+        Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
+
+        //tryQEq
+        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+        Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+                cudaMemcpyHostToDevice, "cuda_cg:q:put" );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &matvec_time );
+        }
+#endif
+
+        //TODO do the parallel dot on the device for the local sum
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+        copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+        tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
+
+        alpha = sig_new / tmp;
+        //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
+        Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
+
+        //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
+        Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
+                dev_workspace->r, system->n );
+        /* pre-conditioning */
+        //for( j = 0; j < system->n; ++j )
+        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
+                dev_workspace->Hdia_inv, system->n );
+
+        sig_old = sig_new;
+
+        //TODO do the parallel dot on the device for the local sum
+        copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
+        copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
+        sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
+        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
+
+        beta = sig_new / sig_old;
+        Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
+                dev_workspace->d, system->n );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &dot_time );
+        }
+#endif
+    }
+
+    if ( i >= 300 )
+    {
+        fprintf( stderr, "[WARNING] p%d: CG convergence failed! (%d steps)\n",
+                system->my_rank, i );
+    }
+
+    return i;
+}
diff --git a/PG-PuReMD/src/cuda_lin_alg.h b/PG-PuReMD/src/cuda/cuda_lin_alg.h
similarity index 52%
rename from PG-PuReMD/src/cuda_lin_alg.h
rename to PG-PuReMD/src/cuda/cuda_lin_alg.h
index a7e3cc5f471d030e38b29f8344296e68b78b5ad6..aa31c126642b8eb560256390c89698df99b34a73 100644
--- a/PG-PuReMD/src/cuda_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.h
@@ -22,29 +22,44 @@
 #ifndef __CUDA_LIN_ALG_H_
 #define __CUDA_LIN_ALG_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+void Cuda_Vector_Sum( real *, real, real *, real, real *, int );
 
-void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count);
-void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count);
-void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n);
-void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *, rvec2 alpha, int n, rvec2 result);
-void Cuda_Norm(rvec2 *arr, int n, rvec2 result);
-void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n);
-void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *, rvec2 , rvec2 *c, int n);
-void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n);
-void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n);
-void Cuda_Dual_Matvec(sparse_matrix *, rvec2 *, rvec2 *, int , int);
-void Cuda_Matvec(sparse_matrix *, real *, real *, int , int);
+void Cuda_CG_Preconditioner( real *, real *, real *, int );
 
+void Cuda_CG_Diagonal_Preconditioner( storage *, rvec2 *, int );
+
+void Cuda_DualCG_Preconditioner( storage *, rvec2 *, rvec2, int, rvec2 );
+
+void Cuda_Norm( rvec2 *, int, rvec2 );
+
+void Cuda_Dot( rvec2 *, rvec2 *, rvec2, int );
+
+void Cuda_Vector_Sum_Rvec2( rvec2 *, rvec2 *, rvec2, rvec2 *, int );
+
+void Cuda_RvecCopy_From( real *, rvec2 *, int, int );
+
+void Cuda_RvecCopy_To( rvec2 *, real *, int, int );
+
+void Cuda_Dual_Matvec( sparse_matrix *, rvec2 *, rvec2 *, int , int );
+
+void Cuda_Matvec( sparse_matrix *, real *, real *, int , int );
+
+int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*,
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );
+
+int Cuda_CG( reax_system*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes*, FILE* );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_list.cu b/PG-PuReMD/src/cuda/cuda_list.cu
similarity index 96%
rename from PG-PuReMD/src/cuda_list.cu
rename to PG-PuReMD/src/cuda/cuda_list.cu
index 21d8d091aad8345e3fbd3075cf7ca571d38a589c..9d0626f126eff2b87ea5fef72eb7ca6ec036adc8 100644
--- a/PG-PuReMD/src/cuda_list.cu
+++ b/PG-PuReMD/src/cuda/cuda_list.cu
@@ -19,15 +19,14 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
 #include "cuda_utils.h"
 
 #if defined(PURE_REAX)
-  #include "list.h"
-  #include "tool_box.h"
+  #include "../list.h"
+  #include "../tool_box.h"
 #elif defined(LAMMPS_REAX)
-  #include "reax_list.h"
-  #include "reax_tool_box.h"
+  #include "../reax_list.h"
+  #include "../reax_tool_box.h"
 #endif
 
 
diff --git a/PG-PuReMD/src/cuda_list.h b/PG-PuReMD/src/cuda/cuda_list.h
similarity index 98%
rename from PG-PuReMD/src/cuda_list.h
rename to PG-PuReMD/src/cuda/cuda_list.h
index 0b4e7aa04a9258ce45b156214c360f4cf9b4c673..fe06f4ce92c114842d8cc5f3c65ab7c9b0683661 100644
--- a/PG-PuReMD/src/cuda_list.h
+++ b/PG-PuReMD/src/cuda/cuda_list.h
@@ -22,13 +22,15 @@
 #ifndef __CUDA_LIST_H_
 #define __CUDA_LIST_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void Dev_Make_List( int, int, int, reax_list* );
+
 void Dev_Delete_List( reax_list* );
 
 #ifdef __cplusplus
diff --git a/PG-PuReMD/src/cuda_lookup.cu b/PG-PuReMD/src/cuda/cuda_lookup.cu
similarity index 98%
rename from PG-PuReMD/src/cuda_lookup.cu
rename to PG-PuReMD/src/cuda/cuda_lookup.cu
index 837a3c71fdb7df88283ff10f1f3b8dc068717f9b..01bc8a79e9689f538dfb0bf6421b815998997b27 100644
--- a/PG-PuReMD/src/cuda_lookup.cu
+++ b/PG-PuReMD/src/cuda/cuda_lookup.cu
@@ -1,8 +1,9 @@
 
 #include "cuda_lookup.h"
-#include "index_utils.h"
+
 #include "cuda_utils.h"
-#include "reax_types.h"
+
+#include "../index_utils.h"
 
 
 void copy_LR_table_to_device( reax_system *system, control_params *control,
diff --git a/PG-PuReMD/src/cuda_lookup.h b/PG-PuReMD/src/cuda/cuda_lookup.h
similarity index 56%
rename from PG-PuReMD/src/cuda_lookup.h
rename to PG-PuReMD/src/cuda/cuda_lookup.h
index 88f5cfce17d54995431f356440e6250d40209b01..87026f7deab8464867471e5373bac70018895f0a 100644
--- a/PG-PuReMD/src/cuda_lookup.h
+++ b/PG-PuReMD/src/cuda/cuda_lookup.h
@@ -2,16 +2,18 @@
 #ifndef __CUDA_LOOKUP_H__
 #define __CUDA_LOOKUP_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void copy_LR_table_to_device (reax_system *, control_params *, int *);
+void copy_LR_table_to_device( reax_system *, control_params *, int * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_multi_body.cu
rename to PG-PuReMD/src/cuda/cuda_multi_body.cu
index 09a129634fa31b8a208f745b73b2c95d04889abe..cb7415711a73954a6e55d3636f115d4f84d6b6ec 100644
--- a/PG-PuReMD/src/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -19,12 +19,13 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
 #include "cuda_multi_body.h"
-#include "index_utils.h"
+
 #include "cuda_helpers.h"
 #include "cuda_list.h"
 
+#include "../index_utils.h"
+
 
 CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
diff --git a/PG-PuReMD/src/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_multi_body.h
similarity index 58%
rename from PG-PuReMD/src/cuda_multi_body.h
rename to PG-PuReMD/src/cuda/cuda_multi_body.h
index 332e6f06a480b61f424ec6dbc3ad7fbeb9bba1b2..06014b3ae777aff567e7eda0c5b6bb6584074eef 100644
--- a/PG-PuReMD/src/cuda_multi_body.h
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.h
@@ -22,21 +22,14 @@
 #ifndef __CUDA_MULTI_BODY_H_
 #define __CUDA_MULTI_BODY_H_
 
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Atom_Energy(  reax_atom *,
-                                    global_parameters ,
-                                    single_body_parameters *,
-                                    two_body_parameters *,
-                                    storage ,
-                                    reax_list ,
-                                    int ,
-                                    int ,
-                                    real *,
-                                    real *,
-                                    real *
-                                 );
-
-CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess (reax_list, storage, int );
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *, storage,
+        reax_list, int, int, real *, real *, real *);
+
+CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list, storage, int );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_neighbors.cu
rename to PG-PuReMD/src/cuda/cuda_neighbors.cu
index f9a20ebd811f12a62a25bdad3994a420ad6a53b4..b1f2b85d7530efb12647de27e0fa3d89ca3086b9 100644
--- a/PG-PuReMD/src/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -21,15 +21,13 @@
 
 #include "cuda_neighbors.h"
 
-#include "reax_types.h"
-
 #include "cuda_list.h"
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 
-#include "vector.h"
-#include "index_utils.h"
-#include "tool_box.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
 CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) 
diff --git a/PG-PuReMD/src/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h
similarity index 95%
rename from PG-PuReMD/src/cuda_neighbors.h
rename to PG-PuReMD/src/cuda/cuda_neighbors.h
index f7d7cb15f1b202332505147854fc3cbe826e967c..4d4a9c4ecc4836ad6551e9ae73ff7a0dfef16a7b 100644
--- a/PG-PuReMD/src/cuda_neighbors.h
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.h
@@ -2,14 +2,13 @@
 #ifndef __CUDA_NEIGHBORS_H__
 #define __CUDA_NEIGHBORS_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
 void Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** );
 
 int Cuda_Estimate_Neighbors( reax_system *, int );
@@ -24,9 +23,9 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * );
 
 void Cuda_Init_Three_Body_Indices( int *, int );
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_nonbonded.cu
rename to PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 93bca2dabf096ddeca26e802351f3691c78eae12..25c0b17df74448775e5e37a61db25c151169e056 100644
--- a/PG-PuReMD/src/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -25,10 +25,9 @@
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 #include "cuda_shuffle.h"
-#include "vector.h"
 
-#include "reax_types.h"
-#include "index_utils.h"
+#include "../index_utils.h"
+#include "../vector.h"
 
 
 //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy(    
diff --git a/PG-PuReMD/src/cuda_nonbonded.h b/PG-PuReMD/src/cuda/cuda_nonbonded.h
similarity index 79%
rename from PG-PuReMD/src/cuda_nonbonded.h
rename to PG-PuReMD/src/cuda/cuda_nonbonded.h
index 1c9916bfba8821ec353f6d1571ac24aa711348a4..238d49d748289da4152b70c5a1440cc8ffd611dd 100644
--- a/PG-PuReMD/src/cuda_nonbonded.h
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.h
@@ -19,15 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __NONBONDED_H_
-#define __NONBONDED_H_
+#ifndef __CUDA_NONBONDED_H_
+#define __CUDA_NONBONDED_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 void Cuda_Compute_Polarization_Energy( reax_system *, simulation_data *);
-void Cuda_NonBonded_Energy ( reax_system *, control_params *,
-                             storage *, simulation_data *, reax_list **,
-                             output_controls *, bool );
+
+void Cuda_NonBonded_Energy( reax_system *, control_params *,
+        storage *, simulation_data *, reax_list **,
+        output_controls *, bool );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
similarity index 95%
rename from PG-PuReMD/src/cuda_post_evolve.cu
rename to PG-PuReMD/src/cuda/cuda_post_evolve.cu
index 9a478192be59a203bf1b286b0779c99a6af3b563..828a0e4beff46f591b34b8727357a5ccb4ddf742 100644
--- a/PG-PuReMD/src/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
@@ -1,9 +1,10 @@
 
 #include "cuda_post_evolve.h"
-#include "reax_types.h"
-#include "vector.h"
+
 #include "cuda_utils.h"
 
+#include "../vector.h"
+
 
 CUDA_GLOBAL void ker_post_evolve( reax_atom *my_atoms, 
         simulation_data *data, int n )
diff --git a/PG-PuReMD/src/cuda_post_evolve.h b/PG-PuReMD/src/cuda/cuda_post_evolve.h
similarity index 60%
rename from PG-PuReMD/src/cuda_post_evolve.h
rename to PG-PuReMD/src/cuda/cuda_post_evolve.h
index dcdcd50cadef4db2c0c403c604bb4ecf33acd56b..a1a0571a9a9825f613bd01477f881201c621afa8 100644
--- a/PG-PuReMD/src/cuda_post_evolve.h
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.h
@@ -2,16 +2,18 @@
 #ifndef __CUDA_POST_EVOLVE_H__
 #define __CUDA_POST_EVOLVE_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void post_evolve_velocities (reax_system *, simulation_data *);
+void post_evolve_velocities( reax_system *, simulation_data * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_reduction.cu
rename to PG-PuReMD/src/cuda/cuda_reduction.cu
index 02d800ee0700b357564b72df206bbe24316ba73c..01bd3c8199a76144703738188713190e02b35e9f 100644
--- a/PG-PuReMD/src/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -4,10 +4,10 @@
 #include "cuda_shuffle.h"
 #include "cuda_utils.h"
 
-#include "vector.h"
+#include "../vector.h"
 
-#include "cub/cub/device/device_reduce.cuh"
-#include "cub/cub/device/device_scan.cuh"
+#include "../cub/cub/device/device_reduce.cuh"
+#include "../cub/cub/device/device_scan.cuh"
 
 
 //struct RvecSum
diff --git a/PG-PuReMD/src/cuda_reduction.h b/PG-PuReMD/src/cuda/cuda_reduction.h
similarity index 96%
rename from PG-PuReMD/src/cuda_reduction.h
rename to PG-PuReMD/src/cuda/cuda_reduction.h
index 15ca538f9dff5931a3ccf937118e8b320e99a314..cf9efc5de885852a4c9909154ae01d27bfccc29a 100644
--- a/PG-PuReMD/src/cuda_reduction.h
+++ b/PG-PuReMD/src/cuda/cuda_reduction.h
@@ -2,32 +2,45 @@
 #ifndef __CUDA_REDUCTION_H__
 #define __CUDA_REDUCTION_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 #define  INITIAL  0
 #define  FINAL    1
 
 
 void Cuda_Reduction_Sum( int *, int *, size_t );
+
 void Cuda_Reduction_Sum( real *, real *, size_t );
+
 //void Cuda_Reduction_Sum( rvec *, rvec *, size_t );
+
 void Cuda_Reduction_Max( int *, int *, size_t );
+
 void Cuda_Scan_Excl_Sum( int *, int *, size_t );
 
 CUDA_GLOBAL void k_reduction( const real *, real *, const size_t );
+
 CUDA_GLOBAL void k_reduction_rvec( rvec *, rvec *, size_t );
+
 CUDA_GLOBAL void k_reduction_rvec2( rvec2 *, rvec2 *, size_t );
+
 CUDA_GLOBAL void k_norm( const real *, real *, const size_t, int );
+
 CUDA_GLOBAL void k_dot( const real *, const real *, real *,
         const size_t );
 
 CUDA_GLOBAL void k_vector_sum( real*, real, real*, real,
         real*, int );
+
 CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *, rvec2 *, real, real,
         rvec2 *, int );
+
 CUDA_GLOBAL void k_rvec2_mul( rvec2*, rvec2*, rvec2*, int );
+
 CUDA_GLOBAL void k_vector_mul( real*, real*, real*, int );
+
 CUDA_GLOBAL void k_norm_rvec2( const rvec2 *, rvec2 *, const size_t, int );
+
 CUDA_GLOBAL void k_dot_rvec2( const rvec2 *, rvec2 *, rvec2 *, const size_t );
 
 
diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
similarity index 98%
rename from PG-PuReMD/src/cuda_reset_tools.cu
rename to PG-PuReMD/src/cuda/cuda_reset_tools.cu
index 27cb4580d0c4604de6b83b837b0a822677810e3c..ca435269b8576ce74220ed442e75fb9f9a12819c 100644
--- a/PG-PuReMD/src/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -5,7 +5,7 @@
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 
-#include "reset_tools.h"
+#include "../reset_tools.h"
 
 
 extern "C"
diff --git a/PG-PuReMD/src/cuda_reset_tools.h b/PG-PuReMD/src/cuda/cuda_reset_tools.h
similarity index 94%
rename from PG-PuReMD/src/cuda_reset_tools.h
rename to PG-PuReMD/src/cuda/cuda_reset_tools.h
index f158afec0079fa8eb407bbb59bf328eb79e2afc2..2e90b8eb5d3e758d815de554181d86472560acd3 100644
--- a/PG-PuReMD/src/cuda_reset_tools.h
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.h
@@ -2,13 +2,13 @@
 #ifndef __CUDA_RESET_TOOLS_H__
 #define __CUDA_RESET_TOOLS_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 void Cuda_Reset_Workspace( reax_system *, storage * );
 
 void Cuda_Reset_Atoms( reax_system *, control_params * );
@@ -19,9 +19,9 @@ int  Cuda_Reset_Neighbor_Lists( reax_system *, control_params *,
 void Cuda_Reset( reax_system*, control_params*, simulation_data*,
         storage*, reax_list** );
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_shuffle.h b/PG-PuReMD/src/cuda/cuda_shuffle.h
similarity index 97%
rename from PG-PuReMD/src/cuda_shuffle.h
rename to PG-PuReMD/src/cuda/cuda_shuffle.h
index f8dfddfa47fb9a63cecbcb89748d6de06d748609..0d6872713b61f650523cfa71f97dc9ef17562931 100644
--- a/PG-PuReMD/src/cuda_shuffle.h
+++ b/PG-PuReMD/src/cuda/cuda_shuffle.h
@@ -22,8 +22,7 @@
 #ifndef __CUDA_SHUFFLE_H_
 #define __CUDA_SHUFFLE_H_
 
-#include "reax_types.h"
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
new file mode 100644
index 0000000000000000000000000000000000000000..54957d00fc8d555a26ac699511ed003865482a19
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -0,0 +1,1026 @@
+
+#include "cuda_system_props.h"
+
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+#include "cuda_copy.h"
+#include "cuda_shuffle.h"
+
+#include "../vector.h"
+
+
+CUDA_GLOBAL void center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec xcm[];
+    extern __shared__ rvec vcm[];
+    extern __shared__ rvec amcm[];
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 *(blockDim.x);
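+    /* note: xcm, vcm, and amcm alias the same dynamically sized shared
+     * memory block; vcm_id and amcm_id partition it into three segments */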
+
+    unsigned int index = 0;
+    rvec tmp;
+    real m;
+
+    rvec_MakeZero (xcm [threadIdx.x]);
+    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
+    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
+    rvec_MakeZero (tmp);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
+        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
+    }
+    __syncthreads ();
+
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (xcm [threadIdx.x], xcm[index]);
+            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
+            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
+    }
+}
+
+
+#if defined( __SM_35__)
+CUDA_GLOBAL void center_of_mass_blocks_xcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, size_t n )
+{
+    extern __shared__ rvec my_xcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int index = 0;
+    rvec xcm;
+    real m;
+
+    rvec_MakeZero (xcm);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm , m, atoms [i].x);
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        xcm[0] += shfl( xcm[0], z);
+        xcm[1] += shfl( xcm[1], z);
+        xcm[2] += shfl( xcm[2], z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
+    __syncthreads ();
+
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
+        }
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0))
+        rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
+}
+
+
+CUDA_GLOBAL void center_of_mass_blocks_vcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_vcm, size_t n )
+{
+    extern __shared__ rvec my_vcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    rvec vcm;
+    real m;
+
+    rvec_MakeZero (vcm);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (vcm , m, atoms [i].v);
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        vcm[0] += shfl( vcm[0], z);
+        vcm[1] += shfl( vcm[1], z);
+        vcm[2] += shfl( vcm[2], z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
+    __syncthreads ();
+
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
+        }
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0))
+        rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
+}
+
+
+CUDA_GLOBAL void center_of_mass_blocks_amcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec my_amcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    rvec amcm;
+    real m;
+    rvec tmp;
+
+    rvec_MakeZero (amcm);
+    rvec_MakeZero( tmp );
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm, m, tmp);
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        amcm[0] += shfl( amcm[0], z);
+        amcm[1] += shfl( amcm[1], z);
+        amcm[2] += shfl( amcm[2], z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
+    __syncthreads ();
+
+
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
+        }
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
+    }
+}
+#endif
+
+
+CUDA_GLOBAL void center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm, 
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec sh_xcm[];
+    extern __shared__ rvec sh_vcm[];
+    extern __shared__ rvec sh_amcm[];
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 * (blockDim.x);
+
+    unsigned int index = 0;
+    rvec t_xcm, t_vcm, t_amcm;
+
+    rvec_MakeZero (t_xcm);
+    rvec_MakeZero (t_vcm);
+    rvec_MakeZero (t_amcm);
+
+    if (i < n){
+        rvec_Copy ( t_xcm, xcm[threadIdx.x]);
+        rvec_Copy ( t_vcm, vcm[threadIdx.x]);
+        rvec_Copy ( t_amcm, amcm[threadIdx.x]);
+    }
+
+    rvec_Copy (sh_xcm[xcm_id], t_xcm);
+    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
+    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
+
+    __syncthreads ();
+
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if (threadIdx.x < offset) {
+            index = threadIdx.x + offset;
+            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
+            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
+            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0){
+        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass( single_body_parameters *sbp, 
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
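+    /* the six extern __shared__ arrays alias one shared memory block,
+     * partitioned into blockDim.x-sized segments via the *_i offsets */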
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx[ xx_i ] = diff[0] * diff[0] * m;
+        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
+        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
+        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
+        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
+        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            xx[ threadIdx.x ] += xx[ index ];
+            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
+        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
+        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
+        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
+        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass( real *input, real *output, size_t n )
+{
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n)
+    {
+        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
+        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
+        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
+        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
+        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
+        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset )
+        {
+            index = threadIdx.x + offset;
+            xx [ threadIdx.x ] += xx [ index ];
+            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0)
+    {
+        output[0] = xx[0];
+        output[1] = xy[xy_i];
+        output[2] = xz[xz_i];
+        output[3] = yy[yy_i];
+        output[4] = yz[yz_i];
+        output[5] = zz[zz_i];
+    }
+}
+
+
+#if defined( __SM_35__)
+CUDA_GLOBAL void compute_center_mass_xx_xy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xx[];
+    extern __shared__ real my_results_xy[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xx = 0;
+    real xy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx = diff[0] * diff[0] * m;
+        xy = diff[0] * diff[1] * m;
+    }
+    __syncthreads ();
+
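+    /* warp-level reduction of the per-thread partial sums via shuffle;
+     * one value per warp is then combined through shared memory below */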
+    for (int z = 16; z >= 1; z /= 2){
+        xx += shfl( xx, z);
+        xy += shfl( xy, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_xx[threadIdx.x >> 5] = xx;    
+        my_results_xy[threadIdx.x >> 5] = xy;    
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
+            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass_xz_yy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xz[];
+    extern __shared__ real my_results_yy[];
+
+    unsigned int yy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xz = 0;
+    real yy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xz = diff[0] * diff[2] * m;
+        yy = diff[1] * diff[1] * m;
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        xz += shfl( xz, z);
+        yy += shfl( yy, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_xz[threadIdx.x >> 5] = xz;    
+        my_results_yy[threadIdx.x >> 5] = yy;    
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
+            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
+        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass_yz_zz( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_yz[];
+    extern __shared__ real my_results_zz[];
+
+    unsigned int zz_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real yz = 0;
+    real zz = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n)
+    {
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        yz = diff[1] * diff[2] * m;
+        zz = diff[2] * diff[2] * m;
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        yz += shfl( yz, z);
+        zz += shfl( zz, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_yz[threadIdx.x >> 5] = yz;    
+        my_results_zz[threadIdx.x >> 5] = zz;    
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
+            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
+        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
+    }
+}
+#endif
+
+
+CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, 
+        real *block_results, int n )
+{
+#if defined(__SM_35__)
+    extern __shared__ real my_sbp[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real sdata = 0;
+
+    if (i < n)
+    {
+        sdata = sbp[ my_atoms[i].type ].mass;
+    }
+    __syncthreads( );
+
+    for(int z = 16; z >=1; z/=2)
+    {
+        sdata += shfl( sdata, z);
+    }
+
+    if (threadIdx.x % 32 == 0)
+    {
+        my_sbp[threadIdx.x >> 5] = sdata;
+    }
+
+    __syncthreads( );
+
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if(threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = my_sbp[0];
+    }
+
+#else
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if (i < n)
+    {
+        x = sbp[ my_atoms[i].type ].mass;
+    }
+
+    sdata[ threadIdx.x ] = x;
+    __syncthreads( );
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[ blockIdx.x] = sdata [0];
+    }
+
+#endif
+}
+
+
+extern "C" void dev_compute_total_mass( reax_system *system, real *local_val )
+{
+    real *block_mass = (real *) scratch;
+    cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" );
+
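+    /* two-stage reduction: one partial mass per block, then a single-block
+     * pass that leaves the total at block_mass + BLOCKS_POW_2 */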
+    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), 
+            cudaMemcpyDeviceToHost, "total_mass:tmp");
+}
+
+
+CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, 
+        real *block_results, int n )
+{
+#if defined(__SM_35__)
+    extern __shared__ real my_sbpdot[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real sdata = 0;
+    rvec p;
+
+    if (i < n)
+    {
+        sdata = sbp[ my_atoms[i].type ].mass;
+        rvec_Scale( p, sdata, my_atoms[ i ].v );
+        sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
+    }
+
+    __syncthreads( );
+
+    for(int z = 16; z >=1; z/=2)
+    {
+        sdata += shfl( sdata, z);
+    }
+
+    if (threadIdx.x % 32 == 0)
+    {
+        my_sbpdot[threadIdx.x >> 5] = sdata;
+    }
+
+    __syncthreads( );
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = my_sbpdot[0];
+    }
+
+#else
+    extern __shared__ real sdata [];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real    m = 0;
+    rvec p;
+
+    if (i < n)
+    {
+        m = sbp[ my_atoms[i].type ].mass;
+        rvec_Scale( p, m, my_atoms[ i ].v );
+        m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
+    }
+
+    sdata[ threadIdx.x ] = m;
+    __syncthreads( );
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = sdata[0];
+    }
+#endif
+}
+
+extern "C" void dev_compute_kinetic_energy( reax_system *system,
+        simulation_data *data, real *local_val )
+{
+    real *block_energy = (real *) scratch;
+    cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" );
+
+    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( local_val, block_energy + BLOCKS_POW_2,
+            sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
+    //copy_host_device( local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
+    //        sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
+    //copy_device( block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
+    //        sizeof(real), "kinetic_energy" );
+}
+
+
+extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, 
+        rvec vcm, rvec amcm )
+{
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    rvec *r_scratch = (rvec *)scratch;
+
+#if defined( __SM_35__)
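+    /* on SM 3.5+ each moment (xcm, vcm, amcm) is reduced by its own
+     * shuffle-based kernel; the fallback path below handles all three in a
+     * single shared-memory kernel */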
+    // xcm
+    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    l_xcm = r_scratch;
+    
+    center_of_mass_blocks_xcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+            (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( xcm, l_xcm + BLOCKS_POW_2,
+            sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
+    
+    // vcm
+    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    l_vcm = r_scratch;
+    
+    center_of_mass_blocks_vcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+        (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec),
+        cudaMemcpyDeviceToHost, "momentum:vcm" );
+    
+    // amcm
+    cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+    l_amcm = r_scratch;
+    
+    center_of_mass_blocks_amcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+        (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec),
+        cudaMemcpyDeviceToHost, "momemtum:amcm" );
+
+#else
+    cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    
+    l_xcm = r_scratch;
+    l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
+    l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
+    
+    center_of_mass_blocks <<< BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n );
+    cudaThreadSynchronize( ); 
+    cudaCheckError( ); 
+    
+    center_of_mass <<< 1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
+        ( l_xcm, l_vcm, l_amcm, l_xcm + BLOCKS_POW_2, l_vcm + BLOCKS_POW_2,
+          l_amcm + BLOCKS_POW_2, BLOCKS_POW_2 );
+    cudaThreadSynchronize( ); 
+    cudaCheckError( );
+    
+    copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
+    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
+    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
+#endif
+}
+
+
+extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm )
+{
+#if defined(__SM_35__)
+    real *partial_results = (real *) scratch;
+    cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" );
+
+    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2,
+        sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" );
+
+#else
+    real *partial_results = (real *) scratch;
+    //real *local_results;
+
+    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
+    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
+
+    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
+            sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
+#endif
+}
+
+
+extern "C" void dev_sync_simulation_data( simulation_data *data )
+{
+    Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data );
+}
+
+
+void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
+        MPI_Comm comm )
+{
+    int i;
+    rvec p;
+    real m;
+
+    data->my_en.e_kin = 0.0;
+
+    dev_compute_kinetic_energy( system, data, &data->my_en.e_kin );
+
+    MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
+            1, MPI_DOUBLE, MPI_SUM, comm );
+
+    data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
+
+    // keep T from being exactly zero, which might cause a floating-point exception
+    if ( FABS(data->therm.T) < ALMOST_ZERO )
+    {
+        data->therm.T = ALMOST_ZERO;
+    }
+}
+
+
+void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
+        MPI_Comm comm  )
+{
+    int  i;
+    real tmp;
+
+    //compute local total mass of the system
+    dev_compute_total_mass( system, &tmp );
+
+    MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
+
+    data->inv_M = 1. / data->M;
+}
+
+
+void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
+        mpi_datatypes *mpi_data, MPI_Comm comm )
+{
+    int i;
+    real m, det; //xx, xy, xz, yy, yz, zz;
+    real tmp_mat[6], tot_mat[6];
+    rvec my_xcm, my_vcm, my_amcm, my_avcm;
+    rvec tvec, diff;
+    rtensor mat, inv;
+
+    rvec_MakeZero( my_xcm );  // position of CoM
+    rvec_MakeZero( my_vcm );  // velocity of CoM
+    rvec_MakeZero( my_amcm ); // angular momentum of CoM
+    rvec_MakeZero( my_avcm ); // angular velocity of CoM
+
+    /* Compute the position, vel. and ang. momentum about the centre of mass */
+    dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm );
+
+    MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+    MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+    MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+
+    rvec_Scale( data->xcm, data->inv_M, data->xcm );
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec );
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+
+    /* Calculate and then invert the inertial tensor */
+    for ( i = 0; i < 6; ++i )
+    {
+        tmp_mat[i] = 0;
+    }
+
+    dev_compute_inertial_tensor( system, tmp_mat, my_xcm );
+
+    MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
+
+    if ( system->my_rank == MASTER_NODE )
+    {
+        mat[0][0] = tot_mat[3] + tot_mat[5];  // yy + zz;
+        mat[0][1] = mat[1][0] = -tot_mat[1];  // -xy;
+        mat[0][2] = mat[2][0] = -tot_mat[2];  // -xz;
+        mat[1][1] = tot_mat[0] + tot_mat[5];  // xx + zz;
+        mat[2][1] = mat[1][2] = -tot_mat[4];  // -yz;
+        mat[2][2] = tot_mat[0] + tot_mat[3];  // xx + yy;
+
+        /* invert the inertial tensor */
+        det = ( mat[0][0] * mat[1][1] * mat[2][2] +
+                mat[0][1] * mat[1][2] * mat[2][0] +
+                mat[0][2] * mat[1][0] * mat[2][1] ) -
+              ( mat[0][0] * mat[1][2] * mat[2][1] +
+                mat[0][1] * mat[1][0] * mat[2][2] +
+                mat[0][2] * mat[1][1] * mat[2][0] );
+
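+        /* inv below is the adjugate (transposed cofactor matrix); the
+         * subsequent scaling by 1. / det yields the inverse (the tensor is
+         * zeroed instead if det is near zero) */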
+        inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+        inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+        inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+        inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+        inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+        inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+        inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+        inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+        inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+        if ( det > ALMOST_ZERO )
+        {
+            rtensor_Scale( inv, 1. / det, inv );
+        }
+        else
+        {
+            rtensor_MakeZero( inv );
+        }
+
+        /* Compute the angular velocity about the centre of mass */
+        rtensor_MatVec( data->avcm, inv, data->amcm );
+    }
+
+    MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm );
+
+    /* Compute the rotational energy */
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+
+#if defined(DEBUG)
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
+             data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
+             data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
+             data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( stderr, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2],
+       mat[1][0], mat[1][1], mat[1][2],
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( stderr, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2],
+       inv[1][0], inv[1][1], inv[1][2],
+       inv[2][0], inv[2][1], inv[2][2] ); */
+    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
+             data->avcm[0], data->avcm[1], data->avcm[2] );
+#endif
+}
+
+
diff --git a/PG-PuReMD/src/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h
similarity index 65%
rename from PG-PuReMD/src/cuda_system_props.h
rename to PG-PuReMD/src/cuda/cuda_system_props.h
index ce6fccc13dfc4e8da29d676212a1a4a4db04fff3..66f620b301deaf3d5bf0d7acac48e28fa45f6901 100644
--- a/PG-PuReMD/src/cuda_system_props.h
+++ b/PG-PuReMD/src/cuda/cuda_system_props.h
@@ -2,24 +2,35 @@
 #ifndef __CUDA_SYSTEM_PROPS_H__
 #define __CUDA_SYSTEM_PROPS_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 void dev_compute_total_mass( reax_system *, real * );
+
 void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * );
+
 void dev_compute_momentum( reax_system *, rvec, rvec, rvec );
+
 void dev_compute_inertial_tensor( reax_system *, real *, rvec my_xcm );
 
 void dev_sync_simulation_data( simulation_data * );
+
 //void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * );
 
+void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
+
+void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
+
+void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
+        mpi_datatypes*, MPI_Comm );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_torsion_angles.cu
rename to PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index e70c378b664d7dc86ce38861f5a3f160975796ae..47c087d283b1422acd81886dac218ccd91d372a2 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -19,13 +19,14 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-#include "index_utils.h"
 #include "cuda_torsion_angles.h"
-#include "vector.h"
+
 #include "cuda_list.h"
 #include "cuda_helpers.h"
 
+#include "../index_utils.h"
+#include "../vector.h"
+
 #define MIN_SINE 1e-10
 
 
diff --git a/PG-PuReMD/src/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
similarity index 57%
rename from PG-PuReMD/src/cuda_torsion_angles.h
rename to PG-PuReMD/src/cuda/cuda_torsion_angles.h
index 235e91b0dfe3bc634491bdf9a484a8fe851c2013..a7d9c3cb3f21203e02d7bdc52a1c6bb8c6887134 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
@@ -19,24 +19,18 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __TORSION_ANGLES_H_
-#define __TORSION_ANGLES_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *,
-                                      global_parameters ,
-                                      four_body_header *,
-                                      control_params *,
-                                      reax_list , reax_list ,
-                                      storage ,
-                                      int , int ,
-                                      real *, real *,
-                                      rvec *);
-
-CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *,
-        storage ,
-        reax_list , int );
+#ifndef __CUDA_TORSION_ANGLES_H_
+#define __CUDA_TORSION_ANGLES_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *, global_parameters,
+        four_body_header *, control_params *, reax_list, reax_list,
+        storage, int, int, real *, real *, rvec * );
+
+CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu
similarity index 88%
rename from PG-PuReMD/src/cuda_utils.cu
rename to PG-PuReMD/src/cuda/cuda_utils.cu
index 5899a1ecc44b69341725b747e17599c7da26f6e4..7e1757bc7c58046bba70ec8a9917ec59c2e8d8b7 100644
--- a/PG-PuReMD/src/cuda_utils.cu
+++ b/PG-PuReMD/src/cuda/cuda_utils.cu
@@ -149,3 +149,20 @@ extern "C" void print_device_mem_usage( )
             total, (long long int)total/(1024.0*1024.0),
             free, (long long int)free/(1024.0*1024.0) );
 }
+
+
+extern "C" void init_blocks( reax_system *system )
+{
+    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
+    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
+
+    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
+    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
+
+    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
+
+#if defined(__CUDA_DEBUG_LOG__)
+    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
+            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
+#endif
+}
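The new init_blocks above centralizes the launch geometry: BLOCKS/BLOCK_SIZE cover the n local atoms, BLOCKS_N covers the N local-plus-ghost atoms, the *_POW_2 variants feed the single-block reduction passes, and MATVEC_BLOCKS sizes the one-warp-per-row matvec kernels. The helpers it calls are declared but not defined in this hunk; below is a plausible sketch of what they compute (ceiling division, next power of two, warps-per-row scaling), with sketch_ prefixes to mark that these are illustrations rather than the actual implementations.

/* Assumed constants DEF_BLOCK_SIZE, MATVEC_BLOCK_SIZE and
 * MATVEC_KER_THREADS_PER_ROW come from the surrounding code base. */
static void sketch_compute_blocks( int *blocks, int *block_size, int n )
{
    *block_size = DEF_BLOCK_SIZE;
    /* ceiling division: enough blocks to cover all n items */
    *blocks = (n + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;
}

static void sketch_compute_nearest_pow_2( int blocks, int *pow_2 )
{
    /* smallest power of two >= blocks, used by the final reduction pass */
    *pow_2 = 1;
    while ( *pow_2 < blocks )
    {
        *pow_2 <<= 1;
    }
}

static void sketch_compute_matvec_blocks( int *blocks, int N )
{
    /* one warp (MATVEC_KER_THREADS_PER_ROW threads) per matrix row */
    *blocks = (N * MATVEC_KER_THREADS_PER_ROW + MATVEC_BLOCK_SIZE - 1)
        / MATVEC_BLOCK_SIZE;
}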
diff --git a/PG-PuReMD/src/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
similarity index 80%
rename from PG-PuReMD/src/cuda_utils.h
rename to PG-PuReMD/src/cuda/cuda_utils.h
index 3d63d5e38cf2228feb4be97e4f2d2f090f5683f6..bfc4256d1408e3a2947714d3adde62899b144f55 100644
--- a/PG-PuReMD/src/cuda_utils.h
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -1,7 +1,7 @@
 #ifndef __CUDA_UTILS_H_
 #define __CUDA_UTILS_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
@@ -9,22 +9,33 @@ extern "C"  {
 #endif
 
 void cuda_malloc( void **, size_t, int, const char * );
+
 void cuda_free( void *, const char * );
+
 void cuda_memset( void *, int , size_t , const char * );
+
 void copy_host_device( void *, void *, size_t, enum cudaMemcpyKind, const char * );
+
 void copy_device( void *, void *, size_t, const char * );
 
 void compute_blocks( int *, int *, int );
+
 void compute_matvec_blocks( int *, int );
+
 void compute_nearest_pow_2( int, int * );
 
+void init_blocks( reax_system * );
+
 void print_device_mem_usage( );
 
+
 #ifdef __cplusplus
 #define cudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
 static inline void __cudaCheckError( const char *file, const int line )
 {
-    cudaError err = cudaGetLastError();
+    cudaError err;
+
+    err = cudaGetLastError();
     if ( cudaSuccess != err )
     {
         fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
@@ -32,19 +43,22 @@ static inline void __cudaCheckError( const char *file, const int line )
         exit( RUNTIME_ERROR );
     }
 
+#if defined(DEBUG)
     /* More careful checking. However, this will affect performance. */
-//    err = cudaDeviceSynchronize();
-//    if( cudaSuccess != err )
-//    {
-//       exit( -1 );
-//    }
+    err = cudaDeviceSynchronize( );
+    if ( cudaSuccess != err )
+    {
+       exit( RUNTIME_ERROR );
+    }
+#endif
 
     return;
 }
 #endif
 
-#endif
-
 #ifdef __cplusplus
 }
 #endif
+
+
+#endif
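The reworked cudaCheckError above keeps the cheap cudaGetLastError check in release builds and re-enables the costly cudaDeviceSynchronize verification only when DEBUG is defined. A typical call site, following the launch pattern used throughout the .cu files in this diff (k_example, d_data and n are placeholders, not part of the code base):

/* launch, wait for completion, then check for errors; DEBUG builds
 * additionally synchronize and re-check inside cudaCheckError( ) */
k_example <<< BLOCKS, BLOCK_SIZE >>> ( d_data, n );
cudaThreadSynchronize( );
cudaCheckError( );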
diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_valence_angles.cu
rename to PG-PuReMD/src/cuda/cuda_valence_angles.cu
index d778c3b2fb6d7c5201e4151fc82186f2900aeed7..21b8d2c8b26ce6d1d05bf0666ca84af65da7cbd6 100644
--- a/PG-PuReMD/src/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -21,9 +21,10 @@
 
 #include "cuda_valence_angles.h"
 
-#include "index_utils.h"
 #include "cuda_list.h"
-#include "vector.h"
+
+#include "../index_utils.h"
+#include "../vector.h"
 
 
 /* Compute 3-body interactions, in which the main role is played by
diff --git a/PG-PuReMD/src/cuda_valence_angles.h b/PG-PuReMD/src/cuda/cuda_valence_angles.h
similarity index 98%
rename from PG-PuReMD/src/cuda_valence_angles.h
rename to PG-PuReMD/src/cuda/cuda_valence_angles.h
index 6510959721418fe5c54a6c2d71ff24d554955c56..d8abac25aa5b3198faf59784bff5626a8797a32d 100644
--- a/PG-PuReMD/src/cuda_valence_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.h
@@ -22,8 +22,10 @@
 #ifndef __CUDA_VALENCE_ANGLES_H_
 #define __CUDA_VALENCE_ANGLES_H_
 
-#include "reax_types.h"
-#include "vector.h"
+#include "../reax_types.h"
+
+#include "../vector.h"
+
 
 CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *, global_parameters,
         single_body_parameters *, three_body_header *, control_params *,
diff --git a/PG-PuReMD/src/cuda_validation.cu b/PG-PuReMD/src/cuda/cuda_validation.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_validation.cu
rename to PG-PuReMD/src/cuda/cuda_validation.cu
index 34a424301f710d18ae5e64399622ab52fbc50eb7..34ebf6e5e3c04aa037fa8f9919b1b5e867fcf783 100644
--- a/PG-PuReMD/src/cuda_validation.cu
+++ b/PG-PuReMD/src/cuda/cuda_validation.cu
@@ -1,13 +1,12 @@
-#include "reax_types.h"
 
 #include "cuda_validation.h"
 
 #include "cuda_utils.h"
 
-#include "index_utils.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+#include "../index_utils.h"
+#include "../list.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
 bool check_zero( real p1, real p2 )
diff --git a/PG-PuReMD/src/cuda_validation.h b/PG-PuReMD/src/cuda/cuda_validation.h
similarity index 97%
rename from PG-PuReMD/src/cuda_validation.h
rename to PG-PuReMD/src/cuda/cuda_validation.h
index 42eb37a40b8b78dcdf5e03a7e0870168de832630..7faa773ba5567bdc9c8d6bc24caee06ff2f9be44 100644
--- a/PG-PuReMD/src/cuda_validation.h
+++ b/PG-PuReMD/src/cuda/cuda_validation.h
@@ -3,50 +3,60 @@
 #ifndef __CUDA_VALIDATION_H__
 #define __CUDA_VALIDATION_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
 int validate_neighbors( reax_system *, reax_list **lists );
+
 int validate_sym_dbond_indices( reax_system *system,
         storage *workspace, reax_list **lists );
 
 int validate_bonds( reax_system *, storage *, reax_list ** );
+
 int validate_hbonds( reax_system *, storage *, reax_list ** );
+
 int validate_sparse_matrix( reax_system *, storage * );
 
 int validate_grid( reax_system * );
+
 int validate_workspace( reax_system *, storage * );
 
 int validate_data( reax_system *, simulation_data * );
+
 int validate_three_bodies( reax_system *, storage *,
         reax_list ** );
+
 int validate_atoms( reax_system *, reax_list ** );
 
 int print_sparse_matrix( sparse_matrix *H );
+
 int print_sparse_matrix_host( sparse_matrix *H );
 
 int print_host_rvec2( rvec2 *, int );
+
 int print_device_rvec2( rvec2 *, int );
 
 int print_host_array( real *, int );
+
 int print_device_array( real *, int );
 
 void compare_rvec2( rvec2 *host, rvec2 *device, int N,
         const char *msg );
+
 void compare_array( real *host, real *device, int N,
         const char *msg );
 
 int check_zeros_host( rvec2 *host, int n, const char * );
-int check_zeros_device( rvec2 *device, int n, const char * );
-
 
+int check_zeros_device( rvec2 *device, int n, const char * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda_hydrogen_bonds.h
deleted file mode 100644
index 7e1644f19c82d520fd34c036b9d7a2906e97b80f..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __HBONDS_H_
-#define __HBONDS_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *,
-        storage ,
-        reax_list );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL (  reax_atom *,
-        storage ,
-        reax_list, int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess (  reax_atom *,
-        storage ,
-        reax_list , int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
-                                      single_body_parameters *,
-                                      hbond_parameters *,
-                                      global_parameters ,
-                                      control_params *,
-                                      storage ,
-                                      reax_list ,
-                                      reax_list ,
-                                      int ,
-                                      int ,
-                                      real *,
-                                      rvec *);
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
-        single_body_parameters *,
-        hbond_parameters *,
-        global_parameters ,
-        control_params *,
-        storage ,
-        reax_list ,
-        reax_list ,
-        int ,
-        int ,
-        real *,
-        rvec *);
-
-#endif
diff --git a/PG-PuReMD/src/cuda_init_md.cu b/PG-PuReMD/src/cuda_init_md.cu
deleted file mode 100644
index 044e8e73bc068faa6359aa4ab00b499066440c69..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_init_md.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-
-#include "cuda_init_md.h"
-
-#include "reax_types.h"
-#include "cuda_utils.h"
-
-#include "tool_box.h"
-
-void Cuda_Init_ScratchArea( )
-{
-    cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" );
-
-    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" );
-}
diff --git a/PG-PuReMD/src/cuda_init_md.h b/PG-PuReMD/src/cuda_init_md.h
deleted file mode 100644
index cf7b52490322ff3e5e6dce21669ec7c290020a54..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_init_md.h
+++ /dev/null
@@ -1,15 +0,0 @@
-
-#ifndef __CUDA_INIT_MD_H__
-#define __CUDA_INIT_MD_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void Cuda_Init_ScratchArea( );
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_integrate.cu b/PG-PuReMD/src/cuda_integrate.cu
deleted file mode 100644
index 936c68163c86f964eb90fc4145ba8bfe698a4c0a..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_integrate.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-
-#include "cuda_integrate.h"
-#include "reax_types.h"
-
-#include "vector.h"
-#include "cuda_utils.h"
-
-
-CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms, 
-        single_body_parameters *sbp, real dt, int n )
-{
-    real inv_m;
-    rvec dx;
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    /* velocity verlet, 1st part */
-    atom = &(my_atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute x(t + dt) */
-    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-    rvec_Add( atom->x, dx );
-    /* Compute v(t + dt/2) */
-    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-}
-
-
-void bNVT_update_velocity_part1( reax_system *system, real dt )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms, 
-        single_body_parameters *sbp, real dt, int n )
-{
-    reax_atom *atom;
-    real inv_m;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    /* velocity verlet, 2nd part */
-    atom = &(my_atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute v(t + dt) */
-    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-}
-
-
-void bNVT_update_velocity_part2( reax_system *system, real dt )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n )
-{
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    /* Scale velocities and positions at t+dt */
-    atom = &(my_atoms[i]);
-    rvec_Scale( atom->v, lambda, atom->v );
-}
-
-
-void bNVT_scale_velocities( reax_system *system, real lambda )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, lambda, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
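The deleted bNVT_* routines (this section only shows the removal; the rest of the patch relocates CUDA sources under src/cuda/, so that is presumably where they land) implement the two halves of a velocity Verlet step plus a uniform velocity rescaling. Reading the kernels above, with m the atomic mass and f the force array:

/* a(t)        = -F_CONV * f(t) / m                force-to-acceleration
 * x(t + dt)   = x(t) + dt * v(t) + (dt^2 / 2) * a(t)     k_update_velocity_1
 * v(t + dt/2) = v(t) + (dt / 2) * a(t)                   k_update_velocity_1
 * v(t + dt)   = v(t + dt/2) + (dt / 2) * a(t + dt)       k_update_velocity_2
 * v          <- lambda * v                               k_scale_velocities */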
diff --git a/PG-PuReMD/src/cuda_lin_alg.cu b/PG-PuReMD/src/cuda_lin_alg.cu
deleted file mode 100644
index 4f37d577489d954cbd0fea1fea607aa0d2951faf..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_lin_alg.cu
+++ /dev/null
@@ -1,624 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_lin_alg.h"
-
-#include "reax_types.h"
-
-#include "cuda_shuffle.h"
-#include "cuda_utils.h"
-#include "cuda_reduction.h"
-
-
-//one thread per row
-CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results,
-        int rows )
-{
-    int i, col;
-    real results_row;
-    real val;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= rows )
-    {
-        return;
-    }
-
-    results_row = 0;
-
-    for (int c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-
-        results_row += val * vec[col];
-    }
-
-    results[i] = results_row;
-}
-
-
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
-CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results,
-        int num_rows )
-{
-#if defined(__SM_35__)
-    real vals;
-    int x;
-#else
-    extern __shared__ real vals[];
-#endif
-    int jj;
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    int row = warp_id;
-    
-#if defined(__SM_35__)
-    vals = 0;
-#else
-    vals[threadIdx.x] = 0;
-#endif
-
-    if (row < num_rows)
-    {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        // compute running sum per thread
-        for ( jj = row_start + lane; jj < row_end;
-                jj += MATVEC_KER_THREADS_PER_ROW )
-#if defined(__SM_35__)
-        {
-            vals += H.entries[jj].val * vec[ H.entries[jj].j ];
-        }
-    }
-#else
-        {
-            vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ];
-        }
-    }
-
-    __syncthreads( );
-#endif
-
-    // parallel reduction in shared memory
-    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-#if defined(__SM_35__)
-    for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
-    {
-        vals += shfl( vals, x );
-    }
-
-    if (lane == 0 && row < num_rows)
-    {
-        results[row] = vals;
-    }
-#else
-    if (lane < 16)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 16];
-    }
-    __syncthreads( );
-    if (lane < 8)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 8];
-    }
-    __syncthreads( );
-    if (lane < 4)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 4];
-    }
-    __syncthreads( );
-    if (lane < 2)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 2];
-    }
-    __syncthreads( );
-    if (lane < 1)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 1];
-    }
-    __syncthreads( );
-
-    // first thread writes the result
-    if (lane == 0 && row < num_rows)
-    {
-        results[row] = vals[threadIdx.x];
-    }
-#endif
-}
-
-
-//one thread per row
-CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results,
-        int rows )
-{
-    int i, c, col;
-    rvec2 results_row;
-    real val;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= rows)
-    {
-        return;
-    }
-
-    results_row[0] = 0.0;
-    results_row[1] = 0.0;
-
-    for (c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-
-        results_row[0] += val * vec [col][0];
-        results_row[1] += val * vec [col][1];
-    }
-
-    results[i][0] = results_row[0];
-    results[i][1] = results_row[1];
-}
-
-
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
-CUDA_GLOBAL void  k_dual_matvec_csr( sparse_matrix H, rvec2 *vec,
-        rvec2 *results, int num_rows )
-{
-#if defined(__SM_35__)
-    rvec2 rvals;
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    int row = warp_id;
-
-    rvals[0] = 0;
-    rvals[1] = 0;
-
-    if (row < num_rows)
-    {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
-        {
-            rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-            rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-        }
-    }
-
-    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2)
-    {
-        rvals[0] += shfl( rvals[0], s);
-        rvals[1] += shfl( rvals[1], s);
-    }
-
-    if (lane == 0 && row < num_rows)
-    {
-        results[row][0] = rvals[0];
-        results[row][1] = rvals[1];
-    }
-
-#else
-    extern __shared__ rvec2 rvals[];
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / 32;
-    int lane = thread_id & (32 - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    //int row = warp_id;
-    int row = warp_id;
-
-    rvals[threadIdx.x][0] = 0;
-    rvals[threadIdx.x][1] = 0;
-
-    if (row < num_rows)
-    {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        // compute running sum per thread
-        for(int jj = row_start + lane; jj < row_end; jj += 32)
-        {
-            rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-            rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-        }
-    }
-
-    __syncthreads( );
-
-    // parallel reduction in shared memory
-    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-    if (lane < 16)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; 
-    }
-    __syncthreads( );
-    if (lane < 8)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; 
-    }
-    __syncthreads( );
-    if (lane < 4)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; 
-    }
-    __syncthreads( );
-    if (lane < 2)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; 
-    }
-    __syncthreads( );
-    if (lane < 1)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; 
-    }
-    __syncthreads( );
-
-    // first thread writes the result
-    if (lane == 0 && row < num_rows)
-    {
-        results[row][0] = rvals[threadIdx.x][0];
-        results[row][1] = rvals[threadIdx.x][1];
-    }
-
-#endif
-}
-
-
-void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count )
-{
-    //res = ax + by
-    //use the cublas here
-    int blocks;
-
-    blocks = (count / DEF_BLOCK_SIZE) + 
-        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
-        ( res, a, x, b, y, count );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
-{
-    //res = a*b - vector multiplication
-    //use the cublas here.
-    int blocks;
-
-    blocks = (count / DEF_BLOCK_SIZE) + 
-        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
-        ( res, a, b, count );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
-{
-    storage *workspace;
-    int j;
-   
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( j >= n )
-    {
-        return;
-    }
-
-    workspace = &( p_workspace );
-
-    //for( j = 0; j < system->n; ++j ) {
-    // residual 
-    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
-    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
-
-    // apply diagonal pre-conditioner
-    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
-    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
-    //}
-}
-
-
-void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, b, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, 
-        real alpha_0, real alpha_1, int n, rvec2 *my_dot )
-{
-    storage *workspace;
-    rvec2 alpha;
-    int j;
-   
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( j >= n )
-    {
-        return;
-    }
-
-    workspace = &( p_workspace );
-    alpha[0] = alpha_0;
-    alpha[1] = alpha_1;
-    my_dot[j][0] = my_dot[j][1] = 0.0;
-
-    //for( j = 0; j < system->n; ++j ) {
-    // update x 
-    x[j][0] += alpha[0] * workspace->d2[j][0];
-    x[j][1] += alpha[1] * workspace->d2[j][1];      
-
-    // update residual 
-    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
-    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
-
-    // apply diagonal pre-conditioner 
-    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-
-    // dot product: r.p 
-    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
-    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
-    //}
-}
-
-
-void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
-        int n, rvec2 result )
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    cuda_memset( tmp, 0, sizeof(rvec2) * ( 2 * n + 1),
-            "cuda_dualcg_preconditioner" );
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, x, alpha[0], alpha[1], n, tmp);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    //Reduction to calculate my_dot
-    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        ( tmp, tmp + n, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
-        ( tmp + n, tmp + 2*n, blocks);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
-            cudaMemcpyDeviceToHost, "my_dot" );
-}
-
-
-void Cuda_Norm( rvec2 *arr, int n, rvec2 result )
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        (arr, tmp, n, INITIAL);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
-        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
-}
-
-
-void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n )
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        ( a, b, tmp, n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> 
-    //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * BLOCKS_POW_2 >>> 
-        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_dot" );
-}
-
-
-void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
-        ( x, a, b[0], b[1], c, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    dst[i] = src[i][index];
-}
-
-
-void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( dst, src, index, n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    dst[i][index] = src[i];
-}
-
-
-void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( dst, src, index, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
-    cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );
-
-    //One thread per row implementation
-    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
-
-    //One warp per row implementation
-#if defined(__SM_35__)
-    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
-#else
-    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                      sizeof(rvec2) * MATVEC_BLOCK_SIZE >>>
-#endif
-            ( *H, a, b, n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
-    cuda_memset( b, 0, sizeof(real) * size, "dual_matvec:result" );
-
-    //one thread per row implementation
-    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
-
-#if defined(__SM_35__)
-    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
-#else
-    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                 sizeof(real) * MATVEC_BLOCK_SIZE>>>
-#endif
-                     (*H, a, b, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
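The deleted k_matvec_csr and k_dual_matvec_csr assign one 32-thread warp per CSR row and reduce the per-lane partial sums either with the shfl wrapper from cuda_shuffle.h (__SM_35__ path) or through shared memory. Below is a minimal standalone sketch of the same idea using the standard __shfl_down_sync intrinsic (CUDA 9 or later); sparse_matrix, real, and the start/end/entries layout are taken from the code above, while the kernel name and the full-warp mask are illustrative, and the block size is assumed to be a multiple of 32.

/* Sketch only: one-warp-per-row CSR y = A * x, warp-shuffle reduction. */
__global__ void sketch_matvec_csr( sparse_matrix A, real *x, real *y,
        int num_rows )
{
    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
    int row = thread_id / 32;          /* one 32-thread warp per row */
    int lane = thread_id & 31;
    real sum = 0.0;

    if ( row < num_rows )
    {
        /* strided partial sums over this row's nonzeros */
        for ( int jj = A.start[row] + lane; jj < A.end[row]; jj += 32 )
        {
            sum += A.entries[jj].val * x[ A.entries[jj].j ];
        }
    }

    /* reduction within the warp; lane 0 ends up with the row total */
    for ( int offset = 16; offset >= 1; offset /= 2 )
    {
        sum += __shfl_down_sync( 0xffffffff, sum, offset );
    }

    if ( lane == 0 && row < num_rows )
    {
        y[row] = sum;
    }
}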
diff --git a/PG-PuReMD/src/cuda_system_props.cu b/PG-PuReMD/src/cuda_system_props.cu
deleted file mode 100644
index 3202f64af932cdf80706c4e5945d83bfe7ed4666..0000000000000000000000000000000000000000
--- a/PG-PuReMD/src/cuda_system_props.cu
+++ /dev/null
@@ -1,406 +0,0 @@
-
-#include "cuda_system_props.h"
-
-#include "cuda_utils.h"
-#include "cuda_reduction.h"
-#include "center_mass.h"
-#include "cuda_copy.h"
-#include "cuda_shuffle.h"
-
-#include "vector.h"
-
-
-CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n )
-{
-#if defined(__SM_35__)
-    extern __shared__ real my_sbp[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real sdata = 0;
-
-    if (i < n)
-    {
-        sdata = sbp[ my_atoms[i].type ].mass;
-    }
-    __syncthreads( );
-
-    for(int z = 16; z >=1; z/=2)
-    {
-        sdata += shfl( sdata, z);
-    }
-
-    if (threadIdx.x % 32 == 0)
-    {
-        my_sbp[threadIdx.x >> 5] = sdata;
-    }
-
-    __syncthreads( );
-
-    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
-    {
-        if(threadIdx.x < offset)
-        {
-            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if(threadIdx.x == 0)
-    {
-        block_results[blockIdx.x] = my_sbp[0];
-    }
-
-#else
-    extern __shared__ real sdata[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0;
-
-    if (i < n)
-    {
-        x = sbp[ my_atoms[i].type ].mass;
-    }
-
-    sdata[ threadIdx.x ] = x;
-    __syncthreads( );
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if (threadIdx.x == 0)
-    {
-        block_results[ blockIdx.x] = sdata [0];
-    }
-
-#endif
-}
-
-
-extern "C" void dev_compute_total_mass( reax_system *system, real *local_val )
-{
-    real *block_mass = (real *) scratch;
-    cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" );
-
-    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
-        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), 
-            cudaMemcpyDeviceToHost, "total_mass:tmp");
-}
-
-
-CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n )
-{
-#if defined(__SM_35__)
-    extern __shared__ real my_sbpdot[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real sdata = 0;
-    rvec p;
-
-    if (i < n)
-    {
-        sdata = sbp[ my_atoms[i].type ].mass;
-        rvec_Scale( p, sdata, my_atoms[ i ].v );
-        sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-    }
-
-    __syncthreads( );
-
-    for(int z = 16; z >=1; z/=2)
-    {
-        sdata += shfl( sdata, z);
-    }
-
-    if (threadIdx.x % 32 == 0)
-    {
-        my_sbpdot[threadIdx.x >> 5] = sdata;
-    }
-
-    __syncthreads( );
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if (threadIdx.x == 0)
-    {
-        block_results[blockIdx.x] = my_sbpdot[0];
-    }
-
-#else
-    extern __shared__ real sdata [];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real    m = 0;
-    rvec p;
-
-    if (i < n)
-    {
-        m = sbp[ my_atoms[i].type ].mass;
-        rvec_Scale( p, m, my_atoms[ i ].v );
-        m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-    }
-
-    sdata[ threadIdx.x ] = m;
-    __syncthreads( );
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if (threadIdx.x == 0)
-    {
-        block_results[blockIdx.x] = sdata[0];
-    }
-#endif
-}
-
-extern "C" void dev_compute_kinetic_energy( reax_system *system,
-        simulation_data *data, real *local_val )
-{
-    real *block_energy = (real *) scratch;
-    cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" );
-
-    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
-        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( local_val, block_energy + BLOCKS_POW_2,
-            //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-            sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
-            //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
-            //        sizeof (real), "kinetic_energy");
-}
-
-
-extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, 
-        rvec vcm, rvec amcm )
-{
-    rvec *l_xcm, *l_vcm, *l_amcm;
-    rvec *r_scratch = (rvec *)scratch;
-
-#if defined( __SM_35__)
-    // xcm
-    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
-    l_xcm = r_scratch;
-    
-    center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
-    (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    
-    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
-            (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    copy_host_device( xcm, l_xcm + BLOCKS_POW_2,
-            sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
-    
-    // vcm
-    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
-    l_vcm = r_scratch;
-    
-    center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    
-    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
-        (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec),
-        cudaMemcpyDeviceToHost, "momentum:vcm" );
-    
-    // amcm
-    cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-    l_amcm = r_scratch;
-    
-    center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    
-    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
-        (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec),
-        cudaMemcpyDeviceToHost, "momemtum:amcm" );
-
-#else
-    cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
-    
-    l_xcm = r_scratch;
-    l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
-    l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
-    
-    center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
-        (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n);
-    cudaThreadSynchronize( ); 
-    cudaCheckError( ); 
-    
-    center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
-        (l_xcm, l_vcm, l_amcm,
-         l_xcm + BLOCKS_POW_2, 
-         l_vcm + BLOCKS_POW_2, 
-         l_amcm + BLOCKS_POW_2, 
-         BLOCKS_POW_2);
-    cudaThreadSynchronize( ); 
-    cudaCheckError( );
-    
-    copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" );
-    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
-    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
-#endif
-}
-
-
-extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm )
-{
-#if defined(__SM_35__)
-    real *partial_results = (real *) scratch;
-    cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" );
-
-    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2,
-        sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" );
-
-#else
-    real *partial_results = (real *) scratch;
-    //real *local_results;
-
-    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
-    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
-
-    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
-            sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
-#endif
-}
-
-
-extern "C" void dev_sync_simulation_data( simulation_data *data )
-{
-    Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data );
-}
-
-
-/*
-CUDA_GLOBAL void ker_kinetic_energy (reax_atom *my_atoms, 
-   single_body_parameters *sbp, int n, real *block_results)
-{
-   extern __shared__ real sken[];
-   rvec p;
-   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-   real x = 0;
-
-   if(i < n)
-   {
-   m = sbp[my_atoms[i].type].mass;
-   rvec_Scale( p, m, my_atoms[i].v );
-   x = 0.5 * rvec_Dot( p, my_atoms[i].v );
-   }
-   sken[threadIdx.x] = x;
-   __syncthreads();
-
-   for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-   {
-   if(threadIdx.x < offset)
-   {   
-   sken[threadIdx.x] += sken[threadIdx.x + offset];
-   }   
-
-   __syncthreads();
-   }
-
-   if(threadIdx.x == 0)
-   {
-   per_block_results[blockIdx.x] = sken[0];
-   }
-}
-
-
-void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *p_ekin)
-{
-   real *spad = (real *) scratch;
-   cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "kinetic_energy");
-
-   ker_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-   (spad, spad + system->n,  system->n);
-   cudaThreadSynchronize (); 
-   cudaCheckError (); 
-
-   k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> 
-   (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, BLOCKS);
-   cudaThreadSynchronize (); 
-   cudaCheckError (); 
-
-   copy_host_device (p_ekin, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-   sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy");
-}
-*/
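The deleted k_compute_total_mass and k_compute_kinetic_energy (non-__SM_35__ path) both use the same two-pass scheme: a per-block shared-memory tree reduction into block_results, followed by a second k_reduction launch with one block of BLOCKS_POW_2 threads over the per-block partials. A minimal standalone sketch of the first pass; the kernel name and the generic input array are illustrative, real comes from reax_types.h, and blockDim.x is assumed to be a power of two (as BLOCK_SIZE is here).

/* Launch with sizeof(real) * blockDim.x bytes of dynamic shared memory. */
__global__ void sketch_block_sum( real *input, real *block_results, int n )
{
    extern __shared__ real sdata[];
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[threadIdx.x] = (i < n) ? input[i] : 0.0;
    __syncthreads( );

    /* tree reduction: halve the number of active threads each step */
    for ( int offset = blockDim.x / 2; offset > 0; offset >>= 1 )
    {
        if ( threadIdx.x < offset )
        {
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        __syncthreads( );
    }

    if ( threadIdx.x == 0 )
    {
        block_results[blockIdx.x] = sdata[0];
    }
}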
diff --git a/PG-PuReMD/src/ffield.c b/PG-PuReMD/src/ffield.c
index 443d905109f4d88f94d36b1da1bdfa61b93185a0..d985339ba074b3c20534a5c9a96ff59a9a8e7d0c 100644
--- a/PG-PuReMD/src/ffield.c
+++ b/PG-PuReMD/src/ffield.c
@@ -20,7 +20,8 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-  #if defined(PURE_REAX)
+
+#if defined(PURE_REAX)
   #include "ffield.h"
   #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
diff --git a/PG-PuReMD/src/ffield.h b/PG-PuReMD/src/ffield.h
index 9aa2a27f69eaee581b2f46d5cbf3db68873a3771..313c3e67c5cd7a7dfafc7c51682d9e2773ed9198 100644
--- a/PG-PuReMD/src/ffield.h
+++ b/PG-PuReMD/src/ffield.h
@@ -24,6 +24,16 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 char Read_Force_Field( char*, reax_interaction*, control_params* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
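The header changes in this patch (ffield.h here, and forces.h, geo_tools.h, grid.h, and hydrogen_bonds.h below) all apply the same fix: wrap the C prototypes in an extern "C" guard so that C++ translation units (presumably the relocated .cu files compiled by nvcc) can include them without name mangling breaking the link against the C objects. The general shape, with some_c_function as a placeholder:

#ifdef __cplusplus
extern "C" {
#endif

void some_c_function( reax_system *, control_params * );

#ifdef __cplusplus
}
#endif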
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index c57527fedd32bf74e90212b33da86ea5671374a9..19133fcee9aaeeb9dcca9e736a1859920a3acd19 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -21,15 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-#ifdef HAVE_CUDA
-  #include "cuda_forces.h"
-  #include "cuda_lin_alg.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_utils.h"
-  #include "cuda_validation.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "forces.h"
   #include "bond_orders.h"
@@ -63,11 +54,7 @@
   #include "reax_vector.h"
 #endif
 
-
-#ifdef HAVE_CUDA
-void Cuda_Total_Forces( reax_system *, control_params *, simulation_data *, storage * );
-void Cuda_Total_Forces_PURE( reax_system *, storage * );
-#endif
+#include "index_utils.h"
 
 
 interaction_function Interaction_Functions[NUM_INTRS];
@@ -221,41 +208,6 @@ void Compute_Total_Force( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        reax_list **lists, mpi_datatypes *mpi_data )
-{
-    rvec *f;
-
-    f = (rvec *) host_scratch;
-    memset( f, 0, sizeof(rvec) * system->N );
-
-    Cuda_Total_Forces( system, control, data, workspace );
-
-#if defined(PURE_REAX)
-    /* now all forces are computed to their partially-final values
-     * based on the neighbors information each processor has had.
-     * final values of force on each atom needs to be computed by adding up
-     * all partially-final pieces */
-
-    //MVAPICH2
-    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N ,
-            cudaMemcpyDeviceToHost, "total_force:f:get" );
-
-    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
-          sizeof(rvec) / sizeof(void), rvec_unpacker );
-
-    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N,
-            cudaMemcpyHostToDevice, "total_force:f:put" );
-
-    Cuda_Total_Forces_PURE( system, dev_workspace );
-#endif
-
-}
-#endif
-
-
 // Essentially no-cuda copies of cuda kernels, to be used only in the mpi-not-gpu version
 ////////////////////////
 // HBOND ISSUE
@@ -1851,173 +1803,6 @@ int Compute_Forces( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Compute_Forces( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        output_controls *out_control, mpi_datatypes *mpi_data )
-{
-    int charge_flag, retVal;
-
-#if defined(LOG_PERFORMANCE)
-    real t_start = 0;
-
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-    {
-        t_start = Get_Time( );
-    }
-#endif
-
-    retVal = SUCCESS;
-
-    /********* init forces ************/
-    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
-    {
-        charge_flag = TRUE;
-    }
-    else
-    {
-        charge_flag = FALSE;
-    }
-
-    if ( charge_flag == TRUE )
-    {
-        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
-
-//        int i;
-//        static reax_list **temp_lists;
-//       
-//        if ( data->step == 0 )
-//        {
-//            temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" );
-//            for ( i = 0; i < LIST_N; ++i )
-//            {
-//                temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-//                temp_lists[i]->allocated = FALSE;
-//            }
-//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
-//                    TYP_BOND, *temp_lists + BONDS );
-//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
-//                    TYP_HBOND, *temp_lists + HBONDS );
-//        }
-//        else
-//        {
-//            Delete_List( *temp_lists + BONDS );
-//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
-//                    TYP_BOND, *temp_lists + BONDS );
-//            Delete_List( *temp_lists + HBONDS );
-//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
-//                    TYP_HBOND, *temp_lists + HBONDS );
-//
-//        }
-//        Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND );
-//        Print_Bonds( system, temp_lists, control );
-//        Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND );
-//        Print_HBonds( system, temp_lists, control, data->step );
-//        Print_HBond_Indices( system, temp_lists, control, data->step );
-//        exit( 0 );
-    }
-    else
-    {
-        retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
-    }
-
-    if ( retVal == SUCCESS )
-    {
-        //validate_sparse_matrix( system, workspace );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.init_forces) );
-        }
-#endif
-
-        /********* bonded interactions ************/
-        retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.bonded) );
-        }
-#endif
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: completed bonded\n",
-                 system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-    }
-
-    if ( retVal == SUCCESS )
-    {
-    /**************** charges ************************/
-#if defined(PURE_REAX)
-        if ( charge_flag == TRUE )
-        {
-            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
-        }
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.qEq) );
-        }
-#endif
-
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-#endif //PURE_REAX
-
-        /********* nonbonded interactions ************/
-        Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
-                lists, out_control, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.nonb) );
-        }
-#endif
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
-                system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        /*********** total force ***************/
-        Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.bonded) );
-        }
-#endif
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: total forces computed\n",
-                system->my_rank, data->step );
-        //Print_Total_Force( system, data, workspace );
-        MPI_Barrier( MPI_COMM_WORLD );
-
-#endif
-
-//        Print_Forces( system );
-    }
-
-    return retVal;
-}
-#endif
-
-
 int validate_device( reax_system *system, simulation_data *data,
         storage *workspace, reax_list **lists )
 {
diff --git a/PG-PuReMD/src/forces.h b/PG-PuReMD/src/forces.h
index 6b4218e865576773ca7cb76d2ebff2408c814e72..0579f092de4a046e6e91d4e41635de22616b4ea8 100644
--- a/PG-PuReMD/src/forces.h
+++ b/PG-PuReMD/src/forces.h
@@ -28,6 +28,10 @@
 extern interaction_function Interaction_Functions[NUM_INTRS];
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Init_Force_Functions( control_params* );
 
 int Compute_Forces( reax_system*, control_params*, simulation_data*,
@@ -36,10 +40,11 @@ int Compute_Forces( reax_system*, control_params*, simulation_data*,
 void Estimate_Storages( reax_system*, control_params*, reax_list**,
         int*, int*, int*, int* );
 
-int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
-        storage*, reax_list**, output_controls*, mpi_datatypes* );
-
 int validate_device( reax_system *, simulation_data *, storage *, reax_list ** );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/geo_tools.c b/PG-PuReMD/src/geo_tools.c
index b97123a955f2f1cc61b1d1f8db3e11ec233bce13..dff292e74019a18c78daa34519bd15b5246d0224 100644
--- a/PG-PuReMD/src/geo_tools.c
+++ b/PG-PuReMD/src/geo_tools.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "geo_tools.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "tool_box.h"
diff --git a/PG-PuReMD/src/geo_tools.h b/PG-PuReMD/src/geo_tools.h
index 8078685689afa1d6edbe7b4534dd3bc65c45d1c5..628e8f74e6d6f484cfb76a544dbef4dc5c6aeb8e 100644
--- a/PG-PuReMD/src/geo_tools.h
+++ b/PG-PuReMD/src/geo_tools.h
@@ -29,10 +29,6 @@
 // CUSTOM ATOM: serial element name x y z
 #define CUSTOM_ATOM_FORMAT " %d %s %s %lf %lf %lf"
 
-char Read_Geo( char*, reax_system*, control_params*,
-               simulation_data*, storage*, mpi_datatypes* );
-
-
 /*PDB format :
 http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/guide2.2_frame.html
 
@@ -114,10 +110,23 @@ COLUMNS       DATA TYPE       FIELD         DEFINITION
 #define PDB_ATOM_FORMAT_O_LENGTH 81
 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+char Read_Geo( char*, reax_system*, control_params*,
+        simulation_data*, storage*, mpi_datatypes* );
+
 char Read_PDB( char*, reax_system*, control_params*,
-               simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
 
 char Write_PDB( reax_system*, reax_list*, simulation_data*,
-                control_params*, mpi_datatypes*, output_controls* );
+        control_params*, mpi_datatypes*, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/grid.c b/PG-PuReMD/src/grid.c
index 3714766caeefbacbc16db74fc0c6916f4515a7e4..d893f6c60e166f8e11092e203817a3f610439c64 100644
--- a/PG-PuReMD/src/grid.c
+++ b/PG-PuReMD/src/grid.c
@@ -19,15 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "grid.h"
+
 #include "allocate.h"
+#include "index_utils.h"
 #include "io_tools.h"
 #include "reset_tools.h"
 #include "tool_box.h"
 #include "vector.h"
 
-#include "index_utils.h"
-
 
 /* determines the exchange boundaries with nbrs in terms of gcells */
 void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
diff --git a/PG-PuReMD/src/grid.h b/PG-PuReMD/src/grid.h
index ad51e699182a9b49e1de6b488605ec4e48179cd8..cb124da7b902a1d62864f2a801782875644cb73b 100644
--- a/PG-PuReMD/src/grid.h
+++ b/PG-PuReMD/src/grid.h
@@ -24,10 +24,24 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Setup_New_Grid( reax_system*, control_params*, MPI_Comm );
+
 void Update_Grid( reax_system*, control_params*, MPI_Comm );
+
 void Bin_My_Atoms( reax_system*, reallocate_data* );
+
 void Reorder_My_Atoms( reax_system*, storage* );
+
 void Bin_Boundary_Atoms( reax_system* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/hydrogen_bonds.c b/PG-PuReMD/src/hydrogen_bonds.c
index 5743feb5c7cae2861fd63fa0f106be93700a6c97..dfd7abac747a8c7e78d48d47a63aa9600af1d379 100644
--- a/PG-PuReMD/src/hydrogen_bonds.c
+++ b/PG-PuReMD/src/hydrogen_bonds.c
@@ -21,8 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
 #if defined(PURE_REAX)
   #include "hydrogen_bonds.h"
   #include "bond_orders.h"
@@ -37,6 +35,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 // DANIEL
 // This function is taken straight from PuReMD, with minimal changes to accomodate the new datastructures
diff --git a/PG-PuReMD/src/hydrogen_bonds.h b/PG-PuReMD/src/hydrogen_bonds.h
index 346f00453f10ccf7173db07040a15461162e1781..e4f58e104d6646fddc35ef51c8c169c172f07eca 100644
--- a/PG-PuReMD/src/hydrogen_bonds.h
+++ b/PG-PuReMD/src/hydrogen_bonds.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Hydrogen_Bonds( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 595724bc7df70cc2f63278ec4c2df594172876db..2e406d1a94994d2e85c6116ac45d76b8f39da8fa 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -23,17 +23,6 @@
 
 #include <stddef.h>
 
-#ifdef HAVE_CUDA
-  #include "cuda_allocate.h"
-  #include "cuda_list.h"
-  #include "cuda_copy.h"
-  #include "cuda_forces.h"
-  #include "cuda_init_md.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_reset_tools.h"
-  #include "cuda_validation.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "init_md.h"
   #include "allocate.h"
@@ -239,76 +228,6 @@ int Init_System( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Init_System( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        mpi_datatypes *mpi_data, char *msg )
-{
-    int i, ret;
-    reax_atom *atom;
-    int nrecv[MAX_NBRS];
-
-    Setup_New_Grid( system, control, MPI_COMM_WORLD );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d GRID:\n", system->my_rank );
-    Print_Grid( &(system->my_grid), stderr );
-#endif
-
-    Bin_My_Atoms( system, &(workspace->realloc) );
-    Reorder_My_Atoms( system, workspace );
-
-    /* estimate N and total capacity */
-    for ( i = 0; i < MAX_NBRS; ++i )
-    {
-        nrecv[i] = 0;
-    }
-
-    MPI_Barrier( MPI_COMM_WORLD );
-    system->max_recved = 0;
-    system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-            Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
-    system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
-    Bin_Boundary_Atoms( system );
-
-    /* Sync atoms here to continue the computation */
-    dev_alloc_system( system );
-    Sync_System( system );
-
-    /* estimate numH and Hcap */
-    Cuda_Reset_Atoms( system, control );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
-    fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
-    fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
-#endif
-
-    Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
-
-    Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
-
-//    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
-//    {
-//        return FAILURE;
-//    }
-
-    /* initialize velocities so that desired init T can be attained */
-    if ( !control->restart || (control->restart && control->random_vel) )
-    {
-        Generate_Initial_Velocities( system, control->T_init );
-    }
-
-    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-    return SUCCESS;
-}
-#endif
-
-
 /************************ initialize simulation data ************************/
 void Init_Simulation_Data( reax_system *system, control_params *control,
         simulation_data *data, char *msg )
@@ -411,102 +330,6 @@ void Init_Simulation_Data( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
-        simulation_data *data, char *msg )
-{
-    dev_alloc_simulation_data( data );
-
-    Reset_Simulation_Data( data );
-
-    if ( !control->restart )
-    {
-        data->step = data->prev_steps = 0;
-    }
-
-    switch ( control->ensemble )
-    {
-    case NVE:
-        data->N_f = 3 * system->bigN;
-        Cuda_Evolve = Velocity_Verlet_NVE;
-        control->virial = 0;
-        break;
-
-    case bNVT:
-        data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
-        control->virial = 0;
-        break;
-
-    case nhNVT:
-        fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
-        data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-        control->virial = 0;
-        if ( !control->restart || (control->restart && control->random_vel) )
-        {
-            data->therm.G_xi = control->Tau_T *
-                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
-            data->therm.v_xi = data->therm.G_xi * control->dt;
-            data->therm.v_xi_old = 0;
-            data->therm.xi = 0;
-        }
-        break;
-
-    case sNPT: /* Semi-Isotropic NPT */
-        data->N_f = 3 * system->bigN + 4;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        if ( !control->restart )
-        {
-            Reset_Pressures( data );
-        }
-        break;
-
-    case iNPT: /* Isotropic NPT */
-        data->N_f = 3 * system->bigN + 2;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        if ( !control->restart )
-        {
-            Reset_Pressures( data );
-        }
-        break;
-
-    case NPT: /* Anisotropic NPT */
-        data->N_f = 3 * system->bigN + 9;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-
-        fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
-              system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
-        break;
-
-    default:
-        fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n",
-              system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
-    }
-
-    /* initialize the timer(s) */
-    MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-    {
-        data->timing.start = Get_Time( );
-
-#if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
-#endif
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
-#endif
-}
-#endif
-
-
 #elif defined(LAMMPS_REAX)
 int Init_System( reax_system *system, char *msg )
 {
@@ -603,22 +426,6 @@ void Init_Workspace( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Init_Workspace( reax_system *system, control_params *control,
-        storage *workspace, char *msg )
-{
-    dev_alloc_workspace( system, control, dev_workspace,
-            system->local_cap, system->total_cap, msg );
-
-    memset( &(workspace->realloc), 0, sizeof(reallocate_data) );
-    Cuda_Reset_Workspace( system, workspace );
-
-    /* Initialize the Taper function */
-    Init_Taper( control, dev_workspace );
-}
-#endif
-
-
 /************** setup communication data structures  **************/
 int Init_MPI_Datatypes( reax_system *system, storage *workspace,
         mpi_datatypes *mpi_data, char *msg )
@@ -885,88 +692,6 @@ int Init_Lists( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Init_Lists( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        mpi_datatypes *mpi_data, char *msg )
-{
-    int ret;
-    int Htop;
-   
-    /* ignore returned error, as system->d_max_far_nbrs was not valid */
-    ret = Cuda_Estimate_Neighbors( system, data->step );
-
-    Dev_Make_List( system->total_cap, system->total_far_nbrs,
-            TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
-            system->my_rank, system->total_far_nbrs,
-            (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
-    fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap );
-#endif
-
-    Cuda_Init_Neighbor_Indices( system );
-
-    Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
-
-    /* estimate storage for bonds and hbonds */
-    Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
-
-    /* estimate storage for charge sparse matrix */
-//    Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists );
-
-    dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
-
-    Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
-
-    //MATRIX CHANGES
-    //workspace->L = NULL;
-    //workspace->U = NULL;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n",
-            system->my_rank, system->total_cm_entries, dev_workspace->H.m );
-    fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
-            system->my_rank, Htop,
-            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
-#endif
-
-    if ( control->hbond_cut > 0.0 &&  system->numH > 0 )
-    {
-        Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
-//        Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS );
-
-        Cuda_Init_HBond_Indices( system );
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                system->my_rank, system->total_hbonds,
-                (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
-#endif
-    }
-
-    /* bonds list */
-    Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
-//    Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS );
-
-    Cuda_Init_Bond_Indices( system );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-            system->my_rank, total_bonds,
-            (int)(total_bonds * sizeof(bond_data) / (1024 * 1024)) );
-#endif
-
-    /* 3bodies list: since a more accurate estimate of the num.
-     * of three body interactions requires that bond orders have
-     * been computed, delay estimation until for computation */
-
-    return SUCCESS;
-}
-#endif
-
-
 #if defined(PURE_REAX)
 void Initialize( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
@@ -1106,108 +831,6 @@ void Pure_Initialize( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Initialize( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        reax_list **lists, output_controls *out_control,
-        mpi_datatypes *mpi_data )
-{
-    char msg[MAX_STR];
-    real t_start, t_end;
-
-    /* HOST/DEVICE SCRATCH */
-    Cuda_Init_ScratchArea( );
-
-    /* MPI_DATATYPES */
-    if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    /* SYSTEM */
-    if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    /* GRID */
-    dev_alloc_grid( system );
-    Sync_Grid( &system->my_grid, &system->d_my_grid );
-
-    //validate_grid( system );
-
-    /* SIMULATION_DATA */
-    Cuda_Init_Simulation_Data( system, control, data, msg );
-
-    /* WORKSPACE */
-    Cuda_Init_Workspace( system, control, workspace, msg );
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
-#endif
-
-    //Sync the taper here from host to device.
-
-    /* CONTROL */
-    dev_alloc_control( control );
-
-    /* LISTS */
-    if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
-            FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
-#endif
-
-    /* OUTPUT Files */
-    if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: output files opened\n", system->my_rank );
-#endif
-
-    /* Lookup Tables */
-    if ( control->tabulate )
-    {
-        if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE )
-        {
-            fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-            fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
-            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-        }
-
-#if defined(DEBUG)
-        fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
-#endif
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
-#endif
-}
-#endif
-
-
 #elif defined(LAMMPS_REAX)
 void Initialize( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
diff --git a/PG-PuReMD/src/init_md.h b/PG-PuReMD/src/init_md.h
index 5a66e4fbffa1e758b74a77c9427bf15e3f91f35e..c5222cbd3d2b2a4d2ddef5fe3b976e7d0644ffab 100644
--- a/PG-PuReMD/src/init_md.h
+++ b/PG-PuReMD/src/init_md.h
@@ -25,14 +25,25 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Generate_Initial_Velocities( reax_system *, real );
+
+int Init_MPI_Datatypes( reax_system *, storage *, mpi_datatypes *, char * );
+
 void Initialize( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls*, mpi_datatypes* );
 
 void Pure_Initialize( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls*, mpi_datatypes* );
 
-void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
-        storage*, reax_list**, output_controls*, mpi_datatypes* );
+void Init_Taper( control_params *,  storage * );
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c
index 88b406b5dcae7d1870dcfde0ee10f8f798b4140b..b02008972bf42c235df90fd16ba5b8f84e6097ba 100644
--- a/PG-PuReMD/src/integrate.c
+++ b/PG-PuReMD/src/integrate.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "integrate.h"
 
 #include "allocate.h"
@@ -33,14 +35,6 @@
 #include "tool_box.h"
 #include "vector.h"
 
-#ifdef HAVE_CUDA
-  #include "cuda_allocate.h"
-  #include "cuda_integrate.h"
-  #include "cuda_copy.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_reset_tools.h"
-#endif
-
 
 int Velocity_Verlet_NVE( reax_system* system, control_params* control,
         simulation_data *data, storage *workspace, reax_list **lists,
@@ -339,143 +333,6 @@ int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        output_controls *out_control, mpi_datatypes *mpi_data )
-{
-    int i, steps, renbr, ret;
-    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
-    real inv_m, dt, lambda;
-    rvec dx;
-    reax_atom *atom;
-    int *bond_top, *hb_top;
-    int Htop, num_3body;
-    int total_hbonds, count, total_bonds;
-    int bond_cap, cap_3body;
-    real t_over_start, t_over_elapsed;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-    dt = control->dt;
-    steps = data->step - data->prev_steps;
-    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
-    ret = SUCCESS;
-
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
-    if ( verlet_part1_done == FALSE )
-    {
-        /* velocity verlet, 1st part */
-        bNVT_update_velocity_part1( system, dt );
-        verlet_part1_done = TRUE;
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        if ( renbr )
-        {
-            Update_Grid( system, control, mpi_data->world );
-        }
-
-        Output_Sync_Atoms( system );
-        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
-        Sync_Atoms( system );
-
-        /* synch the Grid to the Device here */
-        Sync_Grid( &system->my_grid, &system->d_my_grid );
-
-        init_blocks( system );
-
-#if defined(__CUDA_DEBUG_LOG__)
-        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
-                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
-#endif
-    }
-    
-    Cuda_Reset( system, control, data, workspace, lists );
-
-    if ( renbr )
-    {
-#if defined(DEBUG)
-        t_over_start  = Get_Time ();
-#endif
-
-        if ( estimate_nbrs_done == 0 )
-        {
-            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
-            ret = Cuda_Estimate_Neighbors( system, data->step );
-            estimate_nbrs_done = 1;
-        }
-
-        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
-        {
-            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-            estimate_nbrs_done = 2;
-    
-#if defined(DEBUG)
-            t_over_elapsed  = Get_Timing_Info( t_over_start );
-            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
-                    system->my_rank, data->step, t_over_elapsed );
-#endif
-        }
-    }
-
-    if ( ret == SUCCESS )
-    {
-        ret = Cuda_Compute_Forces( system, control, data, workspace,
-                lists, out_control, mpi_data );
-    }
-
-    if ( ret == SUCCESS )
-    {
-        /* velocity verlet, 2nd part */
-        bNVT_update_velocity_part2( system, dt );
-
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        /* temperature scaler */
-        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-        lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-        if ( lambda < MIN_dT )
-        {
-            lambda = MIN_dT;
-        }
-        else if (lambda > MAX_dT )
-        {
-            lambda = MAX_dT;
-        }
-        lambda = SQRT( lambda );
-
-        /* Scale velocities and positions at t+dt */
-        bNVT_scale_velocities( system, lambda );
-
-        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: scaled velocities\n",
-                 system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        verlet_part1_done = FALSE;
-        estimate_nbrs_done = 0;
-    }
-
-    return ret;
-}
-#endif
-
-
 /* uses Berendsen-type coupling for both T and P.
  * All box dimensions are scaled by the same amount,
  * there is no change in the angles between axes. */
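The device integrator removed above rescales velocities with the Berendsen factor lambda = sqrt( 1 + (dt / Tau_T) * (T_target / T - 1) ), clamped between MIN_dT and MAX_dT before the square root, just as the host routine in this file does. A free-standing sketch of that step follows; the function names and clamp parameters are illustrative, not code from integrate.c.

    #include <math.h>

    /* Berendsen velocity-rescaling factor; the clamp is applied before the
     * square root so one bad step cannot blow up the velocities.
     * min_scale/max_scale stand in for MIN_dT/MAX_dT above. */
    static double berendsen_lambda( double dt, double tau_T, double T_target,
            double T_current, double min_scale, double max_scale )
    {
        double lambda;

        lambda = 1.0 + (dt / tau_T) * (T_target / T_current - 1.0);

        if ( lambda < min_scale )
        {
            lambda = min_scale;
        }
        else if ( lambda > max_scale )
        {
            lambda = max_scale;
        }

        return sqrt( lambda );
    }

    /* scale every velocity component in place: v <- lambda * v */
    static void scale_velocities( int n, double (*v)[3], double lambda )
    {
        int i, d;

        for ( i = 0; i < n; ++i )
        {
            for ( d = 0; d < 3; ++d )
            {
                v[i][d] *= lambda;
            }
        }
    }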
diff --git a/PG-PuReMD/src/integrate.h b/PG-PuReMD/src/integrate.h
index 63fa9cbf711c934562def006f8899a3276106844..9a25c761647034226eb1c58abe07cbd7dfce2197 100644
--- a/PG-PuReMD/src/integrate.h
+++ b/PG-PuReMD/src/integrate.h
@@ -24,6 +24,11 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls*, mpi_datatypes* );
 
@@ -49,9 +54,9 @@ int Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
                 output_controls*, mpi_datatypes* );
 */
 
-//CUDA SPECIFIC FUNCTIONS
-int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
-        simulation_data*, storage*, reax_list**, output_controls*,
-        mpi_datatypes* );
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index c7c0f2fef669a16a9180b58f27c1c064b33e3934..131f8a2e955fefd4fec111f5b5e6c1b1ad4f2de7 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -20,7 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
   #include "io_tools.h"
   #include "basic_comm.h"
@@ -41,6 +41,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 print_interaction Print_Interactions[NUM_INTRS];
 
diff --git a/PG-PuReMD/src/io_tools.h b/PG-PuReMD/src/io_tools.h
index 6ae2d6d8b0b015b0be1aa3c543be37186cb7edbd..f83c9686400a03a70e35a06f777b1a9965e0f02e 100644
--- a/PG-PuReMD/src/io_tools.h
+++ b/PG-PuReMD/src/io_tools.h
@@ -25,45 +25,71 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Output_Files( reax_system*, control_params*,
-                       output_controls*, mpi_datatypes*, char* );
+        output_controls*, mpi_datatypes*, char* );
+
 int Close_Output_Files( reax_system*, control_params*,
-                        output_controls*, mpi_datatypes* );
-
-void  Print_Box( simulation_box*, char*, FILE* );
-
-void  Print_Grid( grid*, FILE* );
-void  Print_GCell_Exchange_Bounds( int, neighbor_proc* );
-void  Print_Native_GCells( reax_system* );
-void  Print_All_GCells( reax_system*);
-
-void  Print_Init_Atoms( reax_system*, storage* );
-void  Print_My_Atoms( reax_system* );
-void  Print_My_Ext_Atoms( reax_system* );
-
-void  Print_Far_Neighbors( reax_system*, reax_list**, control_params *);
-void  Print_Sparse_Matrix( reax_system*, sparse_matrix* );
-void  Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* );
-void  Print_Linear_System( reax_system*, control_params*, storage*, int );
-void  Print_LinSys_Soln( reax_system*, real*, real*, real* );
-void  Print_Charges( reax_system* );
-void  Print_HBonds( reax_system*, reax_list**, control_params *, int );
-void  Print_HBond_Indices( reax_system*, reax_list**, control_params *, int );
-void  Print_Bonds( reax_system*, reax_list**, control_params *);
-void  Print_Bond_List2( reax_system*, reax_list*, char* );
-void  Print_Total_Force( reax_system*, simulation_data*, storage* );
-void  Output_Results( reax_system*, control_params*, simulation_data*,
-                      reax_list**, output_controls*, mpi_datatypes* );
+        output_controls*, mpi_datatypes* );
+
+void Print_Box( simulation_box*, char*, FILE* );
+
+void Print_Grid( grid*, FILE* );
+
+void Print_GCell_Exchange_Bounds( int, neighbor_proc* );
+
+void Print_Native_GCells( reax_system* );
+
+void Print_All_GCells( reax_system*);
+
+void Print_Init_Atoms( reax_system*, storage* );
+
+void Print_My_Atoms( reax_system* );
+
+void Print_My_Ext_Atoms( reax_system* );
+
+void Print_Far_Neighbors( reax_system*, reax_list**, control_params *);
+
+void Print_Sparse_Matrix( reax_system*, sparse_matrix* );
+
+void Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* );
+
+void Print_Linear_System( reax_system*, control_params*, storage*, int );
+
+void Print_LinSys_Soln( reax_system*, real*, real*, real* );
+
+void Print_Charges( reax_system* );
+
+void Print_HBonds( reax_system*, reax_list**, control_params *, int );
+
+void Print_HBond_Indices( reax_system*, reax_list**, control_params *, int );
+
+void Print_Bonds( reax_system*, reax_list**, control_params *);
+
+void Print_Bond_List2( reax_system*, reax_list*, char* );
+
+void Print_Total_Force( reax_system*, simulation_data*, storage* );
+
+void Output_Results( reax_system*, control_params*, simulation_data*,
+        reax_list**, output_controls*, mpi_datatypes* );
 
 #if defined(DEBUG_FOCUS) || defined(TEST_FORCES) || defined(TEST_ENERGY)
 void Debug_Marker_Bonded( output_controls*, int );
+
 void Debug_Marker_Nonbonded( output_controls*, int );
-void  Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                 simulation_data*, output_controls* );
-void  Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                simulation_data*, output_controls* );
-void  Print_Bond_List( reax_system*, control_params*, simulation_data*,
-                       reax_list**, output_controls* );
+
+void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
+        simulation_data*, output_controls* );
+
+void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
+        simulation_data*, output_controls* );
+
+void Print_Bond_List( reax_system*, control_params*, simulation_data*,
+        reax_list**, output_controls* );
+
 /*void Dummy_Printer( reax_system*, control_params*, simulation_data*,
             storage*, reax_list**, output_controls* );
 void Print_Bond_Orders( reax_system*, control_params*, simulation_data*,
@@ -89,23 +115,28 @@ void Print_Total_Force( reax_system*, control_params*, simulation_data*,
             storage*, reax_list**, output_controls* );
 void Compare_Total_Forces( reax_system*, control_params*, simulation_data*,
 storage*, reax_list**, output_controls* );*/
+
 //void  Print_Total_Force( reax_system*, control_params* );
+
 void Print_Force_Files( reax_system*, control_params*, simulation_data*,
-                        storage*, reax_list**, output_controls*,
-                        mpi_datatypes * );
+        storage*, reax_list**, output_controls*, mpi_datatypes * );
+
 //void Init_Force_Test_Functions( );
 
 int fn_qsort_intcmp( const void *, const void * );
 
 void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
-                               simulation_data*, output_controls* );
+        simulation_data*, output_controls* );
 
 void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                simulation_data*, output_controls* );
+        simulation_data*, output_controls* );
 
 void Print_Bond_List( reax_system*, control_params*, simulation_data*,
-                      reax_list**, output_controls*);
+        reax_list**, output_controls*);
+#endif
 
+#ifdef __cplusplus
+}
 #endif
 
 
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index bac272a0c424c213756b2b9aaa167497247f211d..e9ce62e72411dd1e7739d79b3a2875476b63ed48 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "lin_alg.h"
 
 #include "basic_comm.h"
@@ -27,9 +29,7 @@
 #include "vector.h"
 
 #ifdef HAVE_CUDA
-  #include "cuda_lin_alg.h"
-  #include "cuda_utils.h"
-  #include "cuda_validation.h"
+  #include "cuda/cuda_validation.h"
 #endif
 
 #if defined(CG_PERFORMANCE)
@@ -100,13 +100,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 #endif
 
 #ifdef HAVE_CUDA
-    check_zeros_host (x, system->N, "x");
+    check_zeros_host( x, system->N, "x" );
 #endif
 
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
 
 #ifdef HAVE_CUDA
-    check_zeros_host (x, system->N, "x");
+    check_zeros_host( x, system->N, "x" );
 #endif
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
@@ -285,352 +285,6 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
-        simulation_data *data )
-{
-    int  i, j, n, N, matvecs, scale;
-    rvec2 tmp, alpha, beta;
-    rvec2 my_sum, norm_sqr, b_norm, my_dot;
-    rvec2 sig_old, sig_new;
-    MPI_Comm comm;
-    rvec2 *spad = (rvec2 *) host_scratch;
-    int a;
-
-    n = system->n;
-    N = system->N;
-    comm = mpi_data->world;
-    matvecs = 0;
-    scale = sizeof(rvec2) / sizeof(void);
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        matvecs = 0;
-        t_start = matvec_time = dot_time = 0;
-        t_start = Get_Time( );
-    }
-#endif
-
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
-//#endif
-
-//  check_zeros_device( x, system->N, "x" );
-
-    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
-    Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
-
-//  check_zeros_device( x, system->N, "x" );
-
-//  compare_rvec2 (workspace->x, x, N, "x");
-//  if (data->step > 0) {
-//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
-//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
-//
-//      exit (0);
-//  }
-
-
-//#ifdef __CUDA_DEBUG__
-//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
-//#endif
-    //originally we were using only H->n which was system->n (init_md.c)
-    //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
-    
-    Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
-
-//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-//  if (data->step > 0) exit (0);
-
-    // tryQEq
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-    
-    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-            cudaMemcpyDeviceToHost, "CG:q2:get" );
-    Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
-    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-            cudaMemcpyHostToDevice,"CG:q2:put" );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
-//#ifdef __CUDA_DEBUG__
-//  for( j = 0; j < system->n; ++j ) {
-//    // residual
-//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
-//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
-//    // apply diagonal pre-conditioner
-//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//  }
-//#endif
-    
-    Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
-
-//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
-//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
-
-    /* norm of b */
-//#ifdef __CUDA_DEBUG__
-//  my_sum[0] = my_sum[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_sum[0] += SQR( workspace->b[j][0] );
-//    my_sum[1] += SQR( workspace->b[j][1] );
-//  }
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-//#endif
-
-    my_sum[0] = my_sum[1] = 0;
-    Cuda_Norm (b, n, my_sum);
-
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-
-    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
-    b_norm[0] = SQRT( norm_sqr[0] );
-    b_norm[1] = SQRT( norm_sqr[1] );
-    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
-
-    /* dot product: r.d */
-//#ifdef __CUDA_DEBUG__
-//  my_dot[0] = my_dot[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-//  }
-//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-    my_dot[0] = my_dot[1] = 0;
-    Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
-
-// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-    
-    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &dot_time );
-    }
-#endif
-
-    for ( i = 1; i < 300; ++i )
-    {
-        //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
-//#endif
-        
-        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cg:d2:get" );
-        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyHostToDevice, "cg:d2:put" );
-
-        //print_device_rvec2 (dev_workspace->d2, N);
-
-//#ifdef __CUDA_DEBUG__
-//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
-//#endif
-        
-        Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
-                system->total_cap );
-
-        /*
-        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
-        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
-        print_sparse_matrix_host (&workspace->H);
-        fprintf (stderr, " ******* HOST Vector ***************\n");
-        print_host_rvec2 (workspace->d2, system->N);
-        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
-        print_sparse_matrix (&dev_workspace->H);
-        fprintf (stderr, " ******* Device Vector ***************\n");
-        print_device_rvec2 (dev_workspace->d2, system->N);
-        */
-        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-        // tryQEq
-        // MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-
-        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cg:q2:get" );
-        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
-        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyHostToDevice, "cg:q2:put" );
-
-//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
-        /* dot product: d.q */
-//#ifdef __CUDA_DEBUG__
-//    my_dot[0] = my_dot[1] = 0;
-//    for( j = 0; j < n; ++j ) {
-//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
-//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
-        Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
-
-        alpha[0] = sig_new[0] / tmp[0];
-        alpha[1] = sig_new[1] / tmp[1];
-        my_dot[0] = my_dot[1] = 0;
-
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // update x
-//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
-//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
-//      // update residual
-//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
-//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-//      // apply diagonal pre-conditioner
-//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//      // dot product: r.p
-//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
-//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
-        Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
-
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
-//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
-//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
-
-        sig_old[0] = sig_new[0];
-        sig_old[1] = sig_new[1];
-        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
-
-        if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
-        {
-            break;
-        }
-
-        beta[0] = sig_new[0] / sig_old[0];
-        beta[1] = sig_new[1] / sig_old[1];
-
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // d = p + beta * d
-//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
-//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
-//    }
-//#endif
-
-        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
-                dev_workspace->d2, system->n );
-
-//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
-    }
-
-
-    if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
-    {
-        //for( j = 0; j < n; ++j )
-        //  workspace->t[j] = workspace->x[j][1];
-        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
-
-        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
-
-        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
-        //for( j = 0; j < n; ++j )
-        //  workspace->x[j][1] = workspace->t[j];
-
-        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
-    }
-    else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
-    {
-        //for( j = 0; j < n; ++j )
-        //  workspace->s[j] = workspace->x[j][0];
-
-        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
-
-        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
-        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-
-        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->x[j][0] = workspace->s[j];
-
-        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
-    }
-
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
-                system->my_rank, i );
-        fprintf( stderr, "    [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] );
-        fprintf( stderr, "    [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] );
-    }
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
-                i + 1, matvecs, matvec_time, dot_time );
-    }
-#endif
-
-    return (i + 1) + matvecs;
-}
-#endif
-
-
 void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 {
     int  i, j, k, si;
@@ -745,153 +399,6 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
-        *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
-{
-    int  i, j, scale;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
-    real *spad = (real *) host_scratch;
-
-    scale = sizeof(real) / sizeof(void);
-
-    /* x is on the device */
-    //MVAPICH2
-    memset( spad, 0, sizeof(real) * system->total_cap );
-    copy_host_device( spad, x, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
-    Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
-
-    //MVAPICH2
-    copy_host_device( spad, x, sizeof(real) * system->total_cap,
-            cudaMemcpyHostToDevice, "cuda_cg:x:put" );
-    Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
-
-    // tryQEq
-    // MVAPICH2
-    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
-    Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
-
-    //MVAPICH2
-    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-            cudaMemcpyHostToDevice, "cuda_cg:q:put" );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
-    Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
-            system->n );
-    //for( j = 0; j < system->n; ++j )
-    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
-    Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
-            dev_workspace->Hdia_inv, system->n );
-
-    //TODO do the parallel_norm on the device for the local sum
-    copy_host_device( spad, b, sizeof(real) * system->n,
-            cudaMemcpyDeviceToHost, "cuda_cg:b:get" );
-    b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
-
-    //TODO do the parallel dot on the device for the local sum
-    copy_host_device( spad, dev_workspace->r, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
-    copy_host_device( spad + system->total_cap, dev_workspace->d, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
-    sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
-            mpi_data->world );
-
-    sig0 = sig_new;
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &dot_time );
-    }
-#endif
-
-    for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
-    {
-        //MVAPICH2
-        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
-        Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
-        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
-                cudaMemcpyHostToDevice, "cuda_cg:d:put" );
-
-        Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
-
-        //tryQEq
-        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
-        Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
-        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-                cudaMemcpyHostToDevice, "cuda_cg:q:get" );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
-        //TODO do the parallel dot on the device for the local sum
-        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
-        copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
-        tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
-
-        alpha = sig_new / tmp;
-        //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
-        Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
-
-        //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
-                dev_workspace->r, system->n );
-        /* pre-conditioning */
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
-                dev_workspace->Hdia_inv, system->n );
-
-        sig_old = sig_new;
-
-        //TODO do the parallel dot on the device for the local sum
-        copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
-        copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
-        sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
-        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
-
-        beta = sig_new / sig_old;
-        Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
-                dev_workspace->d, system->n );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
-    }
-
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
-    }
-
-    return i;
-}
-#endif
-
-
 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
         *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
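Both the removed device solvers and the host CG kept in lin_alg.c implement the same diagonally preconditioned conjugate gradient recurrence: q = H d, alpha = sig_new / (d . q), x += alpha d, r -= alpha q, p = Hdia_inv * r, beta = sig_new / sig_old, d = p + beta d, iterating until SQRT(sig_new) / ||b|| falls below tol. A serial, single-process sketch of that recurrence follows (no MPI reductions; a hypothetical matvec callback stands in for Sparse_MatVec / Cuda_Matvec).

    #include <math.h>
    #include <stdlib.h>

    /* caller-supplied sparse matrix-vector product: b = A * x */
    typedef void (*matvec_fn)( const void *A, const double *x, double *b, int n );

    /* Jacobi-preconditioned CG on one process; returns the iteration count,
     * like the solvers above (allocation error checking omitted in this sketch) */
    static int cg_sketch( const void *A, matvec_fn matvec, const double *Hdia_inv,
            const double *b, double tol, double *x, int n, int max_iters )
    {
        int i, j;
        double alpha, beta, tmp, b_norm, sig_old, sig_new;
        double *r, *d, *q, *p;

        r = malloc( sizeof(double) * n );
        d = malloc( sizeof(double) * n );
        q = malloc( sizeof(double) * n );
        p = malloc( sizeof(double) * n );

        /* r = b - H*x, d = Hdia_inv * r (diagonal preconditioner), sig_new = r . d */
        matvec( A, x, q, n );
        b_norm = 0.0;
        sig_new = 0.0;
        for ( j = 0; j < n; ++j )
        {
            r[j] = b[j] - q[j];
            d[j] = r[j] * Hdia_inv[j];
            b_norm += b[j] * b[j];
            sig_new += r[j] * d[j];
        }
        b_norm = sqrt( b_norm );

        for ( i = 1; i < max_iters && sqrt( sig_new ) / b_norm > tol; ++i )
        {
            matvec( A, d, q, n );

            tmp = 0.0;
            for ( j = 0; j < n; ++j )
            {
                tmp += d[j] * q[j];
            }
            alpha = sig_new / tmp;

            sig_old = sig_new;
            sig_new = 0.0;
            for ( j = 0; j < n; ++j )
            {
                x[j] += alpha * d[j];
                r[j] -= alpha * q[j];
                p[j] = r[j] * Hdia_inv[j];
                sig_new += r[j] * p[j];
            }

            beta = sig_new / sig_old;
            for ( j = 0; j < n; ++j )
            {
                d[j] = p[j] + beta * d[j];
            }
        }

        free( r );
        free( d );
        free( q );
        free( p );

        return i;
    }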
diff --git a/PG-PuReMD/src/lin_alg.h b/PG-PuReMD/src/lin_alg.h
index f401fb2d06177d16abec8a89638ccda8acbf1958..3663978e4b6cc412a1b9d8fe2cc613443383c3fc 100644
--- a/PG-PuReMD/src/lin_alg.h
+++ b/PG-PuReMD/src/lin_alg.h
@@ -24,23 +24,32 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int GMRES( reax_system*, storage*, sparse_matrix*,
-           real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
 int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
-                       real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
 int dual_CG( reax_system*, storage*, sparse_matrix*,
-             rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
+
 int CG( reax_system*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes*, FILE* );
+
 int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
+        sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
+
 int sCG( reax_system*, storage*, sparse_matrix*,
-         real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
+#ifdef __cplusplus
+}
+#endif
 
-//CUDA Functions
-int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*,
-                  rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
-int Cuda_CG( reax_system*, storage*, sparse_matrix*,
-             real*, real, real*, mpi_datatypes*, FILE* );
 
 #endif
diff --git a/PG-PuReMD/src/list.c b/PG-PuReMD/src/list.c
index 05213cb32663ff54976cc929516bf1eaab1641cc..69736afbbee10a1fdd67d1399166613abaa21112 100644
--- a/PG-PuReMD/src/list.c
+++ b/PG-PuReMD/src/list.c
@@ -22,11 +22,11 @@
 #include "reax_types.h"
 
 #if defined(PURE_REAX)
-#include "list.h"
-#include "tool_box.h"
+  #include "list.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_list.h"
-#include "reax_tool_box.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
 #endif
 
 
diff --git a/PG-PuReMD/src/list.h b/PG-PuReMD/src/list.h
index 1f29f5f832f780b23565104e4003a29467cfc867..df6ec82f24b9002f51e0b93a6461a0d7ca14bbeb 100644
--- a/PG-PuReMD/src/list.h
+++ b/PG-PuReMD/src/list.h
@@ -24,17 +24,21 @@
 
 #include "reax_types.h"
 
+
 #ifdef _cplusplus
 extern "C" {
 #endif
 
-
 void Print_List( reax_list* );
 
 void Make_List( int, int, int, reax_list* );
 
 void Delete_List( reax_list* );
 
+#ifdef _cplusplus
+}
+#endif
+
 #if defined(LAMMPS_REAX) || defined(PURE_REAX)
 static inline int Num_Entries( int i, reax_list *l )
 {
@@ -60,12 +64,7 @@ static inline void Set_End_Index( int i, int val, reax_list *l )
 {
     l->end_index[i] = val;
 }
-
 #endif
 
 
-#ifdef _cplusplus
-}
-#endif
-
 #endif
diff --git a/PG-PuReMD/src/lookup.c b/PG-PuReMD/src/lookup.c
index 2c6652f9e5599d5cef27e8e16ed4a17066015a6d..b071ea89cf94a895221862043feada0fbf928b13 100644
--- a/PG-PuReMD/src/lookup.c
+++ b/PG-PuReMD/src/lookup.c
@@ -21,12 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
-#ifdef HAVE_CUDA
-  #include "cuda_lookup.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "lookup.h"
   #include "nonbonded.h"
@@ -37,6 +31,12 @@
   #include "reax_tool_box.h"
 #endif
 
+#include "index_utils.h"
+
+#ifdef HAVE_CUDA
+  #include "cuda/cuda_lookup.h"
+#endif
+
 
 /* Fills solution into x. Warning: will modify c and d! */
 void Tridiagonal_Solve( const real *a, const real *b,
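The lookup.c hunk above ends at the declaration of Tridiagonal_Solve, whose comment warns that c and d are modified in place; that is the usual Thomas-algorithm forward sweep. A sketch of that algorithm follows; since the full parameter list is not visible in this hunk, the roles assumed here (a = sub-diagonal, b = main diagonal, c = super-diagonal, d = right-hand side, x = solution) are an assumption.

    /* Thomas algorithm for a tridiagonal system
     *   a[i]*x[i-1] + b[i]*x[i] + c[i]*x[i+1] = d[i],  i = 0 .. n-1
     * (a[0] and c[n-1] unused).  Like the routine above, it overwrites
     * c and d during the forward sweep. */
    static void tridiagonal_solve_sketch( const double *a, const double *b,
            double *c, double *d, double *x, int n )
    {
        int i;
        double m;

        /* forward elimination */
        c[0] /= b[0];
        d[0] /= b[0];
        for ( i = 1; i < n; ++i )
        {
            m = b[i] - a[i] * c[i - 1];
            c[i] /= m;
            d[i] = (d[i] - a[i] * d[i - 1]) / m;
        }

        /* back substitution */
        x[n - 1] = d[n - 1];
        for ( i = n - 2; i >= 0; --i )
        {
            x[i] = d[i] - c[i] * x[i + 1];
        }
    }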
diff --git a/PG-PuReMD/src/lookup.h b/PG-PuReMD/src/lookup.h
index f6e45bd17d6b3eb3bd0f723c9888244b5767e686..4db34ce0234f86fed309f51909ec8e4ca070c37c 100644
--- a/PG-PuReMD/src/lookup.h
+++ b/PG-PuReMD/src/lookup.h
@@ -26,7 +26,17 @@
 
 //extern LR_lookup_table **LR;
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Lookup_Tables( reax_system*, control_params*, real *,
-                        mpi_datatypes*, char* );
+        mpi_datatypes*, char* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/multi_body.c b/PG-PuReMD/src/multi_body.c
index b480d3bb2bcc89c2ff20dd4688c7f7a71599f121..aab4957d0ae3923f8278f91f88ffc49eb0180b2d 100644
--- a/PG-PuReMD/src/multi_body.c
+++ b/PG-PuReMD/src/multi_body.c
@@ -32,6 +32,7 @@
   #include "reax_list.h"
   #include "reax_vector.h"
 #endif
+
 #include "index_utils.h"
 
 
diff --git a/PG-PuReMD/src/multi_body.h b/PG-PuReMD/src/multi_body.h
index aaed59e559d21760354544453286202a6deaef38..9cc865b4b1601f1efc4f0cb92006733025b2c10e 100644
--- a/PG-PuReMD/src/multi_body.h
+++ b/PG-PuReMD/src/multi_body.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Atom_Energy( reax_system*, control_params*, simulation_data*,
-                  storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/neighbors.c b/PG-PuReMD/src/neighbors.c
index 753ecc36d0754d81bc6530271a2c7a396c3cbfad..e938329a6728f57d178d530ee8bb996447fa8af7 100644
--- a/PG-PuReMD/src/neighbors.c
+++ b/PG-PuReMD/src/neighbors.c
@@ -19,14 +19,16 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "neighbors.h"
+
+#include "index_utils.h"
 #include "io_tools.h"
 #include "list.h"
 #include "tool_box.h"
 #include "vector.h"
 
-#include "index_utils.h"
-
 
 int compare_far_nbrs( const void *p1, const void *p2 )
 {
diff --git a/PG-PuReMD/src/neighbors.h b/PG-PuReMD/src/neighbors.h
index 0a1e3daf289883268e77fbefd7f6a24deaa582dd..37c3642b7d243d472dd0eb531c2b89ce6a94a06a 100644
--- a/PG-PuReMD/src/neighbors.h
+++ b/PG-PuReMD/src/neighbors.h
@@ -31,8 +31,18 @@
                      int, int*, int*, int*, int,
                      int, int, real, rvec, ivec );*/
 
-void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*,
-                              reax_list** );
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*, reax_list** );
+
 int Estimate_NumNeighbors( reax_system*, reax_list** );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/nonbonded.c b/PG-PuReMD/src/nonbonded.c
index 8edd2b11c690e9c26f8d5d4393c96fd97dc6b8a1..e073ec6252f2b76cefbfe5e10bb5e3c43d9e006c 100644
--- a/PG-PuReMD/src/nonbonded.c
+++ b/PG-PuReMD/src/nonbonded.c
@@ -20,7 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
   #include "nonbonded.h"
   #include "bond_orders.h"
@@ -34,10 +34,12 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void vdW_Coulomb_Energy( reax_system *system, control_params *control,
-                         simulation_data *data, storage *workspace,
-                         reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, j, pj, natoms;
     int start_i, end_i, orig_i, orig_j;
diff --git a/PG-PuReMD/src/nonbonded.h b/PG-PuReMD/src/nonbonded.h
index 81613be54f9581c64461ba9f1a3e2002403d63e0..45137bf894c5726b8dbaf1ab9b33905581aed4a4 100644
--- a/PG-PuReMD/src/nonbonded.h
+++ b/PG-PuReMD/src/nonbonded.h
@@ -24,14 +24,24 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
-                         storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
 
 void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*,
-                                   simulation_data*, storage*,
-                                   reax_list**, output_controls* );
+        simulation_data*, storage*, reax_list**, output_controls* );
 
 void Compute_Polarization_Energy( reax_system*, simulation_data* );
 
 void LR_vdW_Coulomb( reax_system*, real *, int, int, real, LR_data* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 4d677687a4f8b6d3d58ab8bbea8f86f1d175fcd6..30c2372227d3ce6d7fe3e81e0e4789f1fe61537a 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -40,13 +40,13 @@
 #include "vector.h"
 
 #ifdef HAVE_CUDA
-  #include "cuda_copy.h"
-  #include "cuda_environment.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_post_evolve.h"
-  #include "cuda_reset_tools.h"
-  #include "cuda_utils.h"
-  #include "cuda_validation.h"
+  #include "cuda/cuda_copy.h"
+  #include "cuda/cuda_environment.h"
+  #include "cuda/cuda_neighbors.h"
+  #include "cuda/cuda_post_evolve.h"
+  #include "cuda/cuda_reset_tools.h"
+  #include "cuda/cuda_utils.h"
+  #include "cuda/cuda_validation.h"
 #endif
 
 evolve_function Evolve;
@@ -156,25 +156,6 @@ int Cuda_Post_Evolve( reax_system* system, control_params* control,
 #endif
 
 
-#ifdef HAVE_CUDA
-void init_blocks( reax_system *system )
-{
-    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
-    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
-
-    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
-    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
-
-    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
-
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
-            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
-#endif
-}
-#endif
-
-
 static void usage( char* argv[] )
 {
     fprintf( stderr, "usage: ./%s geometry ffield control\n", argv[0] );
diff --git a/PG-PuReMD/src/random.c b/PG-PuReMD/src/random.c
index 2811a6b50caeb21c8d0b0f3b1af2e6a6d5c539be..ffe55458a0c3c82efc8305e670d3f819c4470447 100644
--- a/PG-PuReMD/src/random.c
+++ b/PG-PuReMD/src/random.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "random.h"
 
 /* System random number generator uses the linear congruential method with
diff --git a/PG-PuReMD/src/random.h b/PG-PuReMD/src/random.h
index a3ce35265758ec136e0994cf439d28a57d068183..66a5d59d18d10643be1661a33719b86624717b6f 100644
--- a/PG-PuReMD/src/random.h
+++ b/PG-PuReMD/src/random.h
@@ -24,19 +24,28 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* The system random number generator uses the linear congruential method
    with a large period to generate pseudo-random numbers. The function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-double Random(double);
+double Random( double );
 
 /* This function seeds the system pseudo-random number generator with the
    current time. Use this function once at the beginning to initialize
    the system */
-void Randomize();
+void Randomize( );
 
 /* GRandom returns a random number drawn from a Gaussian distribution
    with the given mean and standard deviation "sigma" */
-double GRandom(double, double);
+double GRandom( double, double );
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif
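
For context on the interface above, a self-contained sketch of one possible behavior for Random, Randomize, and GRandom, using rand()/srand() as the stand-in system generator and a Box-Muller transform for the Gaussian variate; the project's real implementations live in random.c and may differ:

/* illustrative only; not the implementations from random.c */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

static double Random_sketch( double range )
{
    /* scale the raw integer draw so that 0 <= result < range */
    return ( (double) rand( ) / ((double) RAND_MAX + 1.0) ) * range;
}

static void Randomize_sketch( void )
{
    /* seed once with the current time */
    srand( (unsigned int) time( NULL ) );
}

static double GRandom_sketch( double mean, double sigma )
{
    /* Box-Muller transform: one common way to build a Gaussian variate,
     * not necessarily the construction used in random.c */
    const double pi = 3.14159265358979323846;
    double u1 = Random_sketch( 1.0 );
    double u2 = Random_sketch( 1.0 );

    return mean + sigma * sqrt( -2.0 * log( 1.0 - u1 ) ) * cos( 2.0 * pi * u2 );
}

int main( void )
{
    Randomize_sketch( );
    printf( "uniform: %f  gaussian: %f\n",
            Random_sketch( 10.0 ), GRandom_sketch( 0.0, 1.0 ) );
    return 0;
}
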
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index c39277b85155f34feed27b2658183bde0614589a..38810bd6e6bb43d38f7e20ac47d69f1241cc10f6 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -96,6 +96,14 @@
 #define FABS   fabs
 #define FMOD   fmod
 
+/* transcendental constant pi */
+#if defined(M_PI)
+  /* GNU C library (libc), defined in math.h */
+  #define PI (M_PI)
+#else
+  #define PI            3.14159265
+#endif
+
 #define SQR(x)        ((x)*(x))
 #define CUBE(x)       ((x)*(x)*(x))
 #define DEG2RAD(a)    ((a)*PI/180.0)
@@ -104,13 +112,6 @@
 #define MIN(x,y)      (((x) < (y)) ? (x) : (y))
 #define MAX3(x,y,z)   MAX( MAX(x,y), z)
 
-/* transcendental constant pi */
-#if defined(M_PI)
-  /* GNU C library (libc), defined in math.h */
-  #define PI (M_PI)
-#else
-  #define PI            3.14159265
-#endif
 /* ??? */
 #define C_ele          332.06371
 /* ??? */
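
The relocated block prefers the math.h constant M_PI when the toolchain provides it and otherwise falls back to a literal, and the angle-conversion macros such as DEG2RAD expand against whatever PI resolves to. A small standalone check of that pattern; the fallback literal mirrors the one above:

/* standalone check of the M_PI-with-fallback pattern and DEG2RAD */
#include <stdio.h>
#include <math.h>

#if defined(M_PI)
  #define PI (M_PI)
#else
  #define PI 3.14159265
#endif

#define DEG2RAD(a) ((a) * PI / 180.0)

int main( void )
{
    /* 90 degrees should come back as pi/2 radians */
    printf( "PI     = %.12f\n", (double) PI );
    printf( "90 deg = %.12f rad\n", DEG2RAD( 90.0 ) );
    printf( "PI / 2 = %.12f\n", PI / 2.0 );

    return 0;
}
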
diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c
index a605cc7909786a8b58d4f2a7fb09d8bd02788fda..c3778145e1913b16e5299bd5e3c2cf5f3f263b5f 100644
--- a/PG-PuReMD/src/reset_tools.c
+++ b/PG-PuReMD/src/reset_tools.c
@@ -21,8 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
 #if defined(PURE_REAX)
   #include "reset_tools.h"
   #include "list.h"
@@ -35,6 +33,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Reset_Atoms( reax_system* system, control_params *control )
 {
diff --git a/PG-PuReMD/src/reset_tools.h b/PG-PuReMD/src/reset_tools.h
index 34f3876034f4469e242b8526efe427ae8e8c32e1..001b7f578d33b4fdb749a558d5d40a55e3e49beb 100644
--- a/PG-PuReMD/src/reset_tools.h
+++ b/PG-PuReMD/src/reset_tools.h
@@ -24,11 +24,11 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 void Reset_Pressures( simulation_data* );
 
 void Reset_Simulation_Data( simulation_data* );
@@ -49,9 +49,9 @@ void Reset( reax_system*, control_params*, simulation_data*, storage*, reax_list
 void Reset_Test_Forces( reax_system*, storage* );
 #endif
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c
index 967e025d47989f3f91b385d39fbb55226163f14a..6b8ddcdffc0e257eb57e0d4a00d43cf4734f379e 100644
--- a/PG-PuReMD/src/restart.c
+++ b/PG-PuReMD/src/restart.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "restart.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "tool_box.h"
diff --git a/PG-PuReMD/src/restart.h b/PG-PuReMD/src/restart.h
index 39a5dcd5a149208a58f5085507bb198895d7e47d..3d13a5a17256457829224a20f32c4d9f7305a06b 100644
--- a/PG-PuReMD/src/restart.h
+++ b/PG-PuReMD/src/restart.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 #define RESTART_HEADER "%8d%12d%8.3f%8.3f%8.3f%8.3f%8.3f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n"
 #define RESTART_HEADER_LINE_LEN 200
 /* step, system->bigN, data->therm.T, data->therm.xi,
@@ -39,16 +40,26 @@
 #define READ_RESTART_HEADER " %d %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf"
 #define READ_RESTART_LINE " %d %d %s %lf %lf %lf %lf %lf %lf"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Write_Binary_Restart( reax_system*, control_params*,
-                           simulation_data*, output_controls*, mpi_datatypes* );
+        simulation_data*, output_controls*, mpi_datatypes* );
 
 void Write_Restart( reax_system*, control_params*,
-                    simulation_data*, output_controls*, mpi_datatypes* );
+        simulation_data*, output_controls*, mpi_datatypes* );
 
 void Read_Binary_Restart( char*, reax_system*, control_params*,
-                          simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
 
 void Read_Restart( char*, reax_system*, control_params*,
-                   simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
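
The READ_RESTART_HEADER format above expects two integer fields followed by fourteen floating-point fields, matching the layout written by RESTART_HEADER. A minimal parsing sketch; the names step, bigN, and x[] are placeholders for the fields listed in the comment accompanying RESTART_HEADER, and the sample line holds illustrative values only:

/* sketch of parsing one restart header line with the project's scan format */
#include <stdio.h>

#define READ_RESTART_HEADER " %d %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf"

int main( void )
{
    /* 2 integer fields followed by 14 real fields */
    const char *line = "100 6540 300.0 0.0 0.0 0.0 0.0 "
            "40.0 0.0 0.0 0.0 40.0 0.0 0.0 0.0 40.0";
    int step, bigN, n;
    double x[14];

    n = sscanf( line, READ_RESTART_HEADER,
            &step, &bigN, &x[0], &x[1], &x[2], &x[3], &x[4], &x[5], &x[6],
            &x[7], &x[8], &x[9], &x[10], &x[11], &x[12], &x[13] );

    /* 16 successful conversions means the header line was complete */
    printf( "parsed %d fields: step = %d, bigN = %d\n", n, step, bigN );

    return 0;
}
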
diff --git a/PG-PuReMD/src/system_props.c b/PG-PuReMD/src/system_props.c
index e2852a4b6d8b0d906291ede5e9630ca8969dffdd..ea4465c58f54635963b0da5c383833fc2aa094f7 100644
--- a/PG-PuReMD/src/system_props.c
+++ b/PG-PuReMD/src/system_props.c
@@ -21,10 +21,6 @@
 
 #include "reax_types.h"
 
-#ifdef HAVE_CUDA
-  #include "cuda_system_props.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "system_props.h"
   #include "tool_box.h"
@@ -35,6 +31,10 @@
   #include "reax_vector.h"
 #endif
 
+#ifdef HAVE_CUDA
+  #include "cuda/cuda_system_props.h"
+#endif
+
 
 void Temperature_Control( control_params *control, simulation_data *data )
 {
@@ -83,29 +83,6 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
         data->my_en.e_kin += 0.5 * rvec_Dot( p, system->my_atoms[i].v );
     }
 
-    MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
-                   1, MPI_DOUBLE, MPI_SUM, comm );
-
-    data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
-
-    // avoid T being an absolute zero, might cause F.P.E!
-    if ( FABS(data->therm.T) < ALMOST_ZERO )
-        data->therm.T = ALMOST_ZERO;
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
-        MPI_Comm comm )
-{
-    int i;
-    rvec p;
-    real m;
-
-    data->my_en.e_kin = 0.0;
-
-    dev_compute_kinetic_energy( system, data, &data->my_en.e_kin );
-
     MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
             1, MPI_DOUBLE, MPI_SUM, comm );
 
@@ -117,7 +94,6 @@ void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
         data->therm.T = ALMOST_ZERO;
     }
 }
-#endif
 
 
 void Compute_System_Energy( reax_system *system, simulation_data *data,
@@ -130,7 +106,7 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
 
 #ifdef HAVE_CUDA
     //Cuda Wrapper here
-    dev_sync_simulation_data ( data );
+    dev_sync_simulation_data( data );
 #endif
 
     my_en[0] = data->my_en.e_bond;
@@ -205,23 +181,6 @@ void Compute_Total_Mass( reax_system *system, simulation_data *data,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
-        MPI_Comm comm  )
-{
-    int  i;
-    real tmp;
-
-    //compute local total mass of the system
-    dev_compute_total_mass( system, &tmp );
-
-    MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
-
-    data->inv_M = 1. / data->M;
-}
-#endif
-
-
 void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         mpi_datatypes *mpi_data, MPI_Comm comm )
 {
@@ -342,112 +301,6 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
-        mpi_datatypes *mpi_data, MPI_Comm comm )
-{
-    int i;
-    real m, det; //xx, xy, xz, yy, yz, zz;
-    real tmp_mat[6], tot_mat[6];
-    rvec my_xcm, my_vcm, my_amcm, my_avcm;
-    rvec tvec, diff;
-    rtensor mat, inv;
-
-    rvec_MakeZero( my_xcm );  // position of CoM
-    rvec_MakeZero( my_vcm );  // velocity of CoM
-    rvec_MakeZero( my_amcm ); // angular momentum of CoM
-    rvec_MakeZero( my_avcm ); // angular velocity of CoM
-
-    /* Compute the position, vel. and ang. momentum about the centre of mass */
-    dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm );
-
-    MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-    MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-    MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-
-    rvec_Scale( data->xcm, data->inv_M, data->xcm );
-    rvec_Scale( data->vcm, data->inv_M, data->vcm );
-    rvec_Cross( tvec, data->xcm, data->vcm );
-    rvec_ScaledAdd( data->amcm, -data->M, tvec );
-    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-
-    /* Calculate and then invert the inertial tensor */
-    for ( i = 0; i < 6; ++i )
-    {
-        tmp_mat[i] = 0;
-    }
-
-    dev_compute_inertial_tensor( system, tmp_mat, my_xcm );
-
-    MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
-
-    if ( system->my_rank == MASTER_NODE )
-    {
-        mat[0][0] = tot_mat[3] + tot_mat[5];  // yy + zz;
-        mat[0][1] = mat[1][0] = -tot_mat[1];  // -xy;
-        mat[0][2] = mat[2][0] = -tot_mat[2];  // -xz;
-        mat[1][1] = tot_mat[0] + tot_mat[5];  // xx + zz;
-        mat[2][1] = mat[1][2] = -tot_mat[4];  // -yz;
-        mat[2][2] = tot_mat[0] + tot_mat[3];  // xx + yy;
-
-        /* invert the inertial tensor */
-        det = ( mat[0][0] * mat[1][1] * mat[2][2] +
-                mat[0][1] * mat[1][2] * mat[2][0] +
-                mat[0][2] * mat[1][0] * mat[2][1] ) -
-              ( mat[0][0] * mat[1][2] * mat[2][1] +
-                mat[0][1] * mat[1][0] * mat[2][2] +
-                mat[0][2] * mat[1][1] * mat[2][0] );
-
-        inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-        inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-        inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-        inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-        inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-        inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-        inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-        inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-        inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-
-        if ( det > ALMOST_ZERO )
-        {
-            rtensor_Scale( inv, 1. / det, inv );
-        }
-        else
-        {
-            rtensor_MakeZero( inv );
-        }
-
-        /* Compute the angular velocity about the centre of mass */
-        rtensor_MatVec( data->avcm, inv, data->amcm );
-    }
-
-    MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm );
-
-    /* Compute the rotational energy */
-    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-
-#if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
-             data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
-             data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
-             data->amcm[0], data->amcm[1], data->amcm[2] );
-    /* fprintf( stderr, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2],
-       mat[1][0], mat[1][1], mat[1][2],
-       mat[2][0], mat[2][1], mat[2][2] );
-       fprintf( stderr, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2],
-       inv[1][0], inv[1][1], inv[1][2],
-       inv[2][0], inv[2][1], inv[2][2] ); */
-    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
-             data->avcm[0], data->avcm[1], data->avcm[2] );
-#endif
-}
-#endif
-
-
 /* IMPORTANT: This function assumes that the current kinetic energy of
  * the system is already computed
  *
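
With the CUDA variants folded out of this file, the surviving host path still reduces the per-process kinetic energy with MPI_Allreduce and converts it to a temperature via T = 2 E_kin / (N_f K_B), clamped away from exactly zero. A serial sketch of that step; K_B_SKETCH, ALMOST_ZERO_SKETCH, and the 3N degrees-of-freedom count are placeholders rather than the constants from reax_types.h, and the MPI reduction is replaced by a plain local sum:

/* serial sketch of the kinetic-energy -> temperature conversion */
#include <math.h>
#include <stdio.h>

#define K_B_SKETCH 0.831687        /* assumed Boltzmann constant in code units */
#define ALMOST_ZERO_SKETCH 1.0e-10

int main( void )
{
    const int n = 3;                              /* local atom count */
    const double m[] = { 12.0, 1.0, 16.0 };       /* masses */
    const double v[][3] = { { 0.01, 0.00, 0.00 },
                            { 0.00, 0.02, 0.00 },
                            { 0.00, 0.00, 0.01 } };
    double e_kin = 0.0, T;
    int i;

    /* accumulate 0.5 * m * |v|^2, mirroring the p . v sum in
     * Compute_Kinetic_Energy (unit-conversion factors omitted) */
    for ( i = 0; i < n; ++i )
    {
        e_kin += 0.5 * m[i] * ( v[i][0] * v[i][0]
                + v[i][1] * v[i][1] + v[i][2] * v[i][2] );
    }

    /* T = 2 E_kin / (N_f K_B), with N_f taken as 3N for this sketch */
    T = ( 2.0 * e_kin ) / ( 3.0 * n * K_B_SKETCH );

    /* avoid an exact zero that could trigger a floating-point exception later */
    if ( fabs( T ) < ALMOST_ZERO_SKETCH )
    {
        T = ALMOST_ZERO_SKETCH;
    }

    printf( "e_kin = %g, T = %g\n", e_kin, T );

    return 0;
}
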
diff --git a/PG-PuReMD/src/system_props.h b/PG-PuReMD/src/system_props.h
index 5efff3c561019e4dbcad2b1de338f6d79f5d85b1..f04a9590ca20da6a020c7ad9ed0ad8023f8c3024 100644
--- a/PG-PuReMD/src/system_props.h
+++ b/PG-PuReMD/src/system_props.h
@@ -24,6 +24,11 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Temperature_Control( control_params*, simulation_data* );
 
 void Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
@@ -33,16 +38,16 @@ void Compute_System_Energy( reax_system*, simulation_data*, MPI_Comm );
 void Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
 
 void Compute_Center_of_Mass( reax_system*, simulation_data*,
-                             mpi_datatypes*, MPI_Comm );
+        mpi_datatypes*, MPI_Comm );
 
 void Compute_Pressure( reax_system*, control_params*,
-                       simulation_data*, mpi_datatypes* );
+        simulation_data*, mpi_datatypes* );
+
 //void Compute_Pressure( reax_system*, simulation_data* );
 
-//CUDA Functions
-void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
-void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
-void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
-                                  mpi_datatypes*, MPI_Comm );
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/tool_box.h b/PG-PuReMD/src/tool_box.h
index ecaee197789daf550357e479fdfb20ed245dc036..a1f55910ff34fe08aceb5a84aa7e270a855d9291 100644
--- a/PG-PuReMD/src/tool_box.h
+++ b/PG-PuReMD/src/tool_box.h
@@ -29,7 +29,6 @@
 extern "C" {
 #endif
 
-
 /* from comm_tools.h */
 int SumScan( int, int, int, MPI_Comm );
 
@@ -76,7 +75,6 @@ void *scalloc( size_t, size_t, const char* );
 
 void sfree( void*, const char* );
 
-
 #ifdef __cplusplus
 }
 #endif
@@ -227,4 +225,5 @@ static inline real DistSqr_to_Special_Point( rvec cp, rvec x )
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/torsion_angles.c b/PG-PuReMD/src/torsion_angles.c
index 58e71f4e2fb1a7f8b7cd83ba075a89e9eaf24c0d..29cfb4444a97ea59a91b6f1fac2c5ccc55877a4c 100644
--- a/PG-PuReMD/src/torsion_angles.c
+++ b/PG-PuReMD/src/torsion_angles.c
@@ -21,7 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
 #if defined(PURE_REAX)
   #include "torsion_angles.h"
   #include "bond_orders.h"
@@ -36,6 +35,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 #define MIN_SINE 1e-10
 
 
diff --git a/PG-PuReMD/src/torsion_angles.h b/PG-PuReMD/src/torsion_angles.h
index d0762a4e2f693877da9e60a8bbc44ea2f5b714b0..454f06791e6ac3cd3fba970090ada1b07c515f6f 100644
--- a/PG-PuReMD/src/torsion_angles.h
+++ b/PG-PuReMD/src/torsion_angles.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Torsion_Angles( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index d561a45f370f3581b95a73796928a305b3cd57f4..b7ba111220dd8874726590c294cc72e07cb06b50 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -32,7 +32,7 @@
 #endif
 
 #ifdef HAVE_CUDA
-  #include "cuda_copy.h"
+  #include "cuda/cuda_copy.h"
 #endif
 
 
diff --git a/PG-PuReMD/src/traj.h b/PG-PuReMD/src/traj.h
index 8f09c4a79718dbd14701caebd9c3bde65e9dc4e2..13435ecbfd071f6c5ff55f18f368a736967fffc9 100644
--- a/PG-PuReMD/src/traj.h
+++ b/PG-PuReMD/src/traj.h
@@ -22,10 +22,8 @@
 #ifndef __TRAJ_H__
 #define __TRAJ_H__
 
-
 #include "reax_types.h"
 
-
 #define MAX_TRJ_LINE_LEN     120
 #define MAX_TRJ_BUFFER_SIZE  (MAX_TRJ_LINE_LEN * 100)
 
@@ -80,6 +78,10 @@ enum ANGLE_LINE_OPTS
 };
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Traj( reax_system*, control_params*, output_controls*, mpi_datatypes*, char* );
 
 int End_Traj( int, output_controls* );
@@ -87,5 +89,9 @@ int End_Traj( int, output_controls* );
 int Append_Frame( reax_system*, control_params*, simulation_data*, reax_list**,
         output_controls*, mpi_datatypes* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/valence_angles.h b/PG-PuReMD/src/valence_angles.h
index c7a56eaa45fa4c188e04ef88010e684f940a7f81..1958b0cbb5784d29a2e0af9024f099bcb57d3bda 100644
--- a/PG-PuReMD/src/valence_angles.h
+++ b/PG-PuReMD/src/valence_angles.h
@@ -25,6 +25,10 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Valence_Angles( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls* );
 
@@ -32,5 +36,9 @@ void Calculate_Theta( rvec, real, rvec, real, real*, real* );
 
 void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/vector.h b/PG-PuReMD/src/vector.h
index adfe7da2af2161c14f4383024ff8bf2adc6fd026..14250909570368ae6f17e0433e80bce8c34d26e4 100644
--- a/PG-PuReMD/src/vector.h
+++ b/PG-PuReMD/src/vector.h
@@ -136,6 +136,7 @@ CUDA_HOST_DEVICE static inline void rvec_Copy( rvec dest, rvec src )
     dest[2] = src[2];
 }
 
+
 CUDA_HOST_DEVICE static inline void rvec_Scale( rvec ret, real c, rvec v )
 {
     ret[0] = c * v[0];
@@ -497,8 +498,8 @@ CUDA_HOST_DEVICE static inline void rtensor_Transpose( rtensor ret, rtensor t )
 CUDA_HOST_DEVICE static inline real rtensor_Det( rtensor t )
 {
     return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-             t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-             t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+            t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+            t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
 }