From 5299a3154c6112e9f002a919442ed32ed8fc5b8f Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@msu.edu>
Date: Sun, 16 Jul 2017 23:19:55 -0400
Subject: [PATCH] PG-PuReMD: minor project refactoring. Move CUDA code into
 src/cuda/ and update header includes.

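This patch relocates the CUDA sources and headers from src/ into a new
src/cuda/ subdirectory and updates Makefile.am accordingly. Host-side C
files (charges.c, forces.c, init_md.c, integrate.c, lin_alg.c,
system_props.c) lose their HAVE_CUDA blocks, which are consolidated into
the corresponding src/cuda/*.cu files; center_mass.cu/.h are removed,
with their kernels presumably absorbed into src/cuda/cuda_system_props.cu.
Many headers additionally gain extern "C" guards and consistent include
ordering.

As a sketch of the new include convention (taken from
src/cuda/cuda_charges.h in this patch), CUDA headers now reach host
headers through relative "../" paths and expose C-callable entry points
inside extern "C" guards:

    #include "../reax_types.h"

    #ifdef __cplusplus
    extern "C" {
    #endif

    /* host-callable driver for the QEq charge solve on the device */
    void Cuda_QEq( reax_system*, control_params*, simulation_data*,
            storage*, output_controls*, mpi_datatypes* );

    #ifdef __cplusplus
    }
    #endif
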
---
 PG-PuReMD/Makefile.am                         |   38 +-
 PG-PuReMD/src/allocate.c                      |    3 +-
 PG-PuReMD/src/allocate.h                      |    4 +-
 PG-PuReMD/src/analyze.c                       |    3 +
 PG-PuReMD/src/analyze.h                       |   12 +-
 PG-PuReMD/src/basic_comm.h                    |   20 +-
 PG-PuReMD/src/bond_orders.c                   |    1 +
 PG-PuReMD/src/bond_orders.h                   |   34 +-
 PG-PuReMD/src/bonds.c                         |   28 +-
 PG-PuReMD/src/bonds.h                         |   13 +-
 PG-PuReMD/src/box.c                           |    3 +
 PG-PuReMD/src/box.h                           |   35 +-
 PG-PuReMD/src/center_mass.cu                  |  551 --------
 PG-PuReMD/src/center_mass.h                   |   49 -
 PG-PuReMD/src/charges.c                       |  102 +-
 PG-PuReMD/src/charges.h                       |    9 +-
 PG-PuReMD/src/comm_tools.c                    |    2 +
 PG-PuReMD/src/comm_tools.h                    |   23 +-
 PG-PuReMD/src/control.h                       |   10 +
 PG-PuReMD/src/{ => cuda}/cuda_allocate.cu     |    8 +-
 PG-PuReMD/src/{ => cuda}/cuda_allocate.h      |    2 +-
 PG-PuReMD/src/{ => cuda}/cuda_bond_orders.cu  |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_bond_orders.h   |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_bonds.cu        |    7 +-
 PG-PuReMD/src/{ => cuda}/cuda_bonds.h         |   18 +-
 PG-PuReMD/src/{ => cuda}/cuda_charges.cu      |   95 +-
 PG-PuReMD/src/{ => cuda}/cuda_charges.h       |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_copy.cu         |    3 +-
 PG-PuReMD/src/{ => cuda}/cuda_copy.h          |   10 +-
 PG-PuReMD/src/{ => cuda}/cuda_environment.cu  |    0
 PG-PuReMD/src/{ => cuda}/cuda_environment.h   |    8 +-
 PG-PuReMD/src/{ => cuda}/cuda_forces.cu       |  227 +++-
 PG-PuReMD/src/{ => cuda}/cuda_forces.h        |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_helpers.h       |    2 +-
 .../src/{ => cuda}/cuda_hydrogen_bonds.cu     |    8 +-
 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h      |   48 +
 PG-PuReMD/src/cuda/cuda_init_md.cu            |  412 ++++++
 PG-PuReMD/src/cuda/cuda_init_md.h             |   22 +
 PG-PuReMD/src/cuda/cuda_integrate.cu          |  249 ++++
 PG-PuReMD/src/{ => cuda}/cuda_integrate.h     |   10 +-
 PG-PuReMD/src/cuda/cuda_lin_alg.cu            | 1113 +++++++++++++++++
 PG-PuReMD/src/{ => cuda}/cuda_lin_alg.h       |   39 +-
 PG-PuReMD/src/{ => cuda}/cuda_list.cu         |    9 +-
 PG-PuReMD/src/{ => cuda}/cuda_list.h          |    4 +-
 PG-PuReMD/src/{ => cuda}/cuda_lookup.cu       |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_lookup.h        |    6 +-
 PG-PuReMD/src/{ => cuda}/cuda_multi_body.cu   |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_multi_body.h    |   25 +-
 PG-PuReMD/src/{ => cuda}/cuda_neighbors.cu    |    8 +-
 PG-PuReMD/src/{ => cuda}/cuda_neighbors.h     |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_nonbonded.cu    |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_nonbonded.h     |   14 +-
 PG-PuReMD/src/{ => cuda}/cuda_post_evolve.cu  |    5 +-
 PG-PuReMD/src/{ => cuda}/cuda_post_evolve.h   |    6 +-
 PG-PuReMD/src/{ => cuda}/cuda_reduction.cu    |    6 +-
 PG-PuReMD/src/{ => cuda}/cuda_reduction.h     |   15 +-
 PG-PuReMD/src/{ => cuda}/cuda_reset_tools.cu  |    2 +-
 PG-PuReMD/src/{ => cuda}/cuda_reset_tools.h   |    6 +-
 PG-PuReMD/src/{ => cuda}/cuda_shuffle.h       |    3 +-
 PG-PuReMD/src/cuda/cuda_system_props.cu       | 1026 +++++++++++++++
 PG-PuReMD/src/{ => cuda}/cuda_system_props.h  |   15 +-
 .../src/{ => cuda}/cuda_torsion_angles.cu     |    7 +-
 .../src/{ => cuda}/cuda_torsion_angles.h      |   32 +-
 PG-PuReMD/src/{ => cuda}/cuda_utils.cu        |   17 +
 PG-PuReMD/src/{ => cuda}/cuda_utils.h         |   32 +-
 .../src/{ => cuda}/cuda_valence_angles.cu     |    5 +-
 .../src/{ => cuda}/cuda_valence_angles.h      |    6 +-
 PG-PuReMD/src/{ => cuda}/cuda_validation.cu   |    9 +-
 PG-PuReMD/src/{ => cuda}/cuda_validation.h    |   18 +-
 PG-PuReMD/src/cuda_hydrogen_bonds.h           |   66 -
 PG-PuReMD/src/cuda_init_md.cu                 |   14 -
 PG-PuReMD/src/cuda_init_md.h                  |   15 -
 PG-PuReMD/src/cuda_integrate.cu               |  105 --
 PG-PuReMD/src/cuda_lin_alg.cu                 |  624 ---------
 PG-PuReMD/src/cuda_system_props.cu            |  406 ------
 PG-PuReMD/src/ffield.c                        |    3 +-
 PG-PuReMD/src/ffield.h                        |   10 +
 PG-PuReMD/src/forces.c                        |  217 +---
 PG-PuReMD/src/forces.h                        |   11 +-
 PG-PuReMD/src/geo_tools.c                     |    3 +
 PG-PuReMD/src/geo_tools.h                     |   21 +-
 PG-PuReMD/src/grid.c                          |    6 +-
 PG-PuReMD/src/grid.h                          |   14 +
 PG-PuReMD/src/hydrogen_bonds.c                |    4 +-
 PG-PuReMD/src/hydrogen_bonds.h                |   12 +-
 PG-PuReMD/src/init_md.c                       |  377 ------
 PG-PuReMD/src/init_md.h                       |   15 +-
 PG-PuReMD/src/integrate.c                     |  147 +--
 PG-PuReMD/src/integrate.h                     |   13 +-
 PG-PuReMD/src/io_tools.c                      |    4 +-
 PG-PuReMD/src/io_tools.h                      |  107 +-
 PG-PuReMD/src/lin_alg.c                       |  503 +-------
 PG-PuReMD/src/lin_alg.h                       |   29 +-
 PG-PuReMD/src/list.c                          |    8 +-
 PG-PuReMD/src/list.h                          |   11 +-
 PG-PuReMD/src/lookup.c                        |   12 +-
 PG-PuReMD/src/lookup.h                        |   12 +-
 PG-PuReMD/src/multi_body.c                    |    1 +
 PG-PuReMD/src/multi_body.h                    |   12 +-
 PG-PuReMD/src/neighbors.c                     |    6 +-
 PG-PuReMD/src/neighbors.h                     |   14 +-
 PG-PuReMD/src/nonbonded.c                     |    8 +-
 PG-PuReMD/src/nonbonded.h                     |   16 +-
 PG-PuReMD/src/parallelreax.c                  |   33 +-
 PG-PuReMD/src/random.c                        |    2 +
 PG-PuReMD/src/random.h                        |   15 +-
 PG-PuReMD/src/reax_types.h                    |   15 +-
 PG-PuReMD/src/reset_tools.c                   |    4 +-
 PG-PuReMD/src/reset_tools.h                   |    4 +-
 PG-PuReMD/src/restart.c                       |    3 +
 PG-PuReMD/src/restart.h                       |   19 +-
 PG-PuReMD/src/system_props.c                  |  157 +--
 PG-PuReMD/src/system_props.h                  |   19 +-
 PG-PuReMD/src/tool_box.h                      |    3 +-
 PG-PuReMD/src/torsion_angles.c                |    3 +-
 PG-PuReMD/src/torsion_angles.h                |   12 +-
 PG-PuReMD/src/traj.c                          |    2 +-
 PG-PuReMD/src/traj.h                          |   10 +-
 PG-PuReMD/src/valence_angles.h                |    8 +
 PG-PuReMD/src/vector.h                        |    5 +-
 120 files changed, 3967 insertions(+), 3710 deletions(-)
 delete mode 100644 PG-PuReMD/src/center_mass.cu
 delete mode 100644 PG-PuReMD/src/center_mass.h
 rename PG-PuReMD/src/{ => cuda}/cuda_allocate.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_allocate.h (96%)
 rename PG-PuReMD/src/{ => cuda}/cuda_bond_orders.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_bond_orders.h (98%)
 rename PG-PuReMD/src/{ => cuda}/cuda_bonds.cu (98%)
 rename PG-PuReMD/src/{ => cuda}/cuda_bonds.h (69%)
 rename PG-PuReMD/src/{ => cuda}/cuda_charges.cu (68%)
 rename PG-PuReMD/src/{ => cuda}/cuda_charges.h (89%)
 rename PG-PuReMD/src/{ => cuda}/cuda_copy.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_copy.h (93%)
 rename PG-PuReMD/src/{ => cuda}/cuda_environment.cu (100%)
 rename PG-PuReMD/src/{ => cuda}/cuda_environment.h (56%)
 rename PG-PuReMD/src/{ => cuda}/cuda_forces.cu (89%)
 rename PG-PuReMD/src/{ => cuda}/cuda_forces.h (86%)
 rename PG-PuReMD/src/{ => cuda}/cuda_helpers.h (97%)
 rename PG-PuReMD/src/{ => cuda}/cuda_hydrogen_bonds.cu (99%)
 create mode 100644 PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
 create mode 100644 PG-PuReMD/src/cuda/cuda_init_md.cu
 create mode 100644 PG-PuReMD/src/cuda/cuda_init_md.h
 create mode 100644 PG-PuReMD/src/cuda/cuda_integrate.cu
 rename PG-PuReMD/src/{ => cuda}/cuda_integrate.h (86%)
 create mode 100644 PG-PuReMD/src/cuda/cuda_lin_alg.cu
 rename PG-PuReMD/src/{ => cuda}/cuda_lin_alg.h (52%)
 rename PG-PuReMD/src/{ => cuda}/cuda_list.cu (96%)
 rename PG-PuReMD/src/{ => cuda}/cuda_list.h (98%)
 rename PG-PuReMD/src/{ => cuda}/cuda_lookup.cu (98%)
 rename PG-PuReMD/src/{ => cuda}/cuda_lookup.h (56%)
 rename PG-PuReMD/src/{ => cuda}/cuda_multi_body.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_multi_body.h (58%)
 rename PG-PuReMD/src/{ => cuda}/cuda_neighbors.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_neighbors.h (95%)
 rename PG-PuReMD/src/{ => cuda}/cuda_nonbonded.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_nonbonded.h (79%)
 rename PG-PuReMD/src/{ => cuda}/cuda_post_evolve.cu (95%)
 rename PG-PuReMD/src/{ => cuda}/cuda_post_evolve.h (60%)
 rename PG-PuReMD/src/{ => cuda}/cuda_reduction.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_reduction.h (96%)
 rename PG-PuReMD/src/{ => cuda}/cuda_reset_tools.cu (98%)
 rename PG-PuReMD/src/{ => cuda}/cuda_reset_tools.h (94%)
 rename PG-PuReMD/src/{ => cuda}/cuda_shuffle.h (97%)
 create mode 100644 PG-PuReMD/src/cuda/cuda_system_props.cu
 rename PG-PuReMD/src/{ => cuda}/cuda_system_props.h (65%)
 rename PG-PuReMD/src/{ => cuda}/cuda_torsion_angles.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_torsion_angles.h (57%)
 rename PG-PuReMD/src/{ => cuda}/cuda_utils.cu (88%)
 rename PG-PuReMD/src/{ => cuda}/cuda_utils.h (80%)
 rename PG-PuReMD/src/{ => cuda}/cuda_valence_angles.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_valence_angles.h (98%)
 rename PG-PuReMD/src/{ => cuda}/cuda_validation.cu (99%)
 rename PG-PuReMD/src/{ => cuda}/cuda_validation.h (97%)
 delete mode 100644 PG-PuReMD/src/cuda_hydrogen_bonds.h
 delete mode 100644 PG-PuReMD/src/cuda_init_md.cu
 delete mode 100644 PG-PuReMD/src/cuda_init_md.h
 delete mode 100644 PG-PuReMD/src/cuda_integrate.cu
 delete mode 100644 PG-PuReMD/src/cuda_lin_alg.cu
 delete mode 100644 PG-PuReMD/src/cuda_system_props.cu

diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index b0c1c871..3b051035 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -34,25 +34,25 @@ include_HEADERS = src/reax_types.h src/index_utils.h \
 	src/integrate.h src/init_md.h
 
 if USE_CUDA
-bin_pg_puremd_SOURCES += src/cuda_utils.cu src/cuda_allocate.cu src/cuda_environment.cu \
-      src/cuda_system_props.cu src/cuda_reduction.cu src/center_mass.cu \
-      src/cuda_copy.cu src/cuda_reset_tools.cu src/cuda_list.cu \
-      src/cuda_neighbors.cu src/cuda_bond_orders.cu src/cuda_bonds.cu \
-      src/cuda_multi_body.cu src/cuda_valence_angles.cu \
-      src/cuda_torsion_angles.cu src/cuda_hydrogen_bonds.cu src/cuda_forces.cu \
-      src/cuda_charges.cu src/cuda_lin_alg.cu \
-      src/cuda_nonbonded.cu src/cuda_integrate.cu src/cuda_post_evolve.cu \
-      src/cuda_init_md.cu src/cuda_validation.cu src/cuda_lookup.cu
-include_HEADERS += src/cuda_helpers.h src/cuda_shuffle.h \
-      src/cuda_utils.h src/cuda_allocate.h src/cuda_environment.h \
-      src/cuda_system_props.h src/cuda_reduction.h src/center_mass.h \
-      src/cuda_copy.h src/cuda_reset_tools.h src/cuda_list.h \
-      src/cuda_neighbors.h src/cuda_bond_orders.h src/cuda_bonds.h \
-      src/cuda_multi_body.h src/cuda_valence_angles.h \
-      src/cuda_torsion_angles.h src/cuda_hydrogen_bonds.h src/cuda_forces.h \
-      src/cuda_charges.h src/cuda_lin_alg.h \
-      src/cuda_nonbonded.h src/cuda_integrate.h src/cuda_post_evolve.h \
-      src/cuda_init_md.h src/cuda_validation.h src/cuda_lookup.h
+bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \
+      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu \
+      src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_list.cu \
+      src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \
+      src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \
+      src/cuda/cuda_torsion_angles.cu src/cuda/cuda_hydrogen_bonds.cu src/cuda/cuda_forces.cu \
+      src/cuda/cuda_charges.cu src/cuda/cuda_lin_alg.cu \
+      src/cuda/cuda_nonbonded.cu src/cuda/cuda_integrate.cu src/cuda/cuda_post_evolve.cu \
+      src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
+include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
+      src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
+      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h \
+      src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_list.h \
+      src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \
+      src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \
+      src/cuda/cuda_torsion_angles.h src/cuda/cuda_hydrogen_bonds.h src/cuda/cuda_forces.h \
+      src/cuda/cuda_charges.h src/cuda/cuda_lin_alg.h \
+      src/cuda/cuda_nonbonded.h src/cuda/cuda_integrate.h src/cuda/cuda_post_evolve.h \
+      src/cuda/cuda_init_md.h src/cuda/cuda_validation.h src/cuda/cuda_lookup.h
 
 # dummy source to cause C linking
 nodist_EXTRA_bin_pg_puremd_SOURCES = src/dummy.c
diff --git a/PG-PuReMD/src/allocate.c b/PG-PuReMD/src/allocate.c
index 1d85b8f9..54614694 100644
--- a/PG-PuReMD/src/allocate.c
+++ b/PG-PuReMD/src/allocate.c
@@ -20,7 +20,6 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
 
 #if defined(PURE_REAX)
   #include "allocate.h"
@@ -36,6 +35,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 /* allocate space for my_atoms
    important: we cannot know the exact number of atoms that will fall into a
diff --git a/PG-PuReMD/src/allocate.h b/PG-PuReMD/src/allocate.h
index 5fd27315..a2876453 100644
--- a/PG-PuReMD/src/allocate.h
+++ b/PG-PuReMD/src/allocate.h
@@ -24,11 +24,11 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 int PreAllocate_Space( reax_system*, control_params*, storage* );
 
 void Allocate_System( reax_system*, int, int, char* );
@@ -53,9 +53,9 @@ void Deallocate_MPI_Buffers( mpi_datatypes * );
 void ReAllocate( reax_system*, control_params*, simulation_data*, storage*,
         reax_list**, mpi_datatypes* );
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/analyze.c b/PG-PuReMD/src/analyze.c
index 283d7e47..0f47ba48 100644
--- a/PG-PuReMD/src/analyze.c
+++ b/PG-PuReMD/src/analyze.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "analyze.h"
+
 #include "box.h"
 #include "list.h"
 #include "vector.h"
diff --git a/PG-PuReMD/src/analyze.h b/PG-PuReMD/src/analyze.h
index e4703341..a772dcb2 100644
--- a/PG-PuReMD/src/analyze.h
+++ b/PG-PuReMD/src/analyze.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Analysis( reax_system*, control_params*, simulation_data*, storage*,
-               reax_list**, output_controls*, mpi_datatypes* );
+        reax_list**, output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/basic_comm.h b/PG-PuReMD/src/basic_comm.h
index e1effc50..4d8f1c34 100644
--- a/PG-PuReMD/src/basic_comm.h
+++ b/PG-PuReMD/src/basic_comm.h
@@ -24,33 +24,43 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void real_packer( void*, mpi_out_data* );
+
 void rvec_packer( void*, mpi_out_data* );
+
 void rvec2_packer( void*, mpi_out_data* );
+
 void Dist(reax_system*, mpi_datatypes*, void*, MPI_Datatype, int, dist_packer);
 
 void real_unpacker( void*, void*, mpi_out_data* );
+
 void rvec_unpacker( void*, void*, mpi_out_data* );
+
 void rvec2_unpacker( void*, void*, mpi_out_data* );
+
 void Coll( reax_system*, mpi_datatypes*, void*, MPI_Datatype,
-           int, coll_unpacker );
+        int, coll_unpacker );
 
 real Parallel_Norm( real*, int, MPI_Comm );
+
 real Parallel_Dot( real*, real*, int, MPI_Comm );
+
 real Parallel_Vector_Acc( real*, int, MPI_Comm );
 
+#if defined(TEST_FORCES)
+void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
+
+void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
+#endif
 
 #ifdef __cplusplus
 }
 #endif
 
-#if defined(TEST_FORCES)
-void Coll_ids_at_Master( reax_system*, storage*, mpi_datatypes* );
-void Coll_rvecs_at_Master( reax_system*, storage*, mpi_datatypes*, rvec* );
-#endif
 
 #endif
diff --git a/PG-PuReMD/src/bond_orders.c b/PG-PuReMD/src/bond_orders.c
index 4e023e97..da23e002 100644
--- a/PG-PuReMD/src/bond_orders.c
+++ b/PG-PuReMD/src/bond_orders.c
@@ -31,6 +31,7 @@
   #include "reax_list.h"
   #include "reax_vector.h"
 #endif
+
 #include "index_utils.h"
 
 
diff --git a/PG-PuReMD/src/bond_orders.h b/PG-PuReMD/src/bond_orders.h
index 1975e20b..8cfa2e18 100644
--- a/PG-PuReMD/src/bond_orders.h
+++ b/PG-PuReMD/src/bond_orders.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 typedef struct
 {
     real C1dbo, C2dbo, C3dbo;
@@ -32,28 +33,45 @@ typedef struct
     real C1dDelta, C2dDelta, C3dDelta;
 } dbond_coefficients;
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Get_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO( reax_system*, reax_list**, int, int, real, rvec* );
+
 void Add_dBOpinpi2( reax_system*, reax_list**,
-                    int, int, real, real, rvec*, rvec* );
+        int, int, real, real, rvec*, rvec* );
 
 void Add_dBO_to_Forces( reax_system*, reax_list**, int, int, real );
+
 void Add_dBOpinpi2_to_Forces( reax_system*, reax_list**,
-                              int, int, real, real );
+        int, int, real, real );
 
 void Add_dDelta( reax_system*, reax_list**, int, real, rvec* );
+
 void Add_dDelta_to_Forces( reax_system *, reax_list**, int, real );
 #endif
 
 void Add_dBond_to_Forces( int, int, storage*, reax_list** );
-void Add_dBond_to_Forces_NPT( int, int, simulation_data*,
-                              storage*, reax_list** );
-int BOp(storage*, reax_list*, real, int, int, far_neighbor_data*,
-        single_body_parameters*, single_body_parameters*, two_body_parameters*);
+
+void Add_dBond_to_Forces_NPT( int, int, simulation_data*, storage*, reax_list** );
+
+int BOp( storage*, reax_list*, real, int, int, far_neighbor_data*,
+        single_body_parameters*, single_body_parameters*, two_body_parameters* );
+
 void BO( reax_system*, control_params*, simulation_data*,
-         storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/bonds.c b/PG-PuReMD/src/bonds.c
index 9c2839eb..8fb160ec 100644
--- a/PG-PuReMD/src/bonds.c
+++ b/PG-PuReMD/src/bonds.c
@@ -20,25 +20,27 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
-#include "bonds.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+  #include "bonds.h"
+  #include "bond_orders.h"
+  #include "list.h"
+  #include "tool_box.h"
+  #include "vector.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_bonds.h"
-#include "reax_bond_orders.h"
-#include "reax_list.h"
-#include "reax_tool_box.h"
-#include "reax_vector.h"
+  #include "reax_bonds.h"
+  #include "reax_bond_orders.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
+  #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Bonds( reax_system *system, control_params *control,
-            simulation_data *data, storage *workspace, reax_list **lists,
-            output_controls *out_control )
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control )
 {
     int i, j, pj, natoms;
     int start_i, end_i;
diff --git a/PG-PuReMD/src/bonds.h b/PG-PuReMD/src/bonds.h
index 2aa3c1f9..89090386 100644
--- a/PG-PuReMD/src/bonds.h
+++ b/PG-PuReMD/src/bonds.h
@@ -24,6 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Bonds( reax_system*, control_params*, simulation_data*,
-            storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/box.c b/PG-PuReMD/src/box.c
index 86ebd6eb..525f24e5 100644
--- a/PG-PuReMD/src/box.c
+++ b/PG-PuReMD/src/box.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "box.h"
+
 #include "comm_tools.h"
 #include "io_tools.h"
 #include "system_props.h"
diff --git a/PG-PuReMD/src/box.h b/PG-PuReMD/src/box.h
index 841e3679..00e51d06 100644
--- a/PG-PuReMD/src/box.h
+++ b/PG-PuReMD/src/box.h
@@ -24,30 +24,51 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* initializes simulation boxes */
 void Setup_Big_Box( real, real, real, real, real, real, simulation_box* );
+
 void Init_Box( rtensor, simulation_box* );
+
 //void Setup_My_Box( reax_system*, control_params* );
+
 //void Setup_My_Ext_Box( reax_system*, control_params* );
+
 void Setup_Environment( reax_system*, control_params*, mpi_datatypes* );
 
 /* scales simulation box for NPT ensembles */
 void Scale_Box( reax_system*, control_params*,
-                simulation_data*, mpi_datatypes* );
+        simulation_data*, mpi_datatypes* );
 
 /* applies transformation to/from Cartesian/ Triclinic coordinates */
 /* use -1 flag for Cartesian -> Triclinic and +1 for otherway */
-// void Transform( rvec, simulation_box*, char, rvec );
-// void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
-// void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
-// int Get_Nbr_Box( simulation_box*, int, int, int );
-// rvec Get_Nbr_Box_Press( simulation_box*, int, int, int );
-// void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );
+//void Transform( rvec, simulation_box*, char, rvec );
+
+//void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
+
+//void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
+
+//int Get_Nbr_Box( simulation_box*, int, int, int );
+
+//rvec Get_Nbr_Box_Press( simulation_box*, int, int, int );
+
+//void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );
 
 /* these functions assume that the coordinates are in triclinic system
    this function returns cartesian norm but triclinic distance vector */
 //real Sq_Distance_on_T3( rvec, rvec, simulation_box*, rvec );
+
 //void Inc_on_T3( rvec, rvec, simulation_box* );
+
 //real Metric_Product( rvec, rvec, simulation_box* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/center_mass.cu b/PG-PuReMD/src/center_mass.cu
deleted file mode 100644
index 725cafbb..00000000
--- a/PG-PuReMD/src/center_mass.cu
+++ /dev/null
@@ -1,551 +0,0 @@
-#include "center_mass.h"
-#include "vector.h"
-#include "cuda_shuffle.h"
-
-CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm, 
-        rvec *res_vcm, 
-        rvec *res_amcm, 
-        size_t n)
-{
-    extern __shared__ rvec xcm[];
-    extern __shared__ rvec vcm[];
-    extern __shared__ rvec amcm[];
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    //unsigned int xcm_id = threadIdx.x;
-    unsigned int vcm_id = blockDim.x;
-    unsigned int amcm_id = 2 *(blockDim.x);
-
-    unsigned int index = 0;
-    rvec tmp;
-    real m;
-
-    rvec_MakeZero (xcm [threadIdx.x]);
-    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
-    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
-    rvec_MakeZero (tmp);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
-        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
-        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
-    }
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (xcm [threadIdx.x], xcm[index]);
-            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
-            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
-        } 
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0)){
-        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
-        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
-        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
-    }
-}
-
-#if defined( __SM_35__)
-CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm,
-        size_t n)
-{
-    extern __shared__ rvec my_xcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int xcm_id = threadIdx.x;
-    unsigned int index = 0;
-    rvec xcm;
-    real m;
-
-    rvec_MakeZero (xcm);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (xcm , m, atoms [i].x);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        xcm[0] += shfl( xcm[0], z);
-        xcm[1] += shfl( xcm[1], z);
-        xcm[2] += shfl( xcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
-    __syncthreads ();
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0))
-        rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
-}
-
-CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_vcm,
-        size_t n)
-{
-    extern __shared__ rvec my_vcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    rvec vcm;
-    real m;
-
-    rvec_MakeZero (vcm);
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_ScaledAdd (vcm , m, atoms [i].v);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        vcm[0] += shfl( vcm[0], z);
-        vcm[1] += shfl( vcm[1], z);
-        vcm[2] += shfl( vcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
-    __syncthreads ();
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0))
-        rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
-}
-
-CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_amcm,
-        size_t n)
-{
-    extern __shared__ rvec my_amcm[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    rvec amcm;
-    real m;
-    rvec tmp;
-
-    rvec_MakeZero (amcm);
-    rvec_MakeZero( tmp );
-
-    if (i < n){
-        m = sbp [ atoms[i].type ].mass;
-        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-        rvec_ScaledAdd (amcm, m, tmp);
-    }
-    __syncthreads ();
-
-    for (int z = 16; z >= 1; z /= 2){
-        amcm[0] += shfl( amcm[0], z);
-        amcm[1] += shfl( amcm[1], z);
-        amcm[2] += shfl( amcm[2], z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0)
-        rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
-    __syncthreads ();
-
-
-    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-        if ((threadIdx.x < offset)) {
-            index = threadIdx.x + offset;
-            rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
-        }
-        __syncthreads ();
-    }
-
-    if ((threadIdx.x == 0)){
-        rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
-    }
-}
-
-#endif
-
-
-CUDA_GLOBAL void center_of_mass (rvec *xcm, 
-        rvec *vcm, 
-        rvec *amcm, 
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
-{
-    extern __shared__ rvec sh_xcm[];
-    extern __shared__ rvec sh_vcm[];
-    extern __shared__ rvec sh_amcm[];
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    unsigned int xcm_id = threadIdx.x;
-    unsigned int vcm_id = blockDim.x;
-    unsigned int amcm_id = 2 * (blockDim.x);
-
-    unsigned int index = 0;
-    rvec t_xcm, t_vcm, t_amcm;
-
-    rvec_MakeZero (t_xcm);
-    rvec_MakeZero (t_vcm);
-    rvec_MakeZero (t_amcm);
-
-    if (i < n){
-        rvec_Copy ( t_xcm, xcm[threadIdx.x]);
-        rvec_Copy ( t_vcm, vcm[threadIdx.x]);
-        rvec_Copy ( t_amcm, amcm[threadIdx.x]);
-    }
-
-    rvec_Copy (sh_xcm[xcm_id], t_xcm);
-    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
-    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
-
-    __syncthreads ();
-
-    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-        if (threadIdx.x < offset) {
-            index = threadIdx.x + offset;
-            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
-            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
-            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
-        } 
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0){
-        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
-        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
-        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, 
-        reax_atom *atoms,
-        real *results, 
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int xz_i = 2 * blockDim.x;
-    unsigned int yy_i = 3 * blockDim.x;
-    unsigned int yz_i = 4 * blockDim.x;
-    unsigned int zz_i = 5 * blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xx[ xx_i ] = diff[0] * diff[0] * m;
-        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
-        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
-        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
-        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
-        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            xx[ threadIdx.x ] += xx[ index ];
-            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 ] = xx [ 0 ];
-        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
-        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
-        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
-        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
-        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n)
-{
-    extern __shared__ real xx[];
-    extern __shared__ real xy[];
-    extern __shared__ real xz[];
-    extern __shared__ real yy[];
-    extern __shared__ real yz[];
-    extern __shared__ real zz[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int xz_i = 2 * blockDim.x;
-    unsigned int yy_i = 3 * blockDim.x;
-    unsigned int yz_i = 4 * blockDim.x;
-    unsigned int zz_i = 5 * blockDim.x;
-
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-
-    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-    if (i < n)
-    {
-        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
-        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
-        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
-        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
-        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
-        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset )
-        {
-            index = threadIdx.x + offset;
-            xx [ threadIdx.x ] += xx [ index ];
-            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0)
-    {
-        output[0] = xx[0];
-        output[1] = xy[xy_i];
-        output[2] = xz[xz_i];
-        output[3] = xz[yy_i];
-        output[4] = xz[yz_i];
-        output[5] = xz[zz_i];
-    }
-}
-
-#if defined( __SM_35__)
-
-CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_xx[];
-    extern __shared__ real my_results_xy[];
-
-    unsigned int xx_i = threadIdx.x;
-    unsigned int xy_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real xx = 0;
-    real xy = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xx = diff[0] * diff[0] * m;
-        xy = diff[0] * diff[1] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        xx += shfl( xx, z);
-        xy += shfl( xy, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_xx[threadIdx.x >> 5] = xx;    
-        my_results_xy[threadIdx.x >> 5] = xy;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
-            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
-        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_xz[];
-    extern __shared__ real my_results_yy[];
-
-    unsigned int yy_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real xz = 0;
-    real yy = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        xz = diff[0] * diff[2] * m;
-        yy = diff[1] * diff[1] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        xz += shfl( xz, z);
-        yy += shfl( yy, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_xz[threadIdx.x >> 5] = xz;    
-        my_results_yy[threadIdx.x >> 5] = yy;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
-            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
-        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
-    }
-}
-
-CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *sbp,
-        reax_atom *atoms,
-        real *results,
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
-{
-    extern __shared__ real my_results_yz[];
-    extern __shared__ real my_results_zz[];
-
-    unsigned int zz_i = blockDim.x;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int index = 0;
-    real yz = 0;
-    real zz = 0;
-
-    rvec diff, xcm;
-    real m = 0;
-    rvec_MakeZero (diff);
-    xcm[0] = xcm0;
-    xcm[1] = xcm1;
-    xcm[2] = xcm2;
-
-
-    if (i < n){
-        m = sbp[ atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-        yz = diff[1] * diff[2] * m;
-        zz = diff[2] * diff[2] * m;
-    }
-    __syncthreads ();
-
-    for (int z = 16; z <= 1; z++){
-        yz += shfl( yz, z);
-        zz += shfl( zz, z);
-    }
-    __syncthreads ();
-
-    if (threadIdx.x % 32 == 0){
-        my_results_yz[threadIdx.x >> 5] = yz;    
-        my_results_zz[threadIdx.x >> 5] = zz;    
-    }
-    __syncthreads ();
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
-            index = threadIdx.x + offset;
-            my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
-            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
-        }
-        __syncthreads ();
-    }
-
-    if (threadIdx.x == 0) {
-        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
-        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
-    }
-}
-
-#endif
diff --git a/PG-PuReMD/src/center_mass.h b/PG-PuReMD/src/center_mass.h
deleted file mode 100644
index 113971ad..00000000
--- a/PG-PuReMD/src/center_mass.h
+++ /dev/null
@@ -1,49 +0,0 @@
-
-#ifndef __CENTER_MASS_H__
-#define __CENTER_MASS_H__
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *,
-                                        rvec *res_xcm,
-                                        rvec *res_vcm,
-                                        rvec *res_amcm,
-                                        size_t n);
-
-#if defined(__SM_35__)
-CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *, reax_atom *,
-        rvec *res_xcm,
-        size_t n);
-CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *, reax_atom *,
-        rvec *res_vcm,
-        size_t n);
-CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *, reax_atom *,
-        rvec *res_amcm,
-        size_t n);
-#endif
-
-
-CUDA_GLOBAL void center_of_mass (rvec *xcm,
-                                 rvec *vcm,
-                                 rvec *amcm,
-                                 rvec *res_xcm,
-                                 rvec *res_vcm,
-                                 rvec *res_amcm,
-                                 size_t n);
-
-CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp,
-                                      reax_atom *atoms,
-                                      real *results,
-                                      real xcm0, real xcm1, real xcm2,
-                                      size_t n);
-
-CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n);
-
-#if defined(__SM_35__)
-CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *, reax_atom *, real *, real , real , real , size_t );
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/charges.c b/PG-PuReMD/src/charges.c
index 6d695f56..8f53b65d 100644
--- a/PG-PuReMD/src/charges.c
+++ b/PG-PuReMD/src/charges.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "charges.h"
 
 #include "allocate.h"
@@ -27,12 +29,6 @@
 #include "lin_alg.h"
 #include "tool_box.h"
 
-#ifdef HAVE_CUDA
-  #include "cuda_charges.h"
-  #include "cuda_lin_alg.h"
-  #include "cuda_validation.h"
-#endif
-
 
 int compare_matrix_entry(const void *v1, const void *v2)
 {
@@ -406,46 +402,6 @@ void Calculate_Charges( reax_system *system, storage *workspace,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
-        mpi_datatypes *mpi_data )
-{
-    int i, scale;
-    real u;//, s_sum, t_sum;
-    rvec2 my_sum, all_sum;
-    reax_atom *atom;
-    real *q;
-
-    my_sum[0] = 0.0;
-    my_sum[1] = 0.0;
-    scale = sizeof(real) / sizeof(void);
-    q = (real *) host_scratch;
-    memset( q, 0, system->N * sizeof (real));
-
-    cuda_charges_x( system, my_sum );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n",
-            my_sum[0], my_sum[1] );
-#endif
-
-    MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
-
-    u = all_sum[0] / all_sum[1];
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "Device: u: %f \n", u );
-#endif
-
-    cuda_charges_st( system, workspace, q, u );
-
-    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
-
-    cuda_charges_updateq( system, q );
-}
-#endif
-
-
 void QEq( reax_system *system, control_params *control, simulation_data *data,
         storage *workspace, output_controls *out_control,
         mpi_datatypes *mpi_data )
@@ -504,57 +460,3 @@ void QEq( reax_system *system, control_params *control, simulation_data *data,
     }
 #endif
 }
-
-
-#ifdef HAVE_CUDA
-void Cuda_QEq( reax_system *system, control_params *control, simulation_data
-        *data, storage *workspace, output_controls *out_control, mpi_datatypes
-        *mpi_data )
-{
-    int s_matvecs, t_matvecs;
-
-    Cuda_Init_MatVec( system, workspace );
-
-    //if (data->step > 0) {
-    //    compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
-    //    compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
-    // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-    // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-    //}
-
-//#ifdef __CUDA_DEBUG__
-//  Init_MatVec( system, data, control, workspace, mpi_data );
-//#endif
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
-    //Print_Linear_System( system, control, workspace, data->step );
-#endif
-
-    //MATRIX CHANGES
-    s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
-            dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
-            out_control->log, data);
-    t_matvecs = 0;
-    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
-#endif
-
-    Cuda_Calculate_Charges( system, workspace, mpi_data );
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
-    //Print_Charges( system );
-#endif
-
-#if defined(LOG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        data->timing.s_matvecs += s_matvecs;
-        data->timing.t_matvecs += t_matvecs;
-    }
-#endif
-}
-#endif
diff --git a/PG-PuReMD/src/charges.h b/PG-PuReMD/src/charges.h
index faad0d09..08af5641 100644
--- a/PG-PuReMD/src/charges.h
+++ b/PG-PuReMD/src/charges.h
@@ -25,11 +25,16 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void QEq( reax_system*, control_params*, simulation_data*,
         storage*, output_controls*, mpi_datatypes* );
 
-void Cuda_QEq( reax_system*, control_params*, simulation_data*,
-        storage*, output_controls*, mpi_datatypes* );
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
diff --git a/PG-PuReMD/src/comm_tools.c b/PG-PuReMD/src/comm_tools.c
index 5a832aff..a8d46fcb 100644
--- a/PG-PuReMD/src/comm_tools.c
+++ b/PG-PuReMD/src/comm_tools.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "comm_tools.h"
 
 #include "grid.h"
diff --git a/PG-PuReMD/src/comm_tools.h b/PG-PuReMD/src/comm_tools.h
index 3b0b645f..a0e8d7e5 100644
--- a/PG-PuReMD/src/comm_tools.h
+++ b/PG-PuReMD/src/comm_tools.h
@@ -24,21 +24,36 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Check_MPI_Error( int, const char * );
+
 void Setup_Comm( reax_system*, control_params*, mpi_datatypes* );
+
 void Update_Comm( reax_system* );
 
 void Sort_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+
 void Estimate_Boundary_Atoms( reax_system*, int, int, int, mpi_out_data* );
+
 void Unpack_Exchange_Message( reax_system*, int, void*, int,
-                              neighbor_proc*, int );
+        neighbor_proc*, int );
+
 void Unpack_Estimate_Message( reax_system*, int, void*, int,
-                              neighbor_proc*, int );
+        neighbor_proc*, int );
 
 int SendRecv( reax_system*, mpi_datatypes*_data, MPI_Datatype, int*,
-              message_sorter, unpacker, int );
+        message_sorter, unpacker, int );
 
 void Comm_Atoms( reax_system*, control_params*, simulation_data*, storage*,
-                 reax_list**, mpi_datatypes*, int );
+        reax_list**, mpi_datatypes*, int );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/control.h b/PG-PuReMD/src/control.h
index c6c6ce6c..24cf0451 100644
--- a/PG-PuReMD/src/control.h
+++ b/PG-PuReMD/src/control.h
@@ -24,6 +24,16 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 char Read_Control_File( char*, control_params*, output_controls* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_allocate.cu
rename to PG-PuReMD/src/cuda/cuda_allocate.cu
index 7970e9f6..5c722e56 100644
--- a/PG-PuReMD/src/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -6,10 +6,10 @@
 #include "cuda_neighbors.h"
 #include "cuda_utils.h"
 
-#include "allocate.h"
-#include "index_utils.h"
-#include "tool_box.h"
-#include "vector.h"
+#include "../allocate.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 extern "C"
 {
diff --git a/PG-PuReMD/src/cuda_allocate.h b/PG-PuReMD/src/cuda/cuda_allocate.h
similarity index 96%
rename from PG-PuReMD/src/cuda_allocate.h
rename to PG-PuReMD/src/cuda/cuda_allocate.h
index 57124046..0d78d932 100644
--- a/PG-PuReMD/src/cuda_allocate.h
+++ b/PG-PuReMD/src/cuda/cuda_allocate.h
@@ -1,7 +1,7 @@
 #ifndef __CUDA_ALLOCATE_H_
 #define __CUDA_ALLOCATE_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 #ifdef __cplusplus
 extern "C"  {
diff --git a/PG-PuReMD/src/cuda_bond_orders.cu b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_bond_orders.cu
rename to PG-PuReMD/src/cuda/cuda_bond_orders.cu
index 6e4344aa..bb478a3a 100644
--- a/PG-PuReMD/src/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.cu
@@ -2,11 +2,12 @@
 #include "cuda_bond_orders.h"
 
 #include "cuda_list.h"
-#include "index_utils.h"
-#include "bond_orders.h"
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 
+#include "../index_utils.h"
+#include "../bond_orders.h"
+
 
 CUDA_GLOBAL void Cuda_Calculate_BO_init( reax_atom *my_atoms, 
         single_body_parameters *sbp, storage p_workspace, int N )
diff --git a/PG-PuReMD/src/cuda_bond_orders.h b/PG-PuReMD/src/cuda/cuda_bond_orders.h
similarity index 98%
rename from PG-PuReMD/src/cuda_bond_orders.h
rename to PG-PuReMD/src/cuda/cuda_bond_orders.h
index 8be3a592..a957b11b 100644
--- a/PG-PuReMD/src/cuda_bond_orders.h
+++ b/PG-PuReMD/src/cuda/cuda_bond_orders.h
@@ -2,10 +2,9 @@
 #ifndef __CUDA_BOND_ORDERS_H__
 #define __CUDA_BOND_ORDERS_H__
 
-#include "reax_types.h"
-#include "reax_types.h"
+#include "../reax_types.h"
 
-#include "vector.h"
+#include "../vector.h"
 
 extern "C" {
 
diff --git a/PG-PuReMD/src/cuda_bonds.cu b/PG-PuReMD/src/cuda/cuda_bonds.cu
similarity index 98%
rename from PG-PuReMD/src/cuda_bonds.cu
rename to PG-PuReMD/src/cuda/cuda_bonds.cu
index 81f3444b..e3592630 100644
--- a/PG-PuReMD/src/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_bonds.cu
@@ -19,13 +19,12 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-
-#include "index_utils.h"
-#include "reax_types.h"
+#include "cuda_bonds.h"
 
 #include "cuda_list.h"
 
+#include "../index_utils.h"
+
 
 CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, global_parameters gp, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
diff --git a/PG-PuReMD/src/cuda_bonds.h b/PG-PuReMD/src/cuda/cuda_bonds.h
similarity index 69%
rename from PG-PuReMD/src/cuda_bonds.h
rename to PG-PuReMD/src/cuda/cuda_bonds.h
index d8a7d273..fd9126be 100644
--- a/PG-PuReMD/src/cuda_bonds.h
+++ b/PG-PuReMD/src/cuda/cuda_bonds.h
@@ -22,16 +22,12 @@
 #ifndef __CUDA_BONDS_H_
 #define __CUDA_BONDS_H_
 
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Bonds(    reax_atom *,
-                                global_parameters ,
-                                single_body_parameters *,
-                                two_body_parameters *,
-                                storage ,
-                                reax_list ,
-                                int , int ,
-                                real *
-                           );
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Bonds( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *, storage,
+        reax_list, int, int, real * );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
similarity index 68%
rename from PG-PuReMD/src/cuda_charges.cu
rename to PG-PuReMD/src/cuda/cuda_charges.cu
index c8410952..ada6bf2f 100644
--- a/PG-PuReMD/src/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -21,12 +21,13 @@
 
 #include "cuda_charges.h"
 
-#include "reax_types.h"
+#include "cuda_lin_alg.h"
 #include "cuda_reduction.h"
 #include "cuda_utils.h"
-
 #include "cuda_validation.h"
 
+#include "../basic_comm.h"
+
 
 CUDA_GLOBAL void k_init_matvec( reax_atom *my_atoms, single_body_parameters
         *sbp, storage p_workspace, int n  )
@@ -204,3 +205,93 @@ void cuda_charges_updateq( reax_system *system, real *q )
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
+
+
+void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
+        mpi_datatypes *mpi_data )
+{
+    int i, scale;
+    real u;//, s_sum, t_sum;
+    rvec2 my_sum, all_sum;
+    reax_atom *atom;
+    real *q;
+
+    my_sum[0] = 0.0;
+    my_sum[1] = 0.0;
+    scale = sizeof(real) / sizeof(void);
+    q = (real *) host_scratch;
+    memset( q, 0, system->N * sizeof (real));
+
+    cuda_charges_x( system, my_sum );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Device: my_sum[0]: %f, my_sum[1]: %f\n",
+            my_sum[0], my_sum[1] );
+#endif
+
+    MPI_Allreduce( &my_sum, &all_sum, 2, MPI_DOUBLE, MPI_SUM, mpi_data->world );
+
+    u = all_sum[0] / all_sum[1];
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "Device: u: %f \n", u );
+#endif
+
+    cuda_charges_st( system, workspace, q, u );
+
+    Dist( system, mpi_data, q, MPI_DOUBLE, scale, real_packer );
+
+    cuda_charges_updateq( system, q );
+}
+
+
+void Cuda_QEq( reax_system *system, control_params *control, simulation_data
+        *data, storage *workspace, output_controls *out_control, mpi_datatypes
+        *mpi_data )
+{
+    int s_matvecs, t_matvecs;
+
+    Cuda_Init_MatVec( system, workspace );
+
+    //if (data->step > 0) {
+    //    compare_rvec2 (workspace->b, dev_workspace->b, system->n, "b");
+    //    compare_rvec2 (workspace->x, dev_workspace->x, system->n, "x");
+    // compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
+    // compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
+    //}
+
+//#ifdef __CUDA_DEBUG__
+//  Init_MatVec( system, data, control, workspace, mpi_data );
+//#endif
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized qEq\n", system->my_rank );
+    //Print_Linear_System( system, control, workspace, data->step );
+#endif
+
+    //MATRIX CHANGES
+    s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
+            dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
+            out_control->log, data);
+    t_matvecs = 0;
+    //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: first CG completed\n", system->my_rank );
+#endif
+
+    Cuda_Calculate_Charges( system, workspace, mpi_data );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: computed charges\n", system->my_rank );
+    //Print_Charges( system );
+#endif
+
+#if defined(LOG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.s_matvecs += s_matvecs;
+        data->timing.t_matvecs += t_matvecs;
+    }
+#endif
+}
diff --git a/PG-PuReMD/src/cuda_charges.h b/PG-PuReMD/src/cuda/cuda_charges.h
similarity index 89%
rename from PG-PuReMD/src/cuda_charges.h
rename to PG-PuReMD/src/cuda/cuda_charges.h
index 2d421389..d1922a48 100644
--- a/PG-PuReMD/src/cuda_charges.h
+++ b/PG-PuReMD/src/cuda/cuda_charges.h
@@ -22,7 +22,7 @@
 #ifndef __CUDA_CHARGES_H_
 #define __CUDA_CHARGES_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,6 +37,9 @@ void cuda_charges_st( reax_system *, storage *, real *, real );
 
 void cuda_charges_updateq( reax_system *, real * );
 
+void Cuda_QEq( reax_system*, control_params*, simulation_data*,
+        storage*, output_controls*, mpi_datatypes* );
+
 
 #ifdef __cplusplus
 }
diff --git a/PG-PuReMD/src/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_copy.cu
rename to PG-PuReMD/src/cuda/cuda_copy.cu
index a3bfca30..42055875 100644
--- a/PG-PuReMD/src/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -2,7 +2,8 @@
 #include "cuda_copy.h"
 
 #include "cuda_utils.h"
-#include "vector.h"
+
+#include "../vector.h"
 
 
 /* Copy grid info from host to device */
diff --git a/PG-PuReMD/src/cuda_copy.h b/PG-PuReMD/src/cuda/cuda_copy.h
similarity index 93%
rename from PG-PuReMD/src/cuda_copy.h
rename to PG-PuReMD/src/cuda/cuda_copy.h
index 51c4314c..72bf992c 100644
--- a/PG-PuReMD/src/cuda_copy.h
+++ b/PG-PuReMD/src/cuda/cuda_copy.h
@@ -1,24 +1,30 @@
 #ifndef __CUDA_COPY_H_
 #define __CUDA_COPY_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
 void Sync_Atoms( reax_system * );
+
 void Sync_Grid( grid *, grid * );
+
 void Sync_System( reax_system * );
 
 void Prep_Device_For_Output( reax_system *, simulation_data * );
+
 void Output_Sync_Lists( reax_list *host, reax_list *device, int type );
+
 void Output_Sync_Atoms( reax_system * );
-void Output_Sync_Simulation_Data( simulation_data *, simulation_data * );
 
+void Output_Sync_Simulation_Data( simulation_data *, simulation_data * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda/cuda_environment.cu
similarity index 100%
rename from PG-PuReMD/src/cuda_environment.cu
rename to PG-PuReMD/src/cuda/cuda_environment.cu
diff --git a/PG-PuReMD/src/cuda_environment.h b/PG-PuReMD/src/cuda/cuda_environment.h
similarity index 56%
rename from PG-PuReMD/src/cuda_environment.h
rename to PG-PuReMD/src/cuda/cuda_environment.h
index f8ae3cd0..1cbcc92c 100644
--- a/PG-PuReMD/src/cuda_environment.h
+++ b/PG-PuReMD/src/cuda/cuda_environment.h
@@ -2,15 +2,19 @@
 #ifndef __CUDA_ENVIRONMENT_H__
 #define __CUDA_ENVIRONMENT_H__
 
+#include "../reax_types.h"
+
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-void Setup_Cuda_Environment (int, int, int);
-void Cleanup_Cuda_Environment ();
+void Setup_Cuda_Environment( int, int, int );
+void Cleanup_Cuda_Environment( );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
similarity index 89%
rename from PG-PuReMD/src/cuda_forces.cu
rename to PG-PuReMD/src/cuda/cuda_forces.cu
index 831a5694..a790b1a8 100644
--- a/PG-PuReMD/src/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1,24 +1,27 @@
 
 #include "cuda_forces.h"
 
-#include "reax_types.h"
-#include "cuda_list.h"
-#include "cuda_utils.h"
+#include "cuda_bonds.h"
+#include "cuda_bond_orders.h"
+#include "cuda_charges.h"
 #include "cuda_helpers.h"
+#include "cuda_hydrogen_bonds.h"
+#include "cuda_lin_alg.h"
+#include "cuda_list.h"
+#include "cuda_multi_body.h"
 #include "cuda_neighbors.h"
-#include "cuda_bond_orders.h"
+#include "cuda_nonbonded.h"
 #include "cuda_reduction.h"
-#include "cuda_bonds.h"
-#include "cuda_multi_body.h"
-#include "cuda_valence_angles.h"
 #include "cuda_torsion_angles.h"
-#include "cuda_hydrogen_bonds.h"
-#include "tool_box.h"
-#include "cuda_nonbonded.h"
+#include "cuda_utils.h"
+#include "cuda_valence_angles.h"
+#include "cuda_validation.h"
 
-#include "index_utils.h"
-#include "vector.h"
-#include "forces.h"
+#include "../basic_comm.h"
+#include "../forces.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
 CUDA_GLOBAL void k_disable_hydrogen_bonding( control_params *control )
@@ -1719,3 +1722,201 @@ void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control
     Cuda_NonBonded_Energy( system, control, workspace, data,
             lists, out_control, (control->tabulate == 0) ? false: true );
 }
+
+
+void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, mpi_datatypes *mpi_data )
+{
+    rvec *f;
+
+    f = (rvec *) host_scratch;
+    memset( f, 0, sizeof(rvec) * system->N );
+
+    Cuda_Total_Forces( system, control, data, workspace );
+
+#if defined(PURE_REAX)
+    /* now all forces are computed to their partially-final values
+     * based on the neighbor information each processor has.
+     * The final force on each atom needs to be computed by adding up
+     * all partially-final pieces */
+
+    //MVAPICH2
+    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N ,
+            cudaMemcpyDeviceToHost, "total_force:f:get" );
+
+    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
+          sizeof(rvec) / sizeof(void), rvec_unpacker );
+
+    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N,
+            cudaMemcpyHostToDevice, "total_force:f:put" );
+
+    Cuda_Total_Forces_PURE( system, dev_workspace );
+#endif
+
+}
+
+
+int Cuda_Compute_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int charge_flag, retVal;
+
+#if defined(LOG_PERFORMANCE)
+    real t_start = 0;
+
+    //MPI_Barrier( MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        t_start = Get_Time( );
+    }
+#endif
+
+    retVal = SUCCESS;
+
+    /********* init forces ************/
+    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
+    {
+        charge_flag = TRUE;
+    }
+    else
+    {
+        charge_flag = FALSE;
+    }
+
+    if ( charge_flag == TRUE )
+    {
+        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
+
+//        int i;
+//        static reax_list **temp_lists;
+//       
+//        if ( data->step == 0 )
+//        {
+//            temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" );
+//            for ( i = 0; i < LIST_N; ++i )
+//            {
+//                temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
+//                temp_lists[i]->allocated = FALSE;
+//            }
+//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
+//                    TYP_BOND, *temp_lists + BONDS );
+//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
+//                    TYP_HBOND, *temp_lists + HBONDS );
+//        }
+//        else
+//        {
+//            Delete_List( *temp_lists + BONDS );
+//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
+//                    TYP_BOND, *temp_lists + BONDS );
+//            Delete_List( *temp_lists + HBONDS );
+//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
+//                    TYP_HBOND, *temp_lists + HBONDS );
+//
+//        }
+//        Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND );
+//        Print_Bonds( system, temp_lists, control );
+//        Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND );
+//        Print_HBonds( system, temp_lists, control, data->step );
+//        Print_HBond_Indices( system, temp_lists, control, data->step );
+//        exit( 0 );
+    }
+    else
+    {
+        retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
+    }
+
+    if ( retVal == SUCCESS )
+    {
+        //validate_sparse_matrix( system, workspace );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.init_forces) );
+        }
+#endif
+
+        /********* bonded interactions ************/
+        retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: completed bonded\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    }
+
+    if ( retVal == SUCCESS )
+    {
+    /**************** charges ************************/
+#if defined(PURE_REAX)
+        if ( charge_flag == TRUE )
+        {
+            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
+        }
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.qEq) );
+        }
+#endif
+
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+#endif //PURE_REAX
+
+        /********* nonbonded interactions ************/
+        Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.nonb) );
+        }
+#endif
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
+                system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        /*********** total force ***************/
+        Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
+
+#if defined(LOG_PERFORMANCE)
+        //MPI_Barrier( MPI_COMM_WORLD );
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &(data->timing.bonded) );
+        }
+#endif
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: total forces computed\n",
+                system->my_rank, data->step );
+        //Print_Total_Force( system, data, workspace );
+        MPI_Barrier( MPI_COMM_WORLD );
+
+#endif
+
+//        Print_Forces( system );
+    }
+
+    return retVal;
+}
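
The charge_flag test at the top of Cuda_Compute_Forces is a small scheduling predicate: the
charge solve runs only when charge_freq is non-zero and the step offset is a multiple of it.
A sketch of the equivalent test (the helper name is hypothetical, shown only for clarity):

    static int charges_due( int step, int prev_steps, int charge_freq )
    {
        /* a frequency of 0 disables charge solves entirely */
        return (charge_freq != 0) && ((step - prev_steps) % charge_freq == 0);
    }
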
diff --git a/PG-PuReMD/src/cuda_forces.h b/PG-PuReMD/src/cuda/cuda_forces.h
similarity index 86%
rename from PG-PuReMD/src/cuda_forces.h
rename to PG-PuReMD/src/cuda/cuda_forces.h
index 9dc3da66..4abdb52f 100644
--- a/PG-PuReMD/src/cuda_forces.h
+++ b/PG-PuReMD/src/cuda/cuda_forces.h
@@ -2,7 +2,7 @@
 #ifndef __CUDA_FORCES_H__
 #define __CUDA_FORCES_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
@@ -32,6 +32,9 @@ void Cuda_Compute_NonBonded_Forces( reax_system *, control_params *,
         simulation_data *, storage *, reax_list **, output_controls *,
         mpi_datatypes * );
 
+int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
 void Print_Forces( reax_system * );
 
 
diff --git a/PG-PuReMD/src/cuda_helpers.h b/PG-PuReMD/src/cuda/cuda_helpers.h
similarity index 97%
rename from PG-PuReMD/src/cuda_helpers.h
rename to PG-PuReMD/src/cuda/cuda_helpers.h
index a4943a5f..b14f45b3 100644
--- a/PG-PuReMD/src/cuda_helpers.h
+++ b/PG-PuReMD/src/cuda/cuda_helpers.h
@@ -1,7 +1,7 @@
 #ifndef __CUDA_HELPERS__
 #define __CUDA_HELPERS__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 CUDA_DEVICE static inline int cuda_strcmp( char * a,
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_hydrogen_bonds.cu
rename to PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
index 95eda081..18cdbb57 100644
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.cu
@@ -21,16 +21,14 @@
 
 #include "cuda_hydrogen_bonds.h"
 
-#include "reax_types.h"
-#include "index_utils.h"
-
 #include "cuda_valence_angles.h"
 #include "cuda_helpers.h"
 #include "cuda_list.h"
-#include "vector.h"
-
 #include "cuda_shuffle.h"
 
+#include "../index_utils.h"
+#include "../vector.h"
+
 
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, single_body_parameters *sbp, 
         hbond_parameters *d_hbp, global_parameters gp, control_params *control, 
diff --git a/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
new file mode 100644
index 00000000..606196b4
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_hydrogen_bonds.h
@@ -0,0 +1,48 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_HBONDS_H_
+#define __CUDA_HBONDS_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs( reax_atom *,
+        storage, reax_list );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL( reax_atom *,
+        storage, reax_list, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        global_parameters, control_params *, storage ,
+        reax_list, reax_list, int, int, real *, rvec * );
+
+CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        global_parameters , control_params *, storage,
+        reax_list, reax_list, int, int, real *, rvec * );
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
new file mode 100644
index 00000000..fb1ac0df
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -0,0 +1,412 @@
+
+#include "cuda_init_md.h"
+
+#include "cuda_allocate.h"
+#include "cuda_list.h"
+#include "cuda_copy.h"
+#include "cuda_forces.h"
+#include "cuda_integrate.h"
+#include "cuda_neighbors.h"
+#include "cuda_reset_tools.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+#include "cuda_validation.h"
+
+#if defined(PURE_REAX)
+  #include "../box.h"
+  #include "../comm_tools.h"
+  #include "../grid.h"
+  #include "../init_md.h"
+  #include "../integrate.h"
+  #include "../io_tools.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+  #include "../lookup.h"
+#ifdef __cplusplus
+}
+#endif
+  #include "../random.h"
+  #include "../reset_tools.h"
+  #include "../tool_box.h"
+  #include "../vector.h"
+#elif defined(LAMMPS_REAX)
+  #include "../reax_box.h"
+  #include "../reax_comm_tools.h"
+  #include "../reax_grid.h"
+  #include "../reax_init_md.h"
+  #include "../reax_integrate.h"
+  #include "../reax_io_tools.h"
+  #include "../reax_list.h"
+  #include "../reax_lookup.h"
+  #include "../reax_random.h"
+  #include "../reax_reset_tools.h"
+  #include "../reax_tool_box.h"
+  #include "../reax_vector.h"
+#endif
+
+
+void Cuda_Init_ScratchArea( )
+{
+    cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" );
+
+    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" );
+}
+
+
+int Cuda_Init_System( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        mpi_datatypes *mpi_data, char *msg )
+{
+    int i, ret;
+    reax_atom *atom;
+    int nrecv[MAX_NBRS];
+
+    Setup_New_Grid( system, control, MPI_COMM_WORLD );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d GRID:\n", system->my_rank );
+    Print_Grid( &(system->my_grid), stderr );
+#endif
+
+    Bin_My_Atoms( system, &(workspace->realloc) );
+    Reorder_My_Atoms( system, workspace );
+
+    /* estimate N and total capacity */
+    for ( i = 0; i < MAX_NBRS; ++i )
+    {
+        nrecv[i] = 0;
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+    system->max_recved = 0;
+    system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
+            Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
+    system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
+    Bin_Boundary_Atoms( system );
+
+    /* Sync atoms here to continue the computation */
+    dev_alloc_system( system );
+    Sync_System( system );
+
+    /* estimate numH and Hcap */
+    Cuda_Reset_Atoms( system, control );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: n=%d local_cap=%d\n",
+             system->my_rank, system->n, system->local_cap );
+    fprintf( stderr, "p%d: N=%d total_cap=%d\n",
+             system->my_rank, system->N, system->total_cap );
+    fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
+             system->my_rank, system->numH, system->Hcap );
+#endif
+
+    Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
+
+    Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
+
+//    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
+//    {
+//        return FAILURE;
+//    }
+
+    /* initialize velocities so that desired init T can be attained */
+    if ( !control->restart || (control->restart && control->random_vel) )
+    {
+        Generate_Initial_Velocities( system, control->T_init );
+    }
+
+    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+    return SUCCESS;
+}
+
+
+void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
+        simulation_data *data, char *msg )
+{
+    dev_alloc_simulation_data( data );
+
+    Reset_Simulation_Data( data );
+
+    if ( !control->restart )
+    {
+        data->step = data->prev_steps = 0;
+    }
+
+    switch ( control->ensemble )
+    {
+    case NVE:
+        data->N_f = 3 * system->bigN;
+        Cuda_Evolve = Velocity_Verlet_NVE;
+        control->virial = 0;
+        break;
+
+    case bNVT:
+        data->N_f = 3 * system->bigN + 1;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
+        control->virial = 0;
+        break;
+
+    case nhNVT:
+        fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
+        data->N_f = 3 * system->bigN + 1;
+        Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+        control->virial = 0;
+        if ( !control->restart || (control->restart && control->random_vel) )
+        {
+            data->therm.G_xi = control->Tau_T *
+                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
+            data->therm.v_xi = data->therm.G_xi * control->dt;
+            data->therm.v_xi_old = 0;
+            data->therm.xi = 0;
+        }
+        break;
+
+    case sNPT: /* Semi-Isotropic NPT */
+        data->N_f = 3 * system->bigN + 4;
+        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+        if ( !control->restart )
+        {
+            Reset_Pressures( data );
+        }
+        break;
+
+    case iNPT: /* Isotropic NPT */
+        data->N_f = 3 * system->bigN + 2;
+        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+        if ( !control->restart )
+        {
+            Reset_Pressures( data );
+        }
+        break;
+
+    case NPT: /* Anisotropic NPT */
+        data->N_f = 3 * system->bigN + 9;
+        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        control->virial = 1;
+
+        fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
+        break;
+
+    default:
+        fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n",
+              system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
+    }
+
+    /* initialize the timer(s) */
+    MPI_Barrier( MPI_COMM_WORLD );
+    if ( system->my_rank == MASTER_NODE )
+    {
+        data->timing.start = Get_Time( );
+
+#if defined(LOG_PERFORMANCE)
+        Reset_Timing( &data->timing );
+#endif
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
+#endif
+}
+
+
+void Cuda_Init_Workspace( reax_system *system, control_params *control,
+        storage *workspace, char *msg )
+{
+    dev_alloc_workspace( system, control, dev_workspace,
+            system->local_cap, system->total_cap, msg );
+
+    memset( &(workspace->realloc), 0, sizeof(reallocate_data) );
+    Cuda_Reset_Workspace( system, workspace );
+
+    /* Initialize the Taper function */
+    Init_Taper( control, dev_workspace );
+}
+
+
+int Cuda_Init_Lists( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        mpi_datatypes *mpi_data, char *msg )
+{
+    int ret;
+
+    /* ignore returned error, as system->d_max_far_nbrs was not valid */
+    ret = Cuda_Estimate_Neighbors( system, data->step );
+
+    Dev_Make_List( system->total_cap, system->total_far_nbrs,
+            TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
+            system->my_rank, system->total_far_nbrs,
+            (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
+    fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap );
+#endif
+
+    Cuda_Init_Neighbor_Indices( system );
+
+    Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
+
+    /* estimate storage for bonds and hbonds */
+    Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
+
+    /* estimate storage for charge sparse matrix */
+//    Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists );
+
+    dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
+
+    Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
+
+    //MATRIX CHANGES
+    //workspace->L = NULL;
+    //workspace->U = NULL;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n",
+            system->my_rank, system->total_cm_entries, dev_workspace->H.m );
+    fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
+            system->my_rank, Htop,
+            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
+#endif
+
+    if ( control->hbond_cut > 0.0 &&  system->numH > 0 )
+    {
+        Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
+//        Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS );
+
+        Cuda_Init_HBond_Indices( system );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
+                system->my_rank, system->total_hbonds,
+                (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
+#endif
+    }
+
+    /* bonds list */
+    Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
+//    Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS );
+
+    Cuda_Init_Bond_Indices( system );
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
+            system->my_rank, system->total_bonds,
+            (int)(system->total_bonds * sizeof(bond_data) / (1024 * 1024)) );
+#endif
+
+    /* 3bodies list: since a more accurate estimate of the number
+     * of three-body interactions requires that bond orders have
+     * been computed, delay estimation until the force computation */
+
+    return SUCCESS;
+}
+
+
+void Cuda_Initialize( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    char msg[MAX_STR];
+    real t_start, t_end;
+
+    /* HOST/DEVICE SCRATCH */
+    Cuda_Init_ScratchArea( );
+
+    /* MPI_DATATYPES */
+    if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
+                 system->my_rank );
+        fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+    /* SYSTEM */
+    if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+    /* GRID */
+    dev_alloc_grid( system );
+    Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+    //validate_grid( system );
+
+    /* SIMULATION_DATA */
+    Cuda_Init_Simulation_Data( system, control, data, msg );
+
+    /* WORKSPACE */
+    Cuda_Init_Workspace( system, control, workspace, msg );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
+#endif
+
+    //Sync the taper here from host to device.
+
+    /* CONTROL */
+    dev_alloc_control( control );
+
+    /* LISTS */
+    if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
+            FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
+#endif
+
+    /* OUTPUT Files */
+    if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE )
+    {
+        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+        fprintf( stderr, "p%d: could not open output files! terminating...\n",
+                 system->my_rank );
+        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: output files opened\n", system->my_rank );
+#endif
+
+    /* Lookup Tables */
+    if ( control->tabulate )
+    {
+        if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE )
+        {
+            fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
+            fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
+                     system->my_rank );
+            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
+        }
+
+#if defined(DEBUG)
+        fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
+#endif
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
+#endif
+}
+
+
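
The N_f values assigned per ensemble in Cuda_Init_Simulation_Data are the degrees of freedom
used by the thermostats; conventionally the instantaneous temperature is recovered as
T = 2 * E_kin / (N_f * k_B). A one-line sketch under that assumption (not part of this patch):

    real instantaneous_temperature( real e_kin, real n_f )
    {
        return 2.0 * e_kin / (n_f * K_B);
    }
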
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.h b/PG-PuReMD/src/cuda/cuda_init_md.h
new file mode 100644
index 00000000..328674a5
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_init_md.h
@@ -0,0 +1,22 @@
+
+#ifndef __CUDA_INIT_MD_H__
+#define __CUDA_INIT_MD_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Cuda_Init_ScratchArea( );
+
+void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
+        storage*, reax_list**, output_controls*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
new file mode 100644
index 00000000..dcb97292
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -0,0 +1,249 @@
+
+#include "cuda_integrate.h"
+
+#include "cuda_allocate.h"
+#include "cuda_forces.h"
+#include "cuda_integrate.h"
+#include "cuda_copy.h"
+#include "cuda_neighbors.h"
+#include "cuda_reset_tools.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+
+#include "../comm_tools.h"
+#include "../grid.h"
+#include "../vector.h"
+
+
+CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms, 
+        single_body_parameters *sbp, real dt, int n )
+{
+    real inv_m;
+    rvec dx;
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* velocity verlet, 1st part */
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute x(t + dt) */
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    rvec_Add( atom->x, dx );
+    /* Compute v(t + dt/2) */
+    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+}
+
+
+void bNVT_update_velocity_part1( reax_system *system, real dt )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms, 
+        single_body_parameters *sbp, real dt, int n )
+{
+    reax_atom *atom;
+    real inv_m;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* velocity verlet, 2nd part */
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute v(t + dt) */
+    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+}
+
+
+void bNVT_update_velocity_part2( reax_system *system, real dt )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n )
+{
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* scale velocities at t+dt */
+    atom = &(my_atoms[i]);
+    rvec_Scale( atom->v, lambda, atom->v );
+}
+
+
+void bNVT_scale_velocities( reax_system *system, real lambda )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, lambda, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int i, steps, renbr, ret;
+    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    real inv_m, dt, lambda;
+    rvec dx;
+    reax_atom *atom;
+    int *bond_top, *hb_top;
+    int Htop, num_3body;
+    int total_hbonds, count, total_bonds;
+    int bond_cap, cap_3body;
+    real t_over_start, t_over_elapsed;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    if ( verlet_part1_done == FALSE )
+    {
+        /* velocity verlet, 1st part */
+        bNVT_update_velocity_part1( system, dt );
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* synch the Grid to the Device here */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+
+#if defined(__CUDA_DEBUG_LOG__)
+        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
+                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
+#endif
+    }
+    
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if ( renbr )
+    {
+#if defined(DEBUG)
+        t_over_start = Get_Time( );
+#endif
+
+        if ( estimate_nbrs_done == 0 )
+        {
+            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
+            ret = Cuda_Estimate_Neighbors( system, data->step );
+            estimate_nbrs_done = 1;
+        }
+
+        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        {
+            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            estimate_nbrs_done = 2;
+    
+#if defined(DEBUG)
+            t_over_elapsed  = Get_Timing_Info( t_over_start );
+            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                    system->my_rank, data->step, t_over_elapsed );
+#endif
+        }
+    }
+
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        /* velocity verlet, 2nd part */
+        bNVT_update_velocity_part2( system, dt );
+
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        /* temperature scaler */
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+        lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+        if ( lambda < MIN_dT )
+        {
+            lambda = MIN_dT;
+        }
+        else if (lambda > MAX_dT )
+        {
+            lambda = MAX_dT;
+        }
+        lambda = SQRT( lambda );
+
+        /* Scale velocities and positions at t+dt */
+        bNVT_scale_velocities( system, lambda );
+
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: scaled velocities\n",
+                 system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        verlet_part1_done = FALSE;
+        estimate_nbrs_done = 0;
+    }
+
+    return ret;
+}
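
The launch-size arithmetic repeated in the helpers above is plain ceiling division; for
positive n the two forms below are equivalent, shown only for clarity:

    blocks = system->n / DEF_BLOCK_SIZE
        + ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
    /* equivalently */
    blocks = (system->n + DEF_BLOCK_SIZE - 1) / DEF_BLOCK_SIZE;
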
diff --git a/PG-PuReMD/src/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_integrate.h
similarity index 86%
rename from PG-PuReMD/src/cuda_integrate.h
rename to PG-PuReMD/src/cuda/cuda_integrate.h
index b71e14e3..2797b3e3 100644
--- a/PG-PuReMD/src/cuda_integrate.h
+++ b/PG-PuReMD/src/cuda/cuda_integrate.h
@@ -22,18 +22,26 @@
 #ifndef __CUDA_INTEGRATE_H_
 #define __CUDA_INTEGRATE_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void bNVT_update_velocity_part1( reax_system *, real );
+
 void bNVT_update_velocity_part2( reax_system *, real );
+
 void bNVT_scale_velocities( reax_system *, real );
 
+int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
new file mode 100644
index 00000000..dc7a2fc3
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -0,0 +1,1113 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_lin_alg.h"
+
+#include "cuda_shuffle.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+
+#include "../basic_comm.h"
+
+
+//one thread per row
+CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results,
+        int rows )
+{
+    int i, col;
+    real results_row;
+    real val;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= rows )
+    {
+        return;
+    }
+
+    results_row = 0;
+
+    for (int c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
+
+        results_row += val * vec[col];
+    }
+
+    results[i] = results_row;
+}
+
+
+//32 thread warp per matrix row.
+//invoked as follows
+// <<< system->N, 32 >>>
+//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
+CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results,
+        int num_rows )
+{
+#if defined(__SM_35__)
+    real vals;
+    int x;
+#else
+    extern __shared__ real vals[];
+#endif
+    int jj;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    int row = warp_id;
+    
+#if defined(__SM_35__)
+    vals = 0;
+#else
+    vals[threadIdx.x] = 0;
+#endif
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        // compute running sum per thread
+        for ( jj = row_start + lane; jj < row_end;
+                jj += MATVEC_KER_THREADS_PER_ROW )
+#if defined(__SM_35__)
+        {
+            vals += H.entries[jj].val * vec[ H.entries[jj].j ];
+        }
+    }
+#else
+        {
+            vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ];
+        }
+    }
+
+    __syncthreads( );
+#endif
+
+    // parallel reduction in shared memory
+    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+#if defined(__SM_35__)
+    for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
+    {
+        vals += shfl( vals, x );
+    }
+
+    if (lane == 0 && row < num_rows)
+    {
+        results[row] = vals;
+    }
+#else
+    if (lane < 16)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 16];
+    }
+    __syncthreads( );
+    if (lane < 8)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 8];
+    }
+    __syncthreads( );
+    if (lane < 4)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 4];
+    }
+    __syncthreads( );
+    if (lane < 2)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 2];
+    }
+    __syncthreads( );
+    if (lane < 1)
+    {
+        vals[threadIdx.x] += vals[threadIdx.x + 1];
+    }
+    __syncthreads( );
+
+    // first thread writes the result
+    if (lane == 0 && row < num_rows)
+    {
+        results[row] = vals[threadIdx.x];
+    }
+#endif
+}
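+
+
+/* Illustrative note: in k_matvec_csr one MATVEC_KER_THREADS_PER_ROW-thread
+ * warp owns one CSR row, each lane strides over that row's entries, and the
+ * per-lane partial sums are folded by a warp shuffle (or shared-memory)
+ * reduction. A sequential host reference of the same product, assuming the
+ * start[]/end[] row bounds used above:
+ *
+ *     for ( i = 0; i < num_rows; ++i )
+ *     {
+ *         results[i] = 0.0;
+ *         for ( jj = H.start[i]; jj < H.end[i]; ++jj )
+ *         {
+ *             results[i] += H.entries[jj].val * vec[ H.entries[jj].j ];
+ *         }
+ *     }
+ */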
+
+
+//one thread per row
+CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results,
+        int rows )
+{
+    int i, c, col;
+    rvec2 results_row;
+    real val;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= rows)
+    {
+        return;
+    }
+
+    results_row[0] = 0.0;
+    results_row[1] = 0.0;
+
+    for (c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
+
+        results_row[0] += val * vec [col][0];
+        results_row[1] += val * vec [col][1];
+    }
+
+    results[i][0] = results_row[0];
+    results[i][1] = results_row[1];
+}
+
+
+//32 thread warp per matrix row.
+//invoked as follows
+// <<< system->N, 32 >>>
+//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
+CUDA_GLOBAL void  k_dual_matvec_csr( sparse_matrix H, rvec2 *vec,
+        rvec2 *results, int num_rows )
+{
+#if defined(__SM_35__)
+    rvec2 rvals;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    int row = warp_id;
+
+    rvals[0] = 0;
+    rvals[1] = 0;
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
+        {
+            rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
+
+    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2)
+    {
+        rvals[0] += shfl( rvals[0], s);
+        rvals[1] += shfl( rvals[1], s);
+    }
+
+    if (lane == 0 && row < num_rows)
+    {
+        results[row][0] = rvals[0];
+        results[row][1] = rvals[1];
+    }
+
+#else
+    extern __shared__ rvec2 rvals[];
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / 32;
+    int lane = thread_id & (32 - 1);
+    int row_start;
+    int row_end;
+    // one warp per row
+    //int row = warp_id;
+    int row = warp_id;
+
+    rvals[threadIdx.x][0] = 0;
+    rvals[threadIdx.x][1] = 0;
+
+    if (row < num_rows)
+    {
+        row_start = H.start[row];
+        row_end = H.end[row];
+
+        // compute running sum per thread
+        for(int jj = row_start + lane; jj < row_end; jj += 32)
+        {
+            rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
+
+    __syncthreads( );
+
+    // parallel reduction in shared memory
+    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+    if (lane < 16)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; 
+    }
+    __syncthreads( );
+    if (lane < 8)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; 
+    }
+    __syncthreads( );
+    if (lane < 4)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; 
+    }
+    __syncthreads( );
+    if (lane < 2)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; 
+    }
+    __syncthreads( );
+    if (lane < 1)
+    {
+        rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; 
+        rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; 
+    }
+    __syncthreads( );
+
+    // first thread writes the result
+    if (lane == 0 && row < num_rows)
+    {
+        results[row][0] = rvals[threadIdx.x][0];
+        results[row][1] = rvals[threadIdx.x][1];
+    }
+
+#endif
+}
+
+
+void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count )
+{
+    //res = ax + by
+    //use the cublas here
+    int blocks;
+
+    blocks = (count / DEF_BLOCK_SIZE) + 
+        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
+        ( res, a, x, b, y, count );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
+{
+    //res = a*b - vector multiplication
+    //use the cublas here.
+    int blocks;
+
+    blocks = (count / DEF_BLOCK_SIZE) + 
+        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
+        ( res, a, b, count );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
+{
+    storage *workspace;
+    int j;
+   
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( j >= n )
+    {
+        return;
+    }
+
+    workspace = &( p_workspace );
+
+    //for( j = 0; j < system->n; ++j ) {
+    // residual 
+    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
+    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
+
+    // apply diagonal pre-conditioner
+    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
+    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
+    //}
+}
+
+
+void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        (*workspace, b, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, 
+        real alpha_0, real alpha_1, int n, rvec2 *my_dot )
+{
+    storage *workspace;
+    rvec2 alpha;
+    int j;
+   
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( j >= n )
+    {
+        return;
+    }
+
+    workspace = &( p_workspace );
+    alpha[0] = alpha_0;
+    alpha[1] = alpha_1;
+    my_dot[j][0] = my_dot[j][1] = 0.0;
+
+    //for( j = 0; j < system->n; ++j ) {
+    // update x 
+    x[j][0] += alpha[0] * workspace->d2[j][0];
+    x[j][1] += alpha[1] * workspace->d2[j][1];      
+
+    // update residual 
+    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
+    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
+
+    // apply diagonal pre-conditioner 
+    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+
+    // dot product: r.p 
+    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
+    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
+    //}
+}
+
+
+void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
+        int n, rvec2 result )
+{
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    cuda_memset( tmp, 0, sizeof(rvec2) * ( 2 * n + 1),
+            "cuda_dualcg_preconditioner" );
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        (*workspace, x, alpha[0], alpha[1], n, tmp);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    //Reduction to calculate my_dot
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( tmp, tmp + n, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        ( tmp + n, tmp + 2*n, blocks);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
+            cudaMemcpyDeviceToHost, "my_dot" );
+}
+
+
+void Cuda_Norm( rvec2 *arr, int n, rvec2 result )
+{
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        (arr, tmp, n, INITIAL);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
+        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
+}
+
+
+void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n )
+{
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
+        ( a, b, tmp, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> 
+    //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * BLOCKS_POW_2 >>> 
+        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_dot" );
+}
+
+
+void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
+        ( x, a, b[0], b[1], c, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= n)
+    {
+        return;
+    }
+
+    dst[i] = src[i][index];
+}
+
+
+void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= n)
+    {
+        return;
+    }
+
+    dst[i][index] = src[i];
+}
+
+
+void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
+    cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );
+
+    //One thread per row implementation
+    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+    //        (*H, a, b, n);
+    //cudaThreadSynchronize ();
+    //cudaCheckError ();
+
+    //One warp per row implementation
+#if defined(__SM_35__)
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
+#else
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
+                      sizeof(rvec2) * MATVEC_BLOCK_SIZE >>>
+#endif
+            ( *H, a, b, n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
+{
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+
+    cuda_memset( b, 0, sizeof(real) * size, "matvec:result" );
+
+    //one thread per row implementation
+    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+    //        (*H, a, b, n);
+    //cudaThreadSynchronize ();
+    //cudaCheckError ();
+
+#if defined(__SM_35__)
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
+#else
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
+                 sizeof(real) * MATVEC_BLOCK_SIZE>>>
+#endif
+                     (*H, a, b, n);
+
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
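+
+
+/* Note (assumed launch contract, not enforced here): both matvec wrappers use
+ * one warp per matrix row, so the grid configured elsewhere (init_blocks) is
+ * expected to satisfy roughly
+ *
+ *     MATVEC_BLOCKS * MATVEC_BLOCK_SIZE >= num_rows * MATVEC_KER_THREADS_PER_ROW
+ *
+ * while the non-__SM_35__ path additionally reserves one shared-memory slot
+ * per thread for the in-block reduction. */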
+
+
+int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
+        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
+        simulation_data *data )
+{
+    int  i, j, n, N, matvecs, scale;
+    rvec2 tmp, alpha, beta;
+    rvec2 my_sum, norm_sqr, b_norm, my_dot;
+    rvec2 sig_old, sig_new;
+    MPI_Comm comm;
+    rvec2 *spad = (rvec2 *) host_scratch;
+    int a;
+
+    n = system->n;
+    N = system->N;
+    comm = mpi_data->world;
+    matvecs = 0;
+    scale = sizeof(rvec2) / sizeof(void);
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        matvecs = 0;
+        t_start = matvec_time = dot_time = 0;
+        t_start = Get_Time( );
+    }
+#endif
+
+    //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
+//#endif
+
+//  check_zeros_device( x, system->N, "x" );
+
+    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
+    Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
+
+//  check_zeros_device( x, system->N, "x" );
+
+//  compare_rvec2 (workspace->x, x, N, "x");
+//  if (data->step > 0) {
+//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
+//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
+//
+//      exit (0);
+//  }
+
+
+//#ifdef __CUDA_DEBUG__
+//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
+//#endif
+    //originally we were using only H->n which was system->n (init_md.c)
+    //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
+    
+    Cuda_Dual_Matvec( H, x, dev_workspace->q2, system->N, system->total_cap );
+
+//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+//  if (data->step > 0) exit (0);
+
+    // tryQEq
+    //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
+//#endif
+    
+    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+            cudaMemcpyDeviceToHost, "CG:q2:get" );
+    Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
+    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+            cudaMemcpyHostToDevice,"CG:q2:put" );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &matvec_time );
+    }
+#endif
+
+//#ifdef __CUDA_DEBUG__
+//  for( j = 0; j < system->n; ++j ) {
+//    // residual
+//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
+//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
+//    // apply diagonal pre-conditioner
+//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+//  }
+//#endif
+    
+    Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
+
+//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
+//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
+
+    /* norm of b */
+//#ifdef __CUDA_DEBUG__
+//  my_sum[0] = my_sum[1] = 0;
+//  for( j = 0; j < n; ++j ) {
+//    my_sum[0] += SQR( workspace->b[j][0] );
+//    my_sum[1] += SQR( workspace->b[j][1] );
+//  }
+//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
+//#endif
+
+    my_sum[0] = my_sum[1] = 0;
+    Cuda_Norm( b, n, my_sum );
+
+//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
+
+    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
+    b_norm[0] = SQRT( norm_sqr[0] );
+    b_norm[1] = SQRT( norm_sqr[1] );
+    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
+
+    /* dot product: r.d */
+//#ifdef __CUDA_DEBUG__
+//  my_dot[0] = my_dot[1] = 0;
+//  for( j = 0; j < n; ++j ) {
+//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
+//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
+//  }
+//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+    my_dot[0] = my_dot[1] = 0;
+    Cuda_Dot( dev_workspace->r2, dev_workspace->d2, my_dot, n );
+
+// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
+    
+    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
+    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &dot_time );
+    }
+#endif
+
+    for ( i = 1; i < 300; ++i )
+    {
+        //MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
+//#endif
+        
+        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cg:d2:get" );
+        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
+        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyHostToDevice, "cg:d2:put" );
+
+        //print_device_rvec2 (dev_workspace->d2, N);
+
+//#ifdef __CUDA_DEBUG__
+//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
+//#endif
+        
+        Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
+                system->total_cap );
+
+        /*
+        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
+        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
+        print_sparse_matrix_host (&workspace->H);
+        fprintf (stderr, " ******* HOST Vector ***************\n");
+        print_host_rvec2 (workspace->d2, system->N);
+        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
+        print_sparse_matrix (&dev_workspace->H);
+        fprintf (stderr, " ******* Device Vector ***************\n");
+        print_device_rvec2 (dev_workspace->d2, system->N);
+        */
+        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+        // tryQEq
+        // MVAPICH2
+//#ifdef __CUDA_DEBUG__
+//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
+//#endif
+
+        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cg:q2:get" );
+        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
+        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
+                cudaMemcpyHostToDevice, "cg:q2:put" );
+
+//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &matvec_time );
+        }
+#endif
+
+        /* dot product: d.q */
+//#ifdef __CUDA_DEBUG__
+//    my_dot[0] = my_dot[1] = 0;
+//    for( j = 0; j < n; ++j ) {
+//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
+//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
+//    }
+//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
+        my_dot[0] = my_dot[1] = 0;
+        Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
+        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
+        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
+
+        alpha[0] = sig_new[0] / tmp[0];
+        alpha[1] = sig_new[1] / tmp[1];
+        my_dot[0] = my_dot[1] = 0;
+
+//#ifdef __CUDA_DEBUG__
+//    for( j = 0; j < system->n; ++j ) {
+//      // update x
+//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
+//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
+//      // update residual
+//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
+//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
+//      // apply diagonal pre-conditioner
+//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+//      // dot product: r.p
+//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
+//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
+//    }
+//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+//#endif
+
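+        /* fuses the x and r2 updates, the diagonal preconditioning step, and the local
+           r2 . p2 dot product, mirroring the commented host loop above */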
+        my_dot[0] = my_dot[1] = 0;
+        Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
+
+        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
+
+//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
+//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
+//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
+
+        sig_old[0] = sig_new[0];
+        sig_old[1] = sig_new[1];
+        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
+
+        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &dot_time );
+        }
+#endif
+
+        if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
+        {
+            break;
+        }
+
+        beta[0] = sig_new[0] / sig_old[0];
+        beta[1] = sig_new[1] / sig_old[1];
+
+//#ifdef __CUDA_DEBUG__
+//    for( j = 0; j < system->n; ++j ) {
+//      // d = p + beta * d
+//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
+//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
+//    }
+//#endif
+
+        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
+                dev_workspace->d2, system->n );
+
+//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
+    }
+
+
+    if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
+    {
+        //for( j = 0; j < n; ++j )
+        //  workspace->t[j] = workspace->x[j][1];
+        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
+
+        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
+
+        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
+        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
+
+        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
+                mpi_data, fout );
+
+        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
+        //for( j = 0; j < n; ++j )
+        //  workspace->x[j][1] = workspace->t[j];
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
+    }
+    else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
+    {
+        //for( j = 0; j < n; ++j )
+        //  workspace->s[j] = workspace->x[j][0];
+
+        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
+
+        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
+        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
+
+        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
+
+        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
+                mpi_data, fout );
+
+        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
+        //for( j = 0; j < system->n; ++j )
+        //  workspace->x[j][0] = workspace->s[j];
+
+        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
+    }
+
+    if ( i >= 300 )
+    {
+        fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
+                system->my_rank, i );
+        fprintf( stderr, "    [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] );
+        fprintf( stderr, "    [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] );
+    }
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
+                i + 1, matvecs, matvec_time, dot_time );
+    }
+#endif
+
+    return (i + 1) + matvecs;
+}
+
+
+int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
+        *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+{
+    int  i, j, scale;
+    real tmp, alpha, beta, b_norm;
+    real sig_old, sig_new, sig0;
+    real *spad = (real *) host_scratch;
+
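+    /* note: sizeof(void) is a GCC extension that evaluates to 1, so scale == sizeof(real) */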
+    scale = sizeof(real) / sizeof(void);
+
+    /* x is on the device */
+    //MVAPICH2
+    memset( spad, 0, sizeof(real) * system->total_cap );
+    copy_host_device( spad, x, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
+    Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
+
+    //MVAPICH2
+    copy_host_device( spad, x, sizeof(real) * system->total_cap,
+            cudaMemcpyHostToDevice, "cuda_cg:x:put" );
+    Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
+
+    // tryQEq
+    // MVAPICH2
+    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+    Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+
+    //MVAPICH2
+    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+            cudaMemcpyHostToDevice, "cuda_cg:q:put" );
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &matvec_time );
+    }
+#endif
+
+    Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
+            system->n );
+    //for( j = 0; j < system->n; ++j )
+    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
+            dev_workspace->Hdia_inv, system->n );
+
+    //TODO do the parallel_norm on the device for the local sum
+    copy_host_device( spad, b, sizeof(real) * system->n,
+            cudaMemcpyDeviceToHost, "cuda_cg:b:get" );
+    b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
+
+    //TODO do the parallel dot on the device for the local sum
+    copy_host_device( spad, dev_workspace->r, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
+    copy_host_device( spad + system->total_cap, dev_workspace->d, sizeof(real) * system->total_cap,
+            cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+    sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
+            mpi_data->world );
+
+    sig0 = sig_new;
+
+#if defined(CG_PERFORMANCE)
+    if ( system->my_rank == MASTER_NODE )
+    {
+        Update_Timing_Info( &t_start, &dot_time );
+    }
+#endif
+
+    for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
+    {
+        //MVAPICH2
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+        Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
+                cudaMemcpyHostToDevice, "cuda_cg:d:put" );
+
+        Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
+
+        //tryQEq
+        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+        Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
+        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
+                cudaMemcpyHostToDevice, "cuda_cg:q:put" );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &matvec_time );
+        }
+#endif
+
+        //TODO do the parallel dot on the device for the local sum
+        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
+        copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
+        tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
+
+        alpha = sig_new / tmp;
+        //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
+        Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
+
+        //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
+        Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
+                dev_workspace->r, system->n );
+        /* pre-conditioning */
+        //for( j = 0; j < system->n; ++j )
+        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
+                dev_workspace->Hdia_inv, system->n );
+
+        sig_old = sig_new;
+
+        //TODO do the parallel dot on the device for the local sum
+        copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
+        copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
+                cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
+        sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
+        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
+
+        beta = sig_new / sig_old;
+        Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
+                dev_workspace->d, system->n );
+
+#if defined(CG_PERFORMANCE)
+        if ( system->my_rank == MASTER_NODE )
+        {
+            Update_Timing_Info( &t_start, &dot_time );
+        }
+#endif
+    }
+
+    if ( i >= 300 )
+    {
+        fprintf( stderr, "CG convergence failed!\n" );
+        return i;
+    }
+
+    return i;
+}
diff --git a/PG-PuReMD/src/cuda_lin_alg.h b/PG-PuReMD/src/cuda/cuda_lin_alg.h
similarity index 52%
rename from PG-PuReMD/src/cuda_lin_alg.h
rename to PG-PuReMD/src/cuda/cuda_lin_alg.h
index a7e3cc5f..aa31c126 100644
--- a/PG-PuReMD/src/cuda_lin_alg.h
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.h
@@ -22,29 +22,44 @@
 #ifndef __CUDA_LIN_ALG_H_
 #define __CUDA_LIN_ALG_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+void Cuda_Vector_Sum( real *, real, real *, real, real *, int );
 
-void Cuda_Vector_Sum(real *res, real a, real *x, real b, real *y, int count);
-void Cuda_CG_Preconditioner(real *res, real *a, real *b, int count);
-void Cuda_CG_Diagonal_Preconditioner(storage *workspace, rvec2 *b, int n);
-void Cuda_DualCG_Preconditioner(storage *workspace, rvec2 *, rvec2 alpha, int n, rvec2 result);
-void Cuda_Norm(rvec2 *arr, int n, rvec2 result);
-void Cuda_Dot(rvec2 *a, rvec2 *b, rvec2 result, int n);
-void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *, rvec2 , rvec2 *c, int n);
-void Cuda_RvecCopy_From(real *dst, rvec2 *src, int index, int n);
-void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n);
-void Cuda_Dual_Matvec(sparse_matrix *, rvec2 *, rvec2 *, int , int);
-void Cuda_Matvec(sparse_matrix *, real *, real *, int , int);
+void Cuda_CG_Preconditioner( real *, real *, real *, int );
 
+void Cuda_CG_Diagonal_Preconditioner( storage *, rvec2 *, int );
+
+void Cuda_DualCG_Preconditioner( storage *, rvec2 *, rvec2, int, rvec2 );
+
+void Cuda_Norm( rvec2 *, int, rvec2 );
+
+void Cuda_Dot( rvec2 *, rvec2 *, rvec2, int );
+
+void Cuda_Vector_Sum_Rvec2( rvec2 *, rvec2 *, rvec2, rvec2 *, int );
+
+void Cuda_RvecCopy_From( real *, rvec2 *, int, int );
+
+void Cuda_RvecCopy_To( rvec2 *, real *, int, int );
+
+void Cuda_Dual_Matvec( sparse_matrix *, rvec2 *, rvec2 *, int , int );
+
+void Cuda_Matvec( sparse_matrix *, real *, real *, int , int );
+
+int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*,
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data * );
+
+int Cuda_CG( reax_system*, storage*, sparse_matrix*,
+        real*, real, real*, mpi_datatypes*, FILE* );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_list.cu b/PG-PuReMD/src/cuda/cuda_list.cu
similarity index 96%
rename from PG-PuReMD/src/cuda_list.cu
rename to PG-PuReMD/src/cuda/cuda_list.cu
index 21d8d091..9d0626f1 100644
--- a/PG-PuReMD/src/cuda_list.cu
+++ b/PG-PuReMD/src/cuda/cuda_list.cu
@@ -19,15 +19,14 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
 #include "cuda_utils.h"
 
 #if defined(PURE_REAX)
-  #include "list.h"
-  #include "tool_box.h"
+  #include "../list.h"
+  #include "../tool_box.h"
 #elif defined(LAMMPS_REAX)
-  #include "reax_list.h"
-  #include "reax_tool_box.h"
+  #include "../reax_list.h"
+  #include "../reax_tool_box.h"
 #endif
 
 
diff --git a/PG-PuReMD/src/cuda_list.h b/PG-PuReMD/src/cuda/cuda_list.h
similarity index 98%
rename from PG-PuReMD/src/cuda_list.h
rename to PG-PuReMD/src/cuda/cuda_list.h
index 0b4e7aa0..fe06f4ce 100644
--- a/PG-PuReMD/src/cuda_list.h
+++ b/PG-PuReMD/src/cuda/cuda_list.h
@@ -22,13 +22,15 @@
 #ifndef __CUDA_LIST_H_
 #define __CUDA_LIST_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 void Dev_Make_List( int, int, int, reax_list* );
+
 void Dev_Delete_List( reax_list* );
 
 #ifdef __cplusplus
diff --git a/PG-PuReMD/src/cuda_lookup.cu b/PG-PuReMD/src/cuda/cuda_lookup.cu
similarity index 98%
rename from PG-PuReMD/src/cuda_lookup.cu
rename to PG-PuReMD/src/cuda/cuda_lookup.cu
index 837a3c71..01bc8a79 100644
--- a/PG-PuReMD/src/cuda_lookup.cu
+++ b/PG-PuReMD/src/cuda/cuda_lookup.cu
@@ -1,8 +1,9 @@
 
 #include "cuda_lookup.h"
-#include "index_utils.h"
+
 #include "cuda_utils.h"
-#include "reax_types.h"
+
+#include "../index_utils.h"
 
 
 void copy_LR_table_to_device( reax_system *system, control_params *control,
diff --git a/PG-PuReMD/src/cuda_lookup.h b/PG-PuReMD/src/cuda/cuda_lookup.h
similarity index 56%
rename from PG-PuReMD/src/cuda_lookup.h
rename to PG-PuReMD/src/cuda/cuda_lookup.h
index 88f5cfce..87026f7d 100644
--- a/PG-PuReMD/src/cuda_lookup.h
+++ b/PG-PuReMD/src/cuda/cuda_lookup.h
@@ -2,16 +2,18 @@
 #ifndef __CUDA_LOOKUP_H__
 #define __CUDA_LOOKUP_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void copy_LR_table_to_device (reax_system *, control_params *, int *);
+void copy_LR_table_to_device( reax_system *, control_params *, int * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_multi_body.cu b/PG-PuReMD/src/cuda/cuda_multi_body.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_multi_body.cu
rename to PG-PuReMD/src/cuda/cuda_multi_body.cu
index 09a12963..cb741571 100644
--- a/PG-PuReMD/src/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.cu
@@ -19,12 +19,13 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
 #include "cuda_multi_body.h"
-#include "index_utils.h"
+
 #include "cuda_helpers.h"
 #include "cuda_list.h"
 
+#include "../index_utils.h"
+
 
 CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, global_parameters gp, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
diff --git a/PG-PuReMD/src/cuda_multi_body.h b/PG-PuReMD/src/cuda/cuda_multi_body.h
similarity index 58%
rename from PG-PuReMD/src/cuda_multi_body.h
rename to PG-PuReMD/src/cuda/cuda_multi_body.h
index 332e6f06..06014b3a 100644
--- a/PG-PuReMD/src/cuda_multi_body.h
+++ b/PG-PuReMD/src/cuda/cuda_multi_body.h
@@ -22,21 +22,14 @@
 #ifndef __CUDA_MULTI_BODY_H_
 #define __CUDA_MULTI_BODY_H_
 
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Atom_Energy(  reax_atom *,
-                                    global_parameters ,
-                                    single_body_parameters *,
-                                    two_body_parameters *,
-                                    storage ,
-                                    reax_list ,
-                                    int ,
-                                    int ,
-                                    real *,
-                                    real *,
-                                    real *
-                                 );
-
-CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess (reax_list, storage, int );
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *, global_parameters,
+        single_body_parameters *, two_body_parameters *, storage,
+        reax_list, int, int, real *, real *, real *);
+
+CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess( reax_list, storage, int );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_neighbors.cu
rename to PG-PuReMD/src/cuda/cuda_neighbors.cu
index f9a20ebd..b1f2b85d 100644
--- a/PG-PuReMD/src/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -21,15 +21,13 @@
 
 #include "cuda_neighbors.h"
 
-#include "reax_types.h"
-
 #include "cuda_list.h"
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 
-#include "vector.h"
-#include "index_utils.h"
-#include "tool_box.h"
+#include "../index_utils.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
 CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) 
diff --git a/PG-PuReMD/src/cuda_neighbors.h b/PG-PuReMD/src/cuda/cuda_neighbors.h
similarity index 95%
rename from PG-PuReMD/src/cuda_neighbors.h
rename to PG-PuReMD/src/cuda/cuda_neighbors.h
index f7d7cb15..4d4a9c4e 100644
--- a/PG-PuReMD/src/cuda_neighbors.h
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.h
@@ -2,14 +2,13 @@
 #ifndef __CUDA_NEIGHBORS_H__
 #define __CUDA_NEIGHBORS_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
 void Cuda_Generate_Neighbor_Lists( reax_system *, simulation_data *, storage *, reax_list ** );
 
 int Cuda_Estimate_Neighbors( reax_system *, int );
@@ -24,9 +23,9 @@ void Cuda_Init_Sparse_Matrix_Indices( reax_system *, sparse_matrix * );
 
 void Cuda_Init_Three_Body_Indices( int *, int );
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_nonbonded.cu b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_nonbonded.cu
rename to PG-PuReMD/src/cuda/cuda_nonbonded.cu
index 93bca2da..25c0b17d 100644
--- a/PG-PuReMD/src/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.cu
@@ -25,10 +25,9 @@
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 #include "cuda_shuffle.h"
-#include "vector.h"
 
-#include "reax_types.h"
-#include "index_utils.h"
+#include "../index_utils.h"
+#include "../vector.h"
 
 
 //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy(    
diff --git a/PG-PuReMD/src/cuda_nonbonded.h b/PG-PuReMD/src/cuda/cuda_nonbonded.h
similarity index 79%
rename from PG-PuReMD/src/cuda_nonbonded.h
rename to PG-PuReMD/src/cuda/cuda_nonbonded.h
index 1c9916bf..238d49d7 100644
--- a/PG-PuReMD/src/cuda_nonbonded.h
+++ b/PG-PuReMD/src/cuda/cuda_nonbonded.h
@@ -19,15 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __NONBONDED_H_
-#define __NONBONDED_H_
+#ifndef __CUDA_NONBONDED_H_
+#define __CUDA_NONBONDED_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 void Cuda_Compute_Polarization_Energy( reax_system *, simulation_data *);
-void Cuda_NonBonded_Energy ( reax_system *, control_params *,
-                             storage *, simulation_data *, reax_list **,
-                             output_controls *, bool );
+
+void Cuda_NonBonded_Energy( reax_system *, control_params *,
+        storage *, simulation_data *, reax_list **,
+        output_controls *, bool );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_post_evolve.cu b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
similarity index 95%
rename from PG-PuReMD/src/cuda_post_evolve.cu
rename to PG-PuReMD/src/cuda/cuda_post_evolve.cu
index 9a478192..828a0e4b 100644
--- a/PG-PuReMD/src/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.cu
@@ -1,9 +1,10 @@
 
 #include "cuda_post_evolve.h"
-#include "reax_types.h"
-#include "vector.h"
+
 #include "cuda_utils.h"
 
+#include "../vector.h"
+
 
 CUDA_GLOBAL void ker_post_evolve( reax_atom *my_atoms, 
         simulation_data *data, int n )
diff --git a/PG-PuReMD/src/cuda_post_evolve.h b/PG-PuReMD/src/cuda/cuda_post_evolve.h
similarity index 60%
rename from PG-PuReMD/src/cuda_post_evolve.h
rename to PG-PuReMD/src/cuda/cuda_post_evolve.h
index dcdcd50c..a1a0571a 100644
--- a/PG-PuReMD/src/cuda_post_evolve.h
+++ b/PG-PuReMD/src/cuda/cuda_post_evolve.h
@@ -2,16 +2,18 @@
 #ifndef __CUDA_POST_EVOLVE_H__
 #define __CUDA_POST_EVOLVE_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void post_evolve_velocities (reax_system *, simulation_data *);
+void post_evolve_velocities( reax_system *, simulation_data * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_reduction.cu b/PG-PuReMD/src/cuda/cuda_reduction.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_reduction.cu
rename to PG-PuReMD/src/cuda/cuda_reduction.cu
index 02d800ee..01bd3c81 100644
--- a/PG-PuReMD/src/cuda_reduction.cu
+++ b/PG-PuReMD/src/cuda/cuda_reduction.cu
@@ -4,10 +4,10 @@
 #include "cuda_shuffle.h"
 #include "cuda_utils.h"
 
-#include "vector.h"
+#include "../vector.h"
 
-#include "cub/cub/device/device_reduce.cuh"
-#include "cub/cub/device/device_scan.cuh"
+#include "../cub/cub/device/device_reduce.cuh"
+#include "../cub/cub/device/device_scan.cuh"
 
 
 //struct RvecSum
diff --git a/PG-PuReMD/src/cuda_reduction.h b/PG-PuReMD/src/cuda/cuda_reduction.h
similarity index 96%
rename from PG-PuReMD/src/cuda_reduction.h
rename to PG-PuReMD/src/cuda/cuda_reduction.h
index 15ca538f..cf9efc5d 100644
--- a/PG-PuReMD/src/cuda_reduction.h
+++ b/PG-PuReMD/src/cuda/cuda_reduction.h
@@ -2,32 +2,45 @@
 #ifndef __CUDA_REDUCTION_H__
 #define __CUDA_REDUCTION_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 #define  INITIAL  0
 #define  FINAL    1
 
 
 void Cuda_Reduction_Sum( int *, int *, size_t );
+
 void Cuda_Reduction_Sum( real *, real *, size_t );
+
 //void Cuda_Reduction_Sum( rvec *, rvec *, size_t );
+
 void Cuda_Reduction_Max( int *, int *, size_t );
+
 void Cuda_Scan_Excl_Sum( int *, int *, size_t );
 
 CUDA_GLOBAL void k_reduction( const real *, real *, const size_t );
+
 CUDA_GLOBAL void k_reduction_rvec( rvec *, rvec *, size_t );
+
 CUDA_GLOBAL void k_reduction_rvec2( rvec2 *, rvec2 *, size_t );
+
 CUDA_GLOBAL void k_norm( const real *, real *, const size_t, int );
+
 CUDA_GLOBAL void k_dot( const real *, const real *, real *,
         const size_t );
 
 CUDA_GLOBAL void k_vector_sum( real*, real, real*, real,
         real*, int );
+
 CUDA_GLOBAL void k_rvec2_pbetad( rvec2 *, rvec2 *, real, real,
         rvec2 *, int );
+
 CUDA_GLOBAL void k_rvec2_mul( rvec2*, rvec2*, rvec2*, int );
+
 CUDA_GLOBAL void k_vector_mul( real*, real*, real*, int );
+
 CUDA_GLOBAL void k_norm_rvec2( const rvec2 *, rvec2 *, const size_t, int );
+
 CUDA_GLOBAL void k_dot_rvec2( const rvec2 *, rvec2 *, rvec2 *, const size_t );
 
 
diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
similarity index 98%
rename from PG-PuReMD/src/cuda_reset_tools.cu
rename to PG-PuReMD/src/cuda/cuda_reset_tools.cu
index 27cb4580..ca435269 100644
--- a/PG-PuReMD/src/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.cu
@@ -5,7 +5,7 @@
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
 
-#include "reset_tools.h"
+#include "../reset_tools.h"
 
 
 extern "C"
diff --git a/PG-PuReMD/src/cuda_reset_tools.h b/PG-PuReMD/src/cuda/cuda_reset_tools.h
similarity index 94%
rename from PG-PuReMD/src/cuda_reset_tools.h
rename to PG-PuReMD/src/cuda/cuda_reset_tools.h
index f158afec..2e90b8eb 100644
--- a/PG-PuReMD/src/cuda_reset_tools.h
+++ b/PG-PuReMD/src/cuda/cuda_reset_tools.h
@@ -2,13 +2,13 @@
 #ifndef __CUDA_RESET_TOOLS_H__
 #define __CUDA_RESET_TOOLS_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 void Cuda_Reset_Workspace( reax_system *, storage * );
 
 void Cuda_Reset_Atoms( reax_system *, control_params * );
@@ -19,9 +19,9 @@ int  Cuda_Reset_Neighbor_Lists( reax_system *, control_params *,
 void Cuda_Reset( reax_system*, control_params*, simulation_data*,
         storage*, reax_list** );
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_shuffle.h b/PG-PuReMD/src/cuda/cuda_shuffle.h
similarity index 97%
rename from PG-PuReMD/src/cuda_shuffle.h
rename to PG-PuReMD/src/cuda/cuda_shuffle.h
index f8dfddfa..0d687271 100644
--- a/PG-PuReMD/src/cuda_shuffle.h
+++ b/PG-PuReMD/src/cuda/cuda_shuffle.h
@@ -22,8 +22,7 @@
 #ifndef __CUDA_SHUFFLE_H_
 #define __CUDA_SHUFFLE_H_
 
-#include "reax_types.h"
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
new file mode 100644
index 00000000..54957d00
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -0,0 +1,1026 @@
+
+#include "cuda_system_props.h"
+
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+#include "cuda_copy.h"
+#include "cuda_shuffle.h"
+
+#include "../vector.h"
+
+
+CUDA_GLOBAL void center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec xcm[];
+    extern __shared__ rvec vcm[];
+    extern __shared__ rvec amcm[];
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 *(blockDim.x);
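+    /* note: xcm, vcm, and amcm all alias the same dynamic shared memory block
+     * (extern __shared__ arrays share one allocation), hence the explicit offsets above */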
+
+    unsigned int index = 0;
+    rvec tmp;
+    real m;
+
+    rvec_MakeZero (xcm [threadIdx.x]);
+    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
+    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
+    rvec_MakeZero (tmp);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
+        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
+    }
+    __syncthreads ();
+
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (xcm [threadIdx.x], xcm[index]);
+            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
+            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
+    }
+}
+
+
+#if defined( __SM_35__)
+CUDA_GLOBAL void center_of_mass_blocks_xcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, size_t n )
+{
+    extern __shared__ rvec my_xcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int index = 0;
+    rvec xcm;
+    real m;
+
+    rvec_MakeZero (xcm);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm , m, atoms [i].x);
+    }
+    __syncthreads ();
+
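+    /* warp-level shuffle reduction via the shfl() helper (cuda_shuffle.h); each warp
+     * leader then writes its partial sum to shared memory for the block-level
+     * reduction below */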
+    for (int z = 16; z >= 1; z /= 2){
+        xcm[0] += shfl( xcm[0], z);
+        xcm[1] += shfl( xcm[1], z);
+        xcm[2] += shfl( xcm[2], z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
+    __syncthreads ();
+
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
+        }
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0))
+        rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
+}
+
+
+CUDA_GLOBAL void center_of_mass_blocks_vcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_vcm, size_t n )
+{
+    extern __shared__ rvec my_vcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    rvec vcm;
+    real m;
+
+    rvec_MakeZero (vcm);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (vcm , m, atoms [i].v);
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        vcm[0] += shfl( vcm[0], z);
+        vcm[1] += shfl( vcm[1], z);
+        vcm[2] += shfl( vcm[2], z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
+    __syncthreads ();
+
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
+        }
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0))
+        rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
+}
+
+
+CUDA_GLOBAL void center_of_mass_blocks_amcm( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec my_amcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    rvec amcm;
+    real m;
+    rvec tmp;
+
+    rvec_MakeZero (amcm);
+    rvec_MakeZero( tmp );
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm, m, tmp);
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        amcm[0] += shfl( amcm[0], z);
+        amcm[1] += shfl( amcm[1], z);
+        amcm[2] += shfl( amcm[2], z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
+    __syncthreads ();
+
+
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
+        }
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
+    }
+}
+#endif
+
+
+CUDA_GLOBAL void center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm, 
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
+{
+    extern __shared__ rvec sh_xcm[];
+    extern __shared__ rvec sh_vcm[];
+    extern __shared__ rvec sh_amcm[];
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 * (blockDim.x);
+
+    unsigned int index = 0;
+    rvec t_xcm, t_vcm, t_amcm;
+
+    rvec_MakeZero (t_xcm);
+    rvec_MakeZero (t_vcm);
+    rvec_MakeZero (t_amcm);
+
+    if (i < n){
+        rvec_Copy ( t_xcm, xcm[threadIdx.x]);
+        rvec_Copy ( t_vcm, vcm[threadIdx.x]);
+        rvec_Copy ( t_amcm, amcm[threadIdx.x]);
+    }
+
+    rvec_Copy (sh_xcm[xcm_id], t_xcm);
+    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
+    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
+
+    __syncthreads ();
+
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if (threadIdx.x < offset) {
+            index = threadIdx.x + offset;
+            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
+            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
+            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0){
+        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass( single_body_parameters *sbp, 
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
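+    /* xx..zz all alias one dynamic shared memory block; the *_i offsets partition it
+     * into six segments of blockDim.x reals each */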
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx[ xx_i ] = diff[0] * diff[0] * m;
+        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
+        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
+        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
+        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
+        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            xx[ threadIdx.x ] += xx[ index ];
+            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
+        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
+        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
+        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
+        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass( real *input, real *output, size_t n )
+{
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n)
+    {
+        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
+        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
+        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
+        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
+        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
+        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset )
+        {
+            index = threadIdx.x + offset;
+            xx [ threadIdx.x ] += xx [ index ];
+            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0)
+    {
+        output[0] = xx[0];
+        output[1] = xy[xy_i];
+        output[2] = xz[xz_i];
+        output[3] = yy[yy_i];
+        output[4] = yz[yz_i];
+        output[5] = zz[zz_i];
+    }
+}
+
+
+#if defined( __SM_35__)
+CUDA_GLOBAL void compute_center_mass_xx_xy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xx[];
+    extern __shared__ real my_results_xy[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xx = 0;
+    real xy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx = diff[0] * diff[0] * m;
+        xy = diff[0] * diff[1] * m;
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        xx += shfl( xx, z);
+        xy += shfl( xy, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_xx[threadIdx.x >> 5] = xx;
+        /* store xy at the xy_i offset so it matches the block-level reduction below */
+        my_results_xy[xy_i + (threadIdx.x >> 5)] = xy;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
+            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass_xz_yy( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_xz[];
+    extern __shared__ real my_results_yy[];
+
+    unsigned int yy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xz = 0;
+    real yy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xz = diff[0] * diff[2] * m;
+        yy = diff[1] * diff[1] * m;
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        xz += shfl( xz, z);
+        yy += shfl( yy, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_xz[threadIdx.x >> 5] = xz;
+        /* store yy at the yy_i offset so it matches the block-level reduction below */
+        my_results_yy[yy_i + (threadIdx.x >> 5)] = yy;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
+            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
+        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
+    }
+}
+
+
+CUDA_GLOBAL void compute_center_mass_yz_zz( single_body_parameters *sbp,
+        reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2,
+        size_t n )
+{
+    extern __shared__ real my_results_yz[];
+    extern __shared__ real my_results_zz[];
+
+    unsigned int zz_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real yz = 0;
+    real zz = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    if (i < n)
+    {
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        yz = diff[1] * diff[2] * m;
+        zz = diff[2] * diff[2] * m;
+    }
+    __syncthreads ();
+
+    for (int z = 16; z >= 1; z /= 2){
+        yz += shfl( yz, z);
+        zz += shfl( zz, z);
+    }
+    __syncthreads ();
+
+    if (threadIdx.x % 32 == 0){
+        my_results_yz[threadIdx.x >> 5] = yz;
+        /* store zz at the zz_i offset so it matches the block-level reduction below */
+        my_results_zz[zz_i + (threadIdx.x >> 5)] = zz;
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
+            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
+        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
+    }
+}
+#endif
+
+
+CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, 
+        real *block_results, int n )
+{
+#if defined(__SM_35__)
+    extern __shared__ real my_sbp[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real sdata = 0;
+
+    if (i < n)
+    {
+        sdata = sbp[ my_atoms[i].type ].mass;
+    }
+    __syncthreads( );
+
+    for(int z = 16; z >=1; z/=2)
+    {
+        sdata += shfl( sdata, z);
+    }
+
+    if (threadIdx.x % 32 == 0)
+    {
+        my_sbp[threadIdx.x >> 5] = sdata;
+    }
+
+    __syncthreads( );
+
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if(threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = my_sbp[0];
+    }
+
+#else
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if (i < n)
+    {
+        x = sbp[ my_atoms[i].type ].mass;
+    }
+
+    sdata[ threadIdx.x ] = x;
+    __syncthreads( );
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[ blockIdx.x] = sdata [0];
+    }
+
+#endif
+}
+
+
+extern "C" void dev_compute_total_mass( reax_system *system, real *local_val )
+{
+    real *block_mass = (real *) scratch;
+    cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" );
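+    /* two-stage reduction: per-block partial masses, then a single-block k_reduction
+     * that leaves the total at block_mass + BLOCKS_POW_2 */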
+
+    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), 
+            cudaMemcpyDeviceToHost, "total_mass:tmp");
+}
+
+
+CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, 
+        real *block_results, int n )
+{
+#if defined(__SM_35__)
+    extern __shared__ real my_sbpdot[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real sdata = 0;
+    rvec p;
+
+    if (i < n)
+    {
+        sdata = sbp[ my_atoms[i].type ].mass;
+        rvec_Scale( p, sdata, my_atoms[ i ].v );
+        sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
+    }
+
+    __syncthreads( );
+
+    for(int z = 16; z >=1; z/=2)
+    {
+        sdata += shfl( sdata, z);
+    }
+
+    if (threadIdx.x % 32 == 0)
+    {
+        my_sbpdot[threadIdx.x >> 5] = sdata;
+    }
+
+    __syncthreads( );
+
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = my_sbpdot[0];
+    }
+
+#else
+    extern __shared__ real sdata [];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real    m = 0;
+    rvec p;
+
+    if (i < n)
+    {
+        m = sbp[ my_atoms[i].type ].mass;
+        rvec_Scale( p, m, my_atoms[ i ].v );
+        m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
+    }
+
+    sdata[ threadIdx.x ] = m;
+    __syncthreads( );
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads( );
+    }
+
+    if (threadIdx.x == 0)
+    {
+        block_results[blockIdx.x] = sdata[0];
+    }
+#endif
+}
+
+extern "C" void dev_compute_kinetic_energy( reax_system *system,
+        simulation_data *data, real *local_val )
+{
+    real *block_energy = (real *) scratch;
+    cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" );
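+    /* per-block partial sums of 0.5 * m * (v . v), followed by a single-block reduction
+     * that leaves the local kinetic energy at block_energy + BLOCKS_POW_2 */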
+
+    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( local_val, block_energy + BLOCKS_POW_2,
+            sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
+    //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
+    //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
+    //        sizeof (real), "kinetic_energy");
+}
+
+
+extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, 
+        rvec vcm, rvec amcm )
+{
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    rvec *r_scratch = (rvec *)scratch;
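+    /* on __SM_35__ hardware, xcm, vcm, and amcm are each reduced in a separate pass that
+     * reuses the same scratch region; otherwise scratch is split into three segments of
+     * (BLOCKS_POW_2 + 1) rvecs, with each final reduced value landing in the last slot */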
+
+#if defined( __SM_35__)
+    // xcm
+    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    l_xcm = r_scratch;
+    
+    center_of_mass_blocks_xcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+            (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( xcm, l_xcm + BLOCKS_POW_2,
+            sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
+    
+    // vcm
+    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    l_vcm = r_scratch;
+    
+    center_of_mass_blocks_vcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+        (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec),
+        cudaMemcpyDeviceToHost, "momentum:vcm" );
+    
+    // amcm
+    cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+    l_amcm = r_scratch;
+    
+    center_of_mass_blocks_amcm <<< BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    
+    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
+        (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec),
+        cudaMemcpyDeviceToHost, "momemtum:amcm" );
+
+#else
+    cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
+    
+    l_xcm = r_scratch;
+    l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
+    l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
+    
+    center_of_mass_blocks <<< BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
+        ( system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n );
+    cudaThreadSynchronize( ); 
+    cudaCheckError( ); 
+    
+    center_of_mass <<< 1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
+        ( l_xcm, l_vcm, l_amcm, l_xcm + BLOCKS_POW_2, l_vcm + BLOCKS_POW_2,
+          l_amcm + BLOCKS_POW_2, BLOCKS_POW_2 );
+    cudaThreadSynchronize( ); 
+    cudaCheckError( );
+    
+    copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
+    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
+    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
+#endif
+}
+
+
+extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm )
+{
+#if defined(__SM_35__)
+    real *partial_results = (real *) scratch;
+    cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" );
+
+    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2,
+        sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" );
+
+#else
+    real *partial_results = (real *) scratch;
+    //real *local_results;
+
+    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
+    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
+
+    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
+            sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
+#endif
+}
+
+
+extern "C" void dev_sync_simulation_data( simulation_data *data )
+{
+    Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data );
+}
+
+
+void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
+        MPI_Comm comm )
+{
+    int i;
+    rvec p;
+    real m;
+
+    data->my_en.e_kin = 0.0;
+
+    dev_compute_kinetic_energy( system, data, &data->my_en.e_kin );
+
+    MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
+            1, MPI_DOUBLE, MPI_SUM, comm );
+
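+    /* equipartition: T = 2 * E_kin / (N_f * k_B), with N_f the number of degrees of freedom */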
+    data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
+
+    // avoid T being absolute zero, which might cause a floating-point exception
+    if ( FABS(data->therm.T) < ALMOST_ZERO )
+    {
+        data->therm.T = ALMOST_ZERO;
+    }
+}
+
+
+void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
+        MPI_Comm comm  )
+{
+    int  i;
+    real tmp;
+
+    //compute local total mass of the system
+    dev_compute_total_mass( system, &tmp );
+
+    MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
+
+    data->inv_M = 1. / data->M;
+}
+
+
+void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
+        mpi_datatypes *mpi_data, MPI_Comm comm )
+{
+    int i;
+    real m, det; //xx, xy, xz, yy, yz, zz;
+    real tmp_mat[6], tot_mat[6];
+    rvec my_xcm, my_vcm, my_amcm;
+    rvec tvec, diff;
+    rtensor mat, inv;
+
+    rvec_MakeZero( my_xcm );  // position of CoM
+    rvec_MakeZero( my_vcm );  // velocity of CoM
+    rvec_MakeZero( my_amcm ); // angular momentum of CoM
+
+    /* compute the local contributions to the position, velocity, and angular momentum of the center of mass */
+    dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm );
+
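+    /* sum the local contributions across all MPI processes */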
+    MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+    MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+    MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm );
+
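+    /* scale by the inverse total mass and remove the center-of-mass contribution from the angular momentum */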
+    rvec_Scale( data->xcm, data->inv_M, data->xcm );
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec );
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+
+    /* Calculate and then invert the inertial tensor */
+    for ( i = 0; i < 6; ++i )
+    {
+        tmp_mat[i] = 0;
+    }
+
+    dev_compute_inertial_tensor( system, tmp_mat, my_xcm );
+
+    MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
+
+    if ( system->my_rank == MASTER_NODE )
+    {
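+        /* tot_mat holds the six unique components in the order xx, xy, xz, yy, yz, zz */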
+        mat[0][0] = tot_mat[3] + tot_mat[5];  // yy + zz;
+        mat[0][1] = mat[1][0] = -tot_mat[1];  // -xy;
+        mat[0][2] = mat[2][0] = -tot_mat[2];  // -xz;
+        mat[1][1] = tot_mat[0] + tot_mat[5];  // xx + zz;
+        mat[2][1] = mat[1][2] = -tot_mat[4];  // -yz;
+        mat[2][2] = tot_mat[0] + tot_mat[3];  // xx + yy;
+
+        /* invert the inertial tensor */
+        det = ( mat[0][0] * mat[1][1] * mat[2][2] +
+                mat[0][1] * mat[1][2] * mat[2][0] +
+                mat[0][2] * mat[1][0] * mat[2][1] ) -
+              ( mat[0][0] * mat[1][2] * mat[2][1] +
+                mat[0][1] * mat[1][0] * mat[2][2] +
+                mat[0][2] * mat[1][1] * mat[2][0] );
+
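+        /* adjugate (classical adjoint); scaled by 1/det below to obtain the inverse */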
+        inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+        inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+        inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+        inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+        inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+        inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+        inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+        inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+        inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+        if ( det > ALMOST_ZERO )
+        {
+            rtensor_Scale( inv, 1. / det, inv );
+        }
+        else
+        {
+            rtensor_MakeZero( inv );
+        }
+
+        /* Compute the angular velocity about the center of mass */
+        rtensor_MatVec( data->avcm, inv, data->amcm );
+    }
+
+    MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm );
+
+    /* Compute the rotational energy */
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+
+#if defined(DEBUG)
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
+             data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
+             data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
+             data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( stderr, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2],
+       mat[1][0], mat[1][1], mat[1][2],
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( stderr, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2],
+       inv[1][0], inv[1][1], inv[1][2],
+       inv[2][0], inv[2][1], inv[2][2] ); */
+    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
+             data->avcm[0], data->avcm[1], data->avcm[2] );
+#endif
+}
+
+
diff --git a/PG-PuReMD/src/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h
similarity index 65%
rename from PG-PuReMD/src/cuda_system_props.h
rename to PG-PuReMD/src/cuda/cuda_system_props.h
index ce6fccc1..66f620b3 100644
--- a/PG-PuReMD/src/cuda_system_props.h
+++ b/PG-PuReMD/src/cuda/cuda_system_props.h
@@ -2,24 +2,35 @@
 #ifndef __CUDA_SYSTEM_PROPS_H__
 #define __CUDA_SYSTEM_PROPS_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 void dev_compute_total_mass( reax_system *, real * );
+
 void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * );
+
 void dev_compute_momentum( reax_system *, rvec, rvec, rvec );
+
 void dev_compute_inertial_tensor( reax_system *, real *, rvec my_xcm );
 
 void dev_sync_simulation_data( simulation_data * );
+
 //void dev_compute_kinetic_energy( reax_system *, simulation_data *, real * );
 
+void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
+
+void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
+
+void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
+        mpi_datatypes*, MPI_Comm );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_torsion_angles.cu
rename to PG-PuReMD/src/cuda/cuda_torsion_angles.cu
index e70c378b..47c087d2 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.cu
@@ -19,13 +19,14 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "reax_types.h"
-#include "index_utils.h"
 #include "cuda_torsion_angles.h"
-#include "vector.h"
+
 #include "cuda_list.h"
 #include "cuda_helpers.h"
 
+#include "../index_utils.h"
+#include "../vector.h"
+
 #define MIN_SINE 1e-10
 
 
diff --git a/PG-PuReMD/src/cuda_torsion_angles.h b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
similarity index 57%
rename from PG-PuReMD/src/cuda_torsion_angles.h
rename to PG-PuReMD/src/cuda/cuda_torsion_angles.h
index 235e91b0..a7d9c3cb 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_torsion_angles.h
@@ -19,24 +19,18 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#ifndef __TORSION_ANGLES_H_
-#define __TORSION_ANGLES_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *,
-                                      global_parameters ,
-                                      four_body_header *,
-                                      control_params *,
-                                      reax_list , reax_list ,
-                                      storage ,
-                                      int , int ,
-                                      real *, real *,
-                                      rvec *);
-
-CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *,
-        storage ,
-        reax_list , int );
+#ifndef __CUDA_TORSION_ANGLES_H_
+#define __CUDA_TORSION_ANGLES_H_
+
+#include "../reax_types.h"
+
+
+CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *, global_parameters,
+        four_body_header *, control_params *, reax_list, reax_list,
+        storage, int, int, real *, real *, rvec * );
+
+CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess( reax_atom *,
+        storage, reax_list, int );
+
 
 #endif
diff --git a/PG-PuReMD/src/cuda_utils.cu b/PG-PuReMD/src/cuda/cuda_utils.cu
similarity index 88%
rename from PG-PuReMD/src/cuda_utils.cu
rename to PG-PuReMD/src/cuda/cuda_utils.cu
index 5899a1ec..7e1757bc 100644
--- a/PG-PuReMD/src/cuda_utils.cu
+++ b/PG-PuReMD/src/cuda/cuda_utils.cu
@@ -149,3 +149,20 @@ extern "C" void print_device_mem_usage( )
             total, (long long int)total/(1024.0*1024.0),
             free, (long long int)free/(1024.0*1024.0) );
 }
+
+
+extern "C" void init_blocks( reax_system *system )
+{
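+    /* derive kernel launch configurations from the local (n) and total (N) atom counts */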
+    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
+    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
+
+    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
+    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
+
+    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
+
+#if defined(__CUDA_DEBUG_LOG__)
+    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
+            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
+#endif
+}
diff --git a/PG-PuReMD/src/cuda_utils.h b/PG-PuReMD/src/cuda/cuda_utils.h
similarity index 80%
rename from PG-PuReMD/src/cuda_utils.h
rename to PG-PuReMD/src/cuda/cuda_utils.h
index 3d63d5e3..bfc4256d 100644
--- a/PG-PuReMD/src/cuda_utils.h
+++ b/PG-PuReMD/src/cuda/cuda_utils.h
@@ -1,7 +1,7 @@
 #ifndef __CUDA_UTILS_H_
 #define __CUDA_UTILS_H_
 
-#include "reax_types.h"
+#include "../reax_types.h"
 
 
 #ifdef __cplusplus
@@ -9,22 +9,33 @@ extern "C"  {
 #endif
 
 void cuda_malloc( void **, size_t, int, const char * );
+
 void cuda_free( void *, const char * );
+
 void cuda_memset( void *, int , size_t , const char * );
+
 void copy_host_device( void *, void *, size_t, enum cudaMemcpyKind, const char * );
+
 void copy_device( void *, void *, size_t, const char * );
 
 void compute_blocks( int *, int *, int );
+
 void compute_matvec_blocks( int *, int );
+
 void compute_nearest_pow_2( int, int * );
 
+void init_blocks( reax_system * );
+
 void print_device_mem_usage( );
 
+
 #ifdef __cplusplus
 #define cudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
 static inline void __cudaCheckError( const char *file, const int line )
 {
-    cudaError err = cudaGetLastError();
+    cudaError err;
+
+    err = cudaGetLastError();
     if ( cudaSuccess != err )
     {
         fprintf( stderr, "[ERROR] runtime error encountered: %s:%d\n", file, line );
@@ -32,19 +43,22 @@ static inline void __cudaCheckError( const char *file, const int line )
         exit( RUNTIME_ERROR );
     }
 
+#if defined(DEBUG)
     /* More careful checking. However, this will affect performance. */
-//    err = cudaDeviceSynchronize();
-//    if( cudaSuccess != err )
-//    {
-//       exit( -1 );
-//    }
+    err = cudaDeviceSynchronize( );
+    if( cudaSuccess != err )
+    {
+       exit( RUNTIME_ERROR );
+    }
+#endif
 
     return;
 }
 #endif
 
-#endif
-
 #ifdef __cplusplus
 }
 #endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_valence_angles.cu
rename to PG-PuReMD/src/cuda/cuda_valence_angles.cu
index d778c3b2..21b8d2c8 100644
--- a/PG-PuReMD/src/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.cu
@@ -21,9 +21,10 @@
 
 #include "cuda_valence_angles.h"
 
-#include "index_utils.h"
 #include "cuda_list.h"
-#include "vector.h"
+
+#include "../index_utils.h"
+#include "../vector.h"
 
 
 /* Compute 3-body interactions, in which the main role is played by
diff --git a/PG-PuReMD/src/cuda_valence_angles.h b/PG-PuReMD/src/cuda/cuda_valence_angles.h
similarity index 98%
rename from PG-PuReMD/src/cuda_valence_angles.h
rename to PG-PuReMD/src/cuda/cuda_valence_angles.h
index 65109597..d8abac25 100644
--- a/PG-PuReMD/src/cuda_valence_angles.h
+++ b/PG-PuReMD/src/cuda/cuda_valence_angles.h
@@ -22,8 +22,10 @@
 #ifndef __CUDA_VALENCE_ANGLES_H_
 #define __CUDA_VALENCE_ANGLES_H_
 
-#include "reax_types.h"
-#include "vector.h"
+#include "../reax_types.h"
+
+#include "../vector.h"
+
 
 CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *, global_parameters,
         single_body_parameters *, three_body_header *, control_params *,
diff --git a/PG-PuReMD/src/cuda_validation.cu b/PG-PuReMD/src/cuda/cuda_validation.cu
similarity index 99%
rename from PG-PuReMD/src/cuda_validation.cu
rename to PG-PuReMD/src/cuda/cuda_validation.cu
index 34a42430..34ebf6e5 100644
--- a/PG-PuReMD/src/cuda_validation.cu
+++ b/PG-PuReMD/src/cuda/cuda_validation.cu
@@ -1,13 +1,12 @@
-#include "reax_types.h"
 
 #include "cuda_validation.h"
 
 #include "cuda_utils.h"
 
-#include "index_utils.h"
-#include "list.h"
-#include "tool_box.h"
-#include "vector.h"
+#include "../index_utils.h"
+#include "../list.h"
+#include "../tool_box.h"
+#include "../vector.h"
 
 
 bool check_zero( real p1, real p2 )
diff --git a/PG-PuReMD/src/cuda_validation.h b/PG-PuReMD/src/cuda/cuda_validation.h
similarity index 97%
rename from PG-PuReMD/src/cuda_validation.h
rename to PG-PuReMD/src/cuda/cuda_validation.h
index 42eb37a4..7faa773b 100644
--- a/PG-PuReMD/src/cuda_validation.h
+++ b/PG-PuReMD/src/cuda/cuda_validation.h
@@ -3,50 +3,60 @@
 #ifndef __CUDA_VALIDATION_H__
 #define __CUDA_VALIDATION_H__
 
-#include "reax_types.h"
+#include "../reax_types.h"
+
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-
 int validate_neighbors( reax_system *, reax_list **lists );
+
 int validate_sym_dbond_indices( reax_system *system,
         storage *workspace, reax_list **lists );
 
 int validate_bonds( reax_system *, storage *, reax_list ** );
+
 int validate_hbonds( reax_system *, storage *, reax_list ** );
+
 int validate_sparse_matrix( reax_system *, storage * );
 
 int validate_grid( reax_system * );
+
 int validate_workspace( reax_system *, storage * );
 
 int validate_data( reax_system *, simulation_data * );
+
 int validate_three_bodies( reax_system *, storage *,
         reax_list ** );
+
 int validate_atoms( reax_system *, reax_list ** );
 
 int print_sparse_matrix( sparse_matrix *H );
+
 int print_sparse_matrix_host( sparse_matrix *H );
 
 int print_host_rvec2( rvec2 *, int );
+
 int print_device_rvec2( rvec2 *, int );
 
 int print_host_array( real *, int );
+
 int print_device_array( real *, int );
 
 void compare_rvec2( rvec2 *host, rvec2 *device, int N,
         const char *msg );
+
 void compare_array( real *host, real *device, int N,
         const char *msg );
 
 int check_zeros_host( rvec2 *host, int n, const char * );
-int check_zeros_device( rvec2 *device, int n, const char * );
-
 
+int check_zeros_device( rvec2 *device, int n, const char * );
 
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.h b/PG-PuReMD/src/cuda_hydrogen_bonds.h
deleted file mode 100644
index 7e1644f1..00000000
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#ifndef __HBONDS_H_
-#define __HBONDS_H_
-
-#include "reax_types.h"
-#include "reax_types.h"
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *,
-        storage ,
-        reax_list );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL (  reax_atom *,
-        storage ,
-        reax_list, int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess (  reax_atom *,
-        storage ,
-        reax_list , int );
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *,
-                                      single_body_parameters *,
-                                      hbond_parameters *,
-                                      global_parameters ,
-                                      control_params *,
-                                      storage ,
-                                      reax_list ,
-                                      reax_list ,
-                                      int ,
-                                      int ,
-                                      real *,
-                                      rvec *);
-
-CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT( reax_atom *,
-        single_body_parameters *,
-        hbond_parameters *,
-        global_parameters ,
-        control_params *,
-        storage ,
-        reax_list ,
-        reax_list ,
-        int ,
-        int ,
-        real *,
-        rvec *);
-
-#endif
diff --git a/PG-PuReMD/src/cuda_init_md.cu b/PG-PuReMD/src/cuda_init_md.cu
deleted file mode 100644
index 044e8e73..00000000
--- a/PG-PuReMD/src/cuda_init_md.cu
+++ /dev/null
@@ -1,14 +0,0 @@
-
-#include "cuda_init_md.h"
-
-#include "reax_types.h"
-#include "cuda_utils.h"
-
-#include "tool_box.h"
-
-void Cuda_Init_ScratchArea( )
-{
-    cuda_malloc( (void **)&scratch, DEVICE_SCRATCH_SIZE, TRUE, "device:scratch" );
-
-    host_scratch = (void *) smalloc( HOST_SCRATCH_SIZE, "host:scratch" );
-}
diff --git a/PG-PuReMD/src/cuda_init_md.h b/PG-PuReMD/src/cuda_init_md.h
deleted file mode 100644
index cf7b5249..00000000
--- a/PG-PuReMD/src/cuda_init_md.h
+++ /dev/null
@@ -1,15 +0,0 @@
-
-#ifndef __CUDA_INIT_MD_H__
-#define __CUDA_INIT_MD_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void Cuda_Init_ScratchArea( );
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/PG-PuReMD/src/cuda_integrate.cu b/PG-PuReMD/src/cuda_integrate.cu
deleted file mode 100644
index 936c6816..00000000
--- a/PG-PuReMD/src/cuda_integrate.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-
-#include "cuda_integrate.h"
-#include "reax_types.h"
-
-#include "vector.h"
-#include "cuda_utils.h"
-
-
-CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms, 
-        single_body_parameters *sbp, real dt, int n )
-{
-    real inv_m;
-    rvec dx;
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    /* velocity verlet, 1st part */
-    atom = &(my_atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute x(t + dt) */
-    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-    rvec_Add( atom->x, dx );
-    /* Compute v(t + dt/2) */
-    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-}
-
-
-void bNVT_update_velocity_part1( reax_system *system, real dt )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms, 
-        single_body_parameters *sbp, real dt, int n )
-{
-    reax_atom *atom;
-    real inv_m;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    /* velocity verlet, 2nd part */
-    atom = &(my_atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute v(t + dt) */
-    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-}
-
-
-void bNVT_update_velocity_part2( reax_system *system, real dt )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n )
-{
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= n )
-    {
-        return;
-    }
-
-    /* Scale velocities and positions at t+dt */
-    atom = &(my_atoms[i]);
-    rvec_Scale( atom->v, lambda, atom->v );
-}
-
-
-void bNVT_scale_velocities( reax_system *system, real lambda )
-{
-    int blocks;
-
-    blocks = system->n / DEF_BLOCK_SIZE + 
-        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, lambda, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
diff --git a/PG-PuReMD/src/cuda_lin_alg.cu b/PG-PuReMD/src/cuda_lin_alg.cu
deleted file mode 100644
index 4f37d577..00000000
--- a/PG-PuReMD/src/cuda_lin_alg.cu
+++ /dev/null
@@ -1,624 +0,0 @@
-/*----------------------------------------------------------------------
-  PuReMD - Purdue ReaxFF Molecular Dynamics Program
-
-  Copyright (2010) Purdue University
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Joseph Fogarty, jcfogart@mail.usf.edu
-  Sagar Pandit, pandit@usf.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-
-#include "cuda_lin_alg.h"
-
-#include "reax_types.h"
-
-#include "cuda_shuffle.h"
-#include "cuda_utils.h"
-#include "cuda_reduction.h"
-
-
-//one thread per row
-CUDA_GLOBAL void k_matvec( sparse_matrix H, real *vec, real *results,
-        int rows )
-{
-    int i, col;
-    real results_row;
-    real val;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= rows )
-    {
-        return;
-    }
-
-    results_row = 0;
-
-    for (int c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-
-        results_row += val * vec[col];
-    }
-
-    results[i] = results_row;
-}
-
-
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-//CUDA_GLOBAL void __launch_bounds__(384, 16) k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
-CUDA_GLOBAL void k_matvec_csr( sparse_matrix H, real *vec, real *results,
-        int num_rows )
-{
-#if defined(__SM_35__)
-    real vals;
-    int x;
-#else
-    extern __shared__ real vals[];
-#endif
-    int jj;
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    int row = warp_id;
-    
-#if defined(__SM_35__)
-    vals = 0;
-#else
-    vals[threadIdx.x] = 0;
-#endif
-
-    if (row < num_rows)
-    {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        // compute running sum per thread
-        for ( jj = row_start + lane; jj < row_end;
-                jj += MATVEC_KER_THREADS_PER_ROW )
-#if defined(__SM_35__)
-        {
-            vals += H.entries[jj].val * vec[ H.entries[jj].j ];
-        }
-    }
-#else
-        {
-            vals[threadIdx.x] += H.entries[jj].val * vec[ H.entries[jj].j ];
-        }
-    }
-
-    __syncthreads( );
-#endif
-
-    // parallel reduction in shared memory
-    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-#if defined(__SM_35__)
-    for (x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
-    {
-        vals += shfl( vals, x );
-    }
-
-    if (lane == 0 && row < num_rows)
-    {
-        results[row] = vals;
-    }
-#else
-    if (lane < 16)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 16];
-    }
-    __syncthreads( );
-    if (lane < 8)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 8];
-    }
-    __syncthreads( );
-    if (lane < 4)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 4];
-    }
-    __syncthreads( );
-    if (lane < 2)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 2];
-    }
-    __syncthreads( );
-    if (lane < 1)
-    {
-        vals[threadIdx.x] += vals[threadIdx.x + 1];
-    }
-    __syncthreads( );
-
-    // first thread writes the result
-    if (lane == 0 && row < num_rows)
-    {
-        results[row] = vals[threadIdx.x];
-    }
-#endif
-}
-
-
-//one thread per row
-CUDA_GLOBAL void k_dual_matvec( sparse_matrix H, rvec2 *vec, rvec2 *results,
-        int rows )
-{
-    int i, c, col;
-    rvec2 results_row;
-    real val;
-
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( i >= rows)
-    {
-        return;
-    }
-
-    results_row[0] = 0.0;
-    results_row[1] = 0.0;
-
-    for (c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-
-        results_row[0] += val * vec [col][0];
-        results_row[1] += val * vec [col][1];
-    }
-
-    results[i][0] = results_row[0];
-    results[i][1] = results_row[1];
-}
-
-
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-//CUDA_GLOBAL void __launch_bounds__(384, 8) k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results, int num_rows)
-CUDA_GLOBAL void  k_dual_matvec_csr( sparse_matrix H, rvec2 *vec,
-        rvec2 *results, int num_rows )
-{
-#if defined(__SM_35__)
-    rvec2 rvals;
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    int row = warp_id;
-
-    rvals[0] = 0;
-    rvals[1] = 0;
-
-    if (row < num_rows)
-    {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
-        {
-            rvals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-            rvals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-        }
-    }
-
-    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2)
-    {
-        rvals[0] += shfl( rvals[0], s);
-        rvals[1] += shfl( rvals[1], s);
-    }
-
-    if (lane == 0 && row < num_rows)
-    {
-        results[row][0] = rvals[0];
-        results[row][1] = rvals[1];
-    }
-
-#else
-    extern __shared__ rvec2 rvals[];
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / 32;
-    int lane = thread_id & (32 - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    //int row = warp_id;
-    int row = warp_id;
-
-    rvals[threadIdx.x][0] = 0;
-    rvals[threadIdx.x][1] = 0;
-
-    if (row < num_rows)
-    {
-        row_start = H.start[row];
-        row_end = H.end[row];
-
-        // compute running sum per thread
-        for(int jj = row_start + lane; jj < row_end; jj += 32)
-        {
-            rvals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-            rvals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-        }
-    }
-
-    __syncthreads( );
-
-    // parallel reduction in shared memory
-    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-    if (lane < 16)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 16][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 16][1]; 
-    }
-    __syncthreads( );
-    if (lane < 8)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 8][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 8][1]; 
-    }
-    __syncthreads( );
-    if (lane < 4)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 4][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 4][1]; 
-    }
-    __syncthreads( );
-    if (lane < 2)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 2][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 2][1]; 
-    }
-    __syncthreads( );
-    if (lane < 1)
-    {
-        rvals[threadIdx.x][0] += rvals[threadIdx.x + 1][0]; 
-        rvals[threadIdx.x][1] += rvals[threadIdx.x + 1][1]; 
-    }
-    __syncthreads( );
-
-    // first thread writes the result
-    if (lane == 0 && row < num_rows)
-    {
-        results[row][0] = rvals[threadIdx.x][0];
-        results[row][1] = rvals[threadIdx.x][1];
-    }
-
-#endif
-}
-
-
-void Cuda_Vector_Sum( real *res, real a, real *x, real b, real *y, int count )
-{
-    //res = ax + by
-    //use the cublas here
-    int blocks;
-
-    blocks = (count / DEF_BLOCK_SIZE) + 
-        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
-        ( res, a, x, b, y, count );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_CG_Preconditioner( real *res, real *a, real *b, int count )
-{
-    //res = a*b - vector multiplication
-    //use the cublas here.
-    int blocks;
-
-    blocks = (count / DEF_BLOCK_SIZE) + 
-        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
-        ( res, a, b, count );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_diagonal_preconditioner(storage p_workspace, rvec2 *b, int n)
-{
-    storage *workspace;
-    int j;
-   
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( j >= n )
-    {
-        return;
-    }
-
-    workspace = &( p_workspace );
-
-    //for( j = 0; j < system->n; ++j ) {
-    // residual 
-    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
-    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
-
-    // apply diagonal pre-conditioner
-    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
-    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
-    //}
-}
-
-
-void Cuda_CG_Diagonal_Preconditioner( storage *workspace, rvec2 *b, int n )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_diagonal_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, b, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_dual_cg_preconditioner( storage p_workspace, rvec2 *x, 
-        real alpha_0, real alpha_1, int n, rvec2 *my_dot )
-{
-    storage *workspace;
-    rvec2 alpha;
-    int j;
-   
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if ( j >= n )
-    {
-        return;
-    }
-
-    workspace = &( p_workspace );
-    alpha[0] = alpha_0;
-    alpha[1] = alpha_1;
-    my_dot[j][0] = my_dot[j][1] = 0.0;
-
-    //for( j = 0; j < system->n; ++j ) {
-    // update x 
-    x[j][0] += alpha[0] * workspace->d2[j][0];
-    x[j][1] += alpha[1] * workspace->d2[j][1];      
-
-    // update residual 
-    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
-    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
-
-    // apply diagonal pre-conditioner 
-    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-
-    // dot product: r.p 
-    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
-    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
-    //}
-}
-
-
-void Cuda_DualCG_Preconditioner( storage *workspace, rvec2 *x, rvec2 alpha,
-        int n, rvec2 result )
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    cuda_memset( tmp, 0, sizeof(rvec2) * ( 2 * n + 1),
-            "cuda_dualcg_preconditioner" );
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-        (*workspace, x, alpha[0], alpha[1], n, tmp);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    //Reduction to calculate my_dot
-    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        ( tmp, tmp + n, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
-        ( tmp + n, tmp + 2*n, blocks);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( result, (tmp + 2*n), sizeof(rvec2),
-            cudaMemcpyDeviceToHost, "my_dot" );
-}
-
-
-void Cuda_Norm( rvec2 *arr, int n, rvec2 result )
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        (arr, tmp, n, INITIAL);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>>
-        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_norm_rvec2" );
-}
-
-
-void Cuda_Dot( rvec2 *a, rvec2 *b, rvec2 result, int n )
-{
-    int blocks;
-    rvec2 *tmp = (rvec2 *) scratch;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * DEF_BLOCK_SIZE >>>
-        ( a, b, tmp, n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof(rvec2) * BLOCKS_POW_2 >>> 
-    //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof(rvec2) * BLOCKS_POW_2 >>> 
-        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( result, tmp + BLOCKS_POW_2, sizeof(rvec2), 
-            cudaMemcpyDeviceToHost, "cuda_dot" );
-}
-
-
-void Cuda_Vector_Sum_Rvec2(rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
-        ( x, a, b[0], b[1], c, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_rvec2_to_real_copy( real *dst, rvec2 *src, int index, int n )
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    dst[i] = src[i][index];
-}
-
-
-void Cuda_RvecCopy_From( real *dst, rvec2 *src, int index, int n )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( dst, src, index, n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-CUDA_GLOBAL void k_real_to_rvec2_copy( rvec2 *dst, real *src, int index, int n)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (i >= n)
-    {
-        return;
-    }
-
-    dst[i][index] = src[i];
-}
-
-
-void Cuda_RvecCopy_To(rvec2 *dst, real *src, int index, int n)
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
-        ( dst, src, index, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Dual_Matvec( sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
-    cuda_memset( b, 0, sizeof(rvec2) * size, "dual_matvec:result" );
-
-    //One thread per row implementation
-    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
-
-    //One warp per row implementation
-#if defined(__SM_35__)
-    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
-#else
-    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                      sizeof(rvec2) * MATVEC_BLOCK_SIZE >>>
-#endif
-            ( *H, a, b, n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
-
-
-void Cuda_Matvec( sparse_matrix *H, real *a, real *b, int n, int size )
-{
-    int blocks;
-
-    blocks = (n / DEF_BLOCK_SIZE) + 
-        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-
-    cuda_memset( b, 0, sizeof(real) * size, "dual_matvec:result" );
-
-    //one thread per row implementation
-    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-    //        (*H, a, b, n);
-    //cudaThreadSynchronize ();
-    //cudaCheckError ();
-
-#if defined(__SM_35__)
-    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
-#else
-    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE,
-                 sizeof(real) * MATVEC_BLOCK_SIZE>>>
-#endif
-                     (*H, a, b, n);
-
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-}
diff --git a/PG-PuReMD/src/cuda_system_props.cu b/PG-PuReMD/src/cuda_system_props.cu
deleted file mode 100644
index 3202f64a..00000000
--- a/PG-PuReMD/src/cuda_system_props.cu
+++ /dev/null
@@ -1,406 +0,0 @@
-
-#include "cuda_system_props.h"
-
-#include "cuda_utils.h"
-#include "cuda_reduction.h"
-#include "center_mass.h"
-#include "cuda_copy.h"
-#include "cuda_shuffle.h"
-
-#include "vector.h"
-
-
-CUDA_GLOBAL void k_compute_total_mass( single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n )
-{
-#if defined(__SM_35__)
-    extern __shared__ real my_sbp[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real sdata = 0;
-
-    if (i < n)
-    {
-        sdata = sbp[ my_atoms[i].type ].mass;
-    }
-    __syncthreads( );
-
-    for(int z = 16; z >=1; z/=2)
-    {
-        sdata += shfl( sdata, z);
-    }
-
-    if (threadIdx.x % 32 == 0)
-    {
-        my_sbp[threadIdx.x >> 5] = sdata;
-    }
-
-    __syncthreads( );
-
-    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
-    {
-        if(threadIdx.x < offset)
-        {
-            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if(threadIdx.x == 0)
-    {
-        block_results[blockIdx.x] = my_sbp[0];
-    }
-
-#else
-    extern __shared__ real sdata[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0;
-
-    if (i < n)
-    {
-        x = sbp[ my_atoms[i].type ].mass;
-    }
-
-    sdata[ threadIdx.x ] = x;
-    __syncthreads( );
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if (threadIdx.x == 0)
-    {
-        block_results[ blockIdx.x] = sdata [0];
-    }
-
-#endif
-}
-
-
-extern "C" void dev_compute_total_mass( reax_system *system, real *local_val )
-{
-    real *block_mass = (real *) scratch;
-    cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" );
-
-    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
-        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), 
-            cudaMemcpyDeviceToHost, "total_mass:tmp");
-}
-
-
-CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_atom *my_atoms, 
-        real *block_results, int n )
-{
-#if defined(__SM_35__)
-    extern __shared__ real my_sbpdot[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real sdata = 0;
-    rvec p;
-
-    if (i < n)
-    {
-        sdata = sbp[ my_atoms[i].type ].mass;
-        rvec_Scale( p, sdata, my_atoms[ i ].v );
-        sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-    }
-
-    __syncthreads( );
-
-    for(int z = 16; z >=1; z/=2)
-    {
-        sdata += shfl( sdata, z);
-    }
-
-    if (threadIdx.x % 32 == 0)
-    {
-        my_sbpdot[threadIdx.x >> 5] = sdata;
-    }
-
-    __syncthreads( );
-
-    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if (threadIdx.x == 0)
-    {
-        block_results[blockIdx.x] = my_sbpdot[0];
-    }
-
-#else
-    extern __shared__ real sdata [];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real    m = 0;
-    rvec p;
-
-    if (i < n)
-    {
-        m = sbp[ my_atoms[i].type ].mass;
-        rvec_Scale( p, m, my_atoms[ i ].v );
-        m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-    }
-
-    sdata[ threadIdx.x ] = m;
-    __syncthreads( );
-
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }
-
-        __syncthreads( );
-    }
-
-    if (threadIdx.x == 0)
-    {
-        block_results[blockIdx.x] = sdata[0];
-    }
-#endif
-}
-
-extern "C" void dev_compute_kinetic_energy( reax_system *system,
-        simulation_data *data, real *local_val )
-{
-    real *block_energy = (real *) scratch;
-    cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" );
-
-    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
-        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( local_val, block_energy + BLOCKS_POW_2,
-            //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-            sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
-            //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
-            //        sizeof (real), "kinetic_energy");
-}
-
-
-extern "C" void dev_compute_momentum( reax_system *system, rvec xcm, 
-        rvec vcm, rvec amcm )
-{
-    rvec *l_xcm, *l_vcm, *l_amcm;
-    rvec *r_scratch = (rvec *)scratch;
-
-#if defined( __SM_35__)
-    // xcm
-    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
-    l_xcm = r_scratch;
-    
-    center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
-    (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    
-    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
-            (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    copy_host_device( xcm, l_xcm + BLOCKS_POW_2,
-            sizeof(rvec), cudaMemcpyDeviceToHost, "momentum:xcm" );
-    
-    // vcm
-    cuda_memset( scratch, 0, sizeof(rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
-    l_vcm = r_scratch;
-    
-    center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    
-    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
-        (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof(rvec),
-        cudaMemcpyDeviceToHost, "momentum:vcm" );
-    
-    // amcm
-    cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-    l_amcm = r_scratch;
-    
-    center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof(rvec) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    
-    k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof(rvec) * BLOCKS_POW_2) >>>
-        (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof(rvec),
-        cudaMemcpyDeviceToHost, "momemtum:amcm" );
-
-#else
-    cuda_memset( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp" );
-    
-    l_xcm = r_scratch;
-    l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
-    l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
-    
-    center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
-        (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n);
-    cudaThreadSynchronize( ); 
-    cudaCheckError( ); 
-    
-    center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
-        (l_xcm, l_vcm, l_amcm,
-         l_xcm + BLOCKS_POW_2, 
-         l_vcm + BLOCKS_POW_2, 
-         l_amcm + BLOCKS_POW_2, 
-         BLOCKS_POW_2);
-    cudaThreadSynchronize( ); 
-    cudaCheckError( );
-    
-    copy_host_device( xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" );
-    copy_host_device( vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
-    copy_host_device( amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
-#endif
-}
-
-
-extern "C" void dev_compute_inertial_tensor( reax_system *system, real *local_results, rvec my_xcm )
-{
-#if defined(__SM_35__)
-    real *partial_results = (real *) scratch;
-    cuda_memset( partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp" );
-
-    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2,
-        sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results" );
-
-#else
-    real *partial_results = (real *) scratch;
-    //real *local_results;
-
-    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
-    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
-
-    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize( );
-    cudaCheckError( );
-
-    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
-            sizeof(real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
-#endif
-}
-
-
-extern "C" void dev_sync_simulation_data( simulation_data *data )
-{
-    Output_Sync_Simulation_Data( data, (simulation_data *)data->d_simulation_data );
-}
-
-
-/*
-CUDA_GLOBAL void ker_kinetic_energy (reax_atom *my_atoms, 
-   single_body_parameters *sbp, int n, real *block_results)
-{
-   extern __shared__ real sken[];
-   rvec p;
-   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-   real x = 0;
-
-   if(i < n)
-   {
-   m = sbp[my_atoms[i].type].mass;
-   rvec_Scale( p, m, my_atoms[i].v );
-   x = 0.5 * rvec_Dot( p, my_atoms[i].v );
-   }
-   sken[threadIdx.x] = x;
-   __syncthreads();
-
-   for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-   {
-   if(threadIdx.x < offset)
-   {   
-   sken[threadIdx.x] += sken[threadIdx.x + offset];
-   }   
-
-   __syncthreads();
-   }
-
-   if(threadIdx.x == 0)
-   {
-   per_block_results[blockIdx.x] = sken[0];
-   }
-}
-
-
-void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *p_ekin)
-{
-   real *spad = (real *) scratch;
-   cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "kinetic_energy");
-
-   ker_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-   (spad, spad + system->n,  system->n);
-   cudaThreadSynchronize (); 
-   cudaCheckError (); 
-
-   k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>> 
-   (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, BLOCKS);
-   cudaThreadSynchronize (); 
-   cudaCheckError (); 
-
-   copy_host_device (p_ekin, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-   sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy");
-}
-*/
diff --git a/PG-PuReMD/src/ffield.c b/PG-PuReMD/src/ffield.c
index 443d9051..d985339b 100644
--- a/PG-PuReMD/src/ffield.c
+++ b/PG-PuReMD/src/ffield.c
@@ -20,7 +20,8 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-  #if defined(PURE_REAX)
+
+#if defined(PURE_REAX)
   #include "ffield.h"
   #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
diff --git a/PG-PuReMD/src/ffield.h b/PG-PuReMD/src/ffield.h
index 9aa2a27f..313c3e67 100644
--- a/PG-PuReMD/src/ffield.h
+++ b/PG-PuReMD/src/ffield.h
@@ -24,6 +24,16 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 char Read_Force_Field( char*, reax_interaction*, control_params* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/forces.c b/PG-PuReMD/src/forces.c
index c57527fe..19133fce 100644
--- a/PG-PuReMD/src/forces.c
+++ b/PG-PuReMD/src/forces.c
@@ -21,15 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-#ifdef HAVE_CUDA
-  #include "cuda_forces.h"
-  #include "cuda_lin_alg.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_utils.h"
-  #include "cuda_validation.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "forces.h"
   #include "bond_orders.h"
@@ -63,11 +54,7 @@
   #include "reax_vector.h"
 #endif
 
-
-#ifdef HAVE_CUDA
-void Cuda_Total_Forces( reax_system *, control_params *, simulation_data *, storage * );
-void Cuda_Total_Forces_PURE( reax_system *, storage * );
-#endif
+#include "index_utils.h"
 
 
 interaction_function Interaction_Functions[NUM_INTRS];
@@ -221,41 +208,6 @@ void Compute_Total_Force( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Total_Force( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        reax_list **lists, mpi_datatypes *mpi_data )
-{
-    rvec *f;
-
-    f = (rvec *) host_scratch;
-    memset( f, 0, sizeof(rvec) * system->N );
-
-    Cuda_Total_Forces( system, control, data, workspace );
-
-#if defined(PURE_REAX)
-    /* now all forces are computed to their partially-final values
-     * based on the neighbors information each processor has had.
-     * final values of force on each atom needs to be computed by adding up
-     * all partially-final pieces */
-
-    //MVAPICH2
-    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N ,
-            cudaMemcpyDeviceToHost, "total_force:f:get" );
-
-    Coll( system, mpi_data, f, mpi_data->mpi_rvec,
-          sizeof(rvec) / sizeof(void), rvec_unpacker );
-
-    copy_host_device( f, dev_workspace->f, sizeof(rvec) * system->N,
-            cudaMemcpyHostToDevice, "total_force:f:put" );
-
-    Cuda_Total_Forces_PURE( system, dev_workspace );
-#endif
-
-}
-#endif
-
-
 // Essentially no-cuda copies of cuda kernels, to be used only in the mpi-not-gpu version
 ////////////////////////
 // HBOND ISSUE
@@ -1851,173 +1803,6 @@ int Compute_Forces( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Compute_Forces( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        output_controls *out_control, mpi_datatypes *mpi_data )
-{
-    int charge_flag, retVal;
-
-#if defined(LOG_PERFORMANCE)
-    real t_start = 0;
-
-    //MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-    {
-        t_start = Get_Time( );
-    }
-#endif
-
-    retVal = SUCCESS;
-
-    /********* init forces ************/
-    if ( control->charge_freq && (data->step - data->prev_steps) % control->charge_freq == 0 )
-    {
-        charge_flag = TRUE;
-    }
-    else
-    {
-        charge_flag = FALSE;
-    }
-
-    if ( charge_flag == TRUE )
-    {
-        retVal = Cuda_Init_Forces( system, control, data, workspace, lists, out_control );
-
-//        int i;
-//        static reax_list **temp_lists;
-//       
-//        if ( data->step == 0 )
-//        {
-//            temp_lists = (reax_list **) smalloc( LIST_N * sizeof (reax_list *), "temp_lists" );
-//            for ( i = 0; i < LIST_N; ++i )
-//            {
-//                temp_lists[i] = (reax_list *) smalloc( sizeof(reax_list), "lists[i]" );
-//                temp_lists[i]->allocated = FALSE;
-//            }
-//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
-//                    TYP_BOND, *temp_lists + BONDS );
-//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
-//                    TYP_HBOND, *temp_lists + HBONDS );
-//        }
-//        else
-//        {
-//            Delete_List( *temp_lists + BONDS );
-//            Make_List( (*dev_lists + BONDS)->n, (*dev_lists + BONDS)->num_intrs,
-//                    TYP_BOND, *temp_lists + BONDS );
-//            Delete_List( *temp_lists + HBONDS );
-//            Make_List( (*dev_lists + HBONDS)->n, (*dev_lists + HBONDS)->num_intrs,
-//                    TYP_HBOND, *temp_lists + HBONDS );
-//
-//        }
-//        Output_Sync_Lists( *temp_lists + BONDS, *dev_lists + BONDS, TYP_BOND );
-//        Print_Bonds( system, temp_lists, control );
-//        Output_Sync_Lists( *temp_lists + HBONDS, *dev_lists + HBONDS, TYP_HBOND );
-//        Print_HBonds( system, temp_lists, control, data->step );
-//        Print_HBond_Indices( system, temp_lists, control, data->step );
-//        exit( 0 );
-    }
-    else
-    {
-        retVal = Cuda_Init_Forces_No_Charges( system, control, data, workspace, lists, out_control );
-    }
-
-    if ( retVal == SUCCESS )
-    {
-        //validate_sparse_matrix( system, workspace );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.init_forces) );
-        }
-#endif
-
-        /********* bonded interactions ************/
-        retVal = Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.bonded) );
-        }
-#endif
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: completed bonded\n",
-                 system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-    }
-
-    if ( retVal == SUCCESS )
-    {
-    /**************** charges ************************/
-#if defined(PURE_REAX)
-        if ( charge_flag == TRUE )
-        {
-            Cuda_QEq( system, control, data, workspace, out_control, mpi_data );
-        }
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.qEq) );
-        }
-#endif
-
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "p%d @ step%d: qeq completed\n", system->my_rank, data->step);
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-#endif //PURE_REAX
-
-        /********* nonbonded interactions ************/
-        Cuda_Compute_NonBonded_Forces( system, control, data, workspace,
-                lists, out_control, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.nonb) );
-        }
-#endif
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: nonbonded forces completed\n",
-                system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        /*********** total force ***************/
-        Cuda_Compute_Total_Force( system, control, data, workspace, lists, mpi_data );
-
-#if defined(LOG_PERFORMANCE)
-        //MPI_Barrier( MPI_COMM_WORLD );
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &(data->timing.bonded) );
-        }
-#endif
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: total forces computed\n",
-                system->my_rank, data->step );
-        //Print_Total_Force( system, data, workspace );
-        MPI_Barrier( MPI_COMM_WORLD );
-
-#endif
-
-//        Print_Forces( system );
-    }
-
-    return retVal;
-}
-#endif
-
-
 int validate_device( reax_system *system, simulation_data *data,
         storage *workspace, reax_list **lists )
 {
diff --git a/PG-PuReMD/src/forces.h b/PG-PuReMD/src/forces.h
index 6b4218e8..0579f092 100644
--- a/PG-PuReMD/src/forces.h
+++ b/PG-PuReMD/src/forces.h
@@ -28,6 +28,10 @@
 extern interaction_function Interaction_Functions[NUM_INTRS];
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Init_Force_Functions( control_params* );
 
 int Compute_Forces( reax_system*, control_params*, simulation_data*,
@@ -36,10 +40,11 @@ int Compute_Forces( reax_system*, control_params*, simulation_data*,
 void Estimate_Storages( reax_system*, control_params*, reax_list**,
         int*, int*, int*, int* );
 
-int Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
-        storage*, reax_list**, output_controls*, mpi_datatypes* );
-
 int validate_device( reax_system *, simulation_data *, storage *, reax_list ** );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
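
The forces.h hunk above is the template for the header edits that follow: each public C header now wraps its prototypes in a linkage guard so the declarations keep C linkage when the header is pulled into a C++/CUDA translation unit. A minimal sketch of the pattern -- the EXAMPLE_H guard and example_fn() prototype are placeholders, not code from this patch:

    #ifndef EXAMPLE_H
    #define EXAMPLE_H

    #include "reax_types.h"

    #ifdef __cplusplus
    extern "C" {
    #endif

    /* hypothetical prototype; each real header lists its own functions here */
    void example_fn( reax_system*, control_params* );

    #ifdef __cplusplus
    }
    #endif

    #endif

A plain C compiler never defines __cplusplus and sees only the bare prototypes, while nvcc and g++ do define it and compile the block with C linkage, so the CUDA objects under src/cuda/ can link against the C objects without C++ name mangling.
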
diff --git a/PG-PuReMD/src/geo_tools.c b/PG-PuReMD/src/geo_tools.c
index b97123a9..dff292e7 100644
--- a/PG-PuReMD/src/geo_tools.c
+++ b/PG-PuReMD/src/geo_tools.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "geo_tools.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "tool_box.h"
diff --git a/PG-PuReMD/src/geo_tools.h b/PG-PuReMD/src/geo_tools.h
index 80786856..628e8f74 100644
--- a/PG-PuReMD/src/geo_tools.h
+++ b/PG-PuReMD/src/geo_tools.h
@@ -29,10 +29,6 @@
 // CUSTOM ATOM: serial element name x y z
 #define CUSTOM_ATOM_FORMAT " %d %s %s %lf %lf %lf"
 
-char Read_Geo( char*, reax_system*, control_params*,
-               simulation_data*, storage*, mpi_datatypes* );
-
-
 /*PDB format :
 http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/guide2.2_frame.html
 
@@ -114,10 +110,23 @@ COLUMNS       DATA TYPE       FIELD         DEFINITION
 #define PDB_ATOM_FORMAT_O_LENGTH 81
 #define PDB_CRYST1_FORMAT_O "%6s%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f%11s%4d\n"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+char Read_Geo( char*, reax_system*, control_params*,
+        simulation_data*, storage*, mpi_datatypes* );
+
 char Read_PDB( char*, reax_system*, control_params*,
-               simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
 
 char Write_PDB( reax_system*, reax_list*, simulation_data*,
-                control_params*, mpi_datatypes*, output_controls* );
+        control_params*, mpi_datatypes*, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/grid.c b/PG-PuReMD/src/grid.c
index 3714766c..d893f6c6 100644
--- a/PG-PuReMD/src/grid.c
+++ b/PG-PuReMD/src/grid.c
@@ -19,15 +19,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "grid.h"
+
 #include "allocate.h"
+#include "index_utils.h"
 #include "io_tools.h"
 #include "reset_tools.h"
 #include "tool_box.h"
 #include "vector.h"
 
-#include "index_utils.h"
-
 
 /* determines the exchange boundaries with nbrs in terms of gcells */
 void Mark_GCells( reax_system* system, grid *g, ivec procs, MPI_Comm comm )
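
grid.c also picks up the include ordering applied throughout this patch: reax_types.h first, then the translation unit's own header, then the remaining project headers in alphabetical order. Presumably this keeps the shared typedefs and configuration macros from reax_types.h visible before any other header is parsed. A sketch of the convention for a hypothetical module foo.c:

    /* foo.c -- hypothetical module showing the include order used in this patch */
    #include "reax_types.h"   /* shared types and feature macros first */

    #include "foo.h"          /* the module's own header second */

    #include "allocate.h"     /* remaining project headers, alphabetized */
    #include "index_utils.h"
    #include "io_tools.h"
    #include "tool_box.h"
    #include "vector.h"
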
diff --git a/PG-PuReMD/src/grid.h b/PG-PuReMD/src/grid.h
index ad51e699..cb124da7 100644
--- a/PG-PuReMD/src/grid.h
+++ b/PG-PuReMD/src/grid.h
@@ -24,10 +24,24 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Setup_New_Grid( reax_system*, control_params*, MPI_Comm );
+
 void Update_Grid( reax_system*, control_params*, MPI_Comm );
+
 void Bin_My_Atoms( reax_system*, reallocate_data* );
+
 void Reorder_My_Atoms( reax_system*, storage* );
+
 void Bin_Boundary_Atoms( reax_system* );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/hydrogen_bonds.c b/PG-PuReMD/src/hydrogen_bonds.c
index 5743feb5..dfd7abac 100644
--- a/PG-PuReMD/src/hydrogen_bonds.c
+++ b/PG-PuReMD/src/hydrogen_bonds.c
@@ -21,8 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
 #if defined(PURE_REAX)
   #include "hydrogen_bonds.h"
   #include "bond_orders.h"
@@ -37,6 +35,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 // DANIEL
 // This function is taken straight from PuReMD, with minimal changes to accommodate the new data structures
diff --git a/PG-PuReMD/src/hydrogen_bonds.h b/PG-PuReMD/src/hydrogen_bonds.h
index 346f0045..e4f58e10 100644
--- a/PG-PuReMD/src/hydrogen_bonds.h
+++ b/PG-PuReMD/src/hydrogen_bonds.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Hydrogen_Bonds( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/init_md.c b/PG-PuReMD/src/init_md.c
index 595724bc..2e406d1a 100644
--- a/PG-PuReMD/src/init_md.c
+++ b/PG-PuReMD/src/init_md.c
@@ -23,17 +23,6 @@
 
 #include <stddef.h>
 
-#ifdef HAVE_CUDA
-  #include "cuda_allocate.h"
-  #include "cuda_list.h"
-  #include "cuda_copy.h"
-  #include "cuda_forces.h"
-  #include "cuda_init_md.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_reset_tools.h"
-  #include "cuda_validation.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "init_md.h"
   #include "allocate.h"
@@ -239,76 +228,6 @@ int Init_System( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Init_System( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        mpi_datatypes *mpi_data, char *msg )
-{
-    int i, ret;
-    reax_atom *atom;
-    int nrecv[MAX_NBRS];
-
-    Setup_New_Grid( system, control, MPI_COMM_WORLD );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d GRID:\n", system->my_rank );
-    Print_Grid( &(system->my_grid), stderr );
-#endif
-
-    Bin_My_Atoms( system, &(workspace->realloc) );
-    Reorder_My_Atoms( system, workspace );
-
-    /* estimate N and total capacity */
-    for ( i = 0; i < MAX_NBRS; ++i )
-    {
-        nrecv[i] = 0;
-    }
-
-    MPI_Barrier( MPI_COMM_WORLD );
-    system->max_recved = 0;
-    system->N = SendRecv( system, mpi_data, mpi_data->boundary_atom_type, nrecv,
-            Estimate_Boundary_Atoms, Unpack_Estimate_Message, TRUE );
-    system->total_cap = MAX( (int)(system->N * SAFE_ZONE), MIN_CAP );
-    Bin_Boundary_Atoms( system );
-
-    /* Sync atoms here to continue the computation */
-    dev_alloc_system( system );
-    Sync_System( system );
-
-    /* estimate numH and Hcap */
-    Cuda_Reset_Atoms( system, control );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: n=%d local_cap=%d\n",
-             system->my_rank, system->n, system->local_cap );
-    fprintf( stderr, "p%d: N=%d total_cap=%d\n",
-             system->my_rank, system->N, system->total_cap );
-    fprintf( stderr, "p%d: numH=%d H_cap=%d\n",
-             system->my_rank, system->numH, system->Hcap );
-#endif
-
-    Cuda_Compute_Total_Mass( system, data, mpi_data->comm_mesh3D );
-
-    Cuda_Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );
-
-//    if( Reposition_Atoms( system, control, data, mpi_data, msg ) == FAILURE )
-//    {
-//        return FAILURE;
-//    }
-
-    /* initialize velocities so that desired init T can be attained */
-    if ( !control->restart || (control->restart && control->random_vel) )
-    {
-        Generate_Initial_Velocities( system, control->T_init );
-    }
-
-    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-    return SUCCESS;
-}
-#endif
-
-
 /************************ initialize simulation data ************************/
 void Init_Simulation_Data( reax_system *system, control_params *control,
         simulation_data *data, char *msg )
@@ -411,102 +330,6 @@ void Init_Simulation_Data( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
-        simulation_data *data, char *msg )
-{
-    dev_alloc_simulation_data( data );
-
-    Reset_Simulation_Data( data );
-
-    if ( !control->restart )
-    {
-        data->step = data->prev_steps = 0;
-    }
-
-    switch ( control->ensemble )
-    {
-    case NVE:
-        data->N_f = 3 * system->bigN;
-        Cuda_Evolve = Velocity_Verlet_NVE;
-        control->virial = 0;
-        break;
-
-    case bNVT:
-        data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
-        control->virial = 0;
-        break;
-
-    case nhNVT:
-        fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
-        data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-        control->virial = 0;
-        if ( !control->restart || (control->restart && control->random_vel) )
-        {
-            data->therm.G_xi = control->Tau_T *
-                               (2.0 * data->sys_en.e_kin - data->N_f * K_B * control->T );
-            data->therm.v_xi = data->therm.G_xi * control->dt;
-            data->therm.v_xi_old = 0;
-            data->therm.xi = 0;
-        }
-        break;
-
-    case sNPT: /* Semi-Isotropic NPT */
-        data->N_f = 3 * system->bigN + 4;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        if ( !control->restart )
-        {
-            Reset_Pressures( data );
-        }
-        break;
-
-    case iNPT: /* Isotropic NPT */
-        data->N_f = 3 * system->bigN + 2;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-        if ( !control->restart )
-        {
-            Reset_Pressures( data );
-        }
-        break;
-
-    case NPT: /* Anisotropic NPT */
-        data->N_f = 3 * system->bigN + 9;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
-        control->virial = 1;
-
-        fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
-              system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
-        break;
-
-    default:
-        fprintf( stderr, "p%d: init_simulation_data: ensemble not recognized\n",
-              system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD,  INVALID_INPUT );
-    }
-
-    /* initialize the timer(s) */
-    MPI_Barrier( MPI_COMM_WORLD );
-    if ( system->my_rank == MASTER_NODE )
-    {
-        data->timing.start = Get_Time( );
-
-#if defined(LOG_PERFORMANCE)
-        Reset_Timing( &data->timing );
-#endif
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "data->N_f: %8.3f\n", data->N_f );
-#endif
-}
-#endif
-
-
 #elif defined(LAMMPS_REAX)
 int Init_System( reax_system *system, char *msg )
 {
@@ -603,22 +426,6 @@ void Init_Workspace( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Init_Workspace( reax_system *system, control_params *control,
-        storage *workspace, char *msg )
-{
-    dev_alloc_workspace( system, control, dev_workspace,
-            system->local_cap, system->total_cap, msg );
-
-    memset( &(workspace->realloc), 0, sizeof(reallocate_data) );
-    Cuda_Reset_Workspace( system, workspace );
-
-    /* Initialize the Taper function */
-    Init_Taper( control, dev_workspace );
-}
-#endif
-
-
 /************** setup communication data structures  **************/
 int Init_MPI_Datatypes( reax_system *system, storage *workspace,
         mpi_datatypes *mpi_data, char *msg )
@@ -885,88 +692,6 @@ int Init_Lists( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Init_Lists( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        mpi_datatypes *mpi_data, char *msg )
-{
-    int ret;
-    int Htop;
-   
-    /* ignore returned error, as system->d_max_far_nbrs was not valid */
-    ret = Cuda_Estimate_Neighbors( system, data->step );
-
-    Dev_Make_List( system->total_cap, system->total_far_nbrs,
-            TYP_FAR_NEIGHBOR, *dev_lists + FAR_NBRS );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated far_nbrs: num_far=%d, space=%dMB\n",
-            system->my_rank, system->total_far_nbrs,
-            (int)(system->total_far_nbrs * sizeof(far_neighbor_data) / (1024 * 1024)) );
-    fprintf( stderr, "N: %d and total_cap: %d \n", system->N, system->total_cap );
-#endif
-
-    Cuda_Init_Neighbor_Indices( system );
-
-    Cuda_Generate_Neighbor_Lists( system, data, workspace, dev_lists );
-
-    /* estimate storage for bonds and hbonds */
-    Cuda_Estimate_Storages( system, control, dev_lists, &(dev_workspace->H), data->step );
-
-    /* estimate storage for charge sparse matrix */
-//    Cuda_Estimate_Storage_Sparse_Matrix( system, control, data, dev_lists );
-
-    dev_alloc_matrix( &(dev_workspace->H), system->total_cap, system->total_cm_entries );
-
-    Cuda_Init_Sparse_Matrix_Indices( system, &(dev_workspace->H) );
-
-    //MATRIX CHANGES
-    //workspace->L = NULL;
-    //workspace->U = NULL;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p:%d - allocated H matrix: max_entries: %d, cap: %d \n",
-            system->my_rank, system->total_cm_entries, dev_workspace->H.m );
-    fprintf( stderr, "p%d: allocated H matrix: Htop=%d, space=%dMB\n",
-            system->my_rank, Htop,
-            (int)(Htop * sizeof(sparse_matrix_entry) / (1024 * 1024)) );
-#endif
-
-    if ( control->hbond_cut > 0.0 &&  system->numH > 0 )
-    {
-        Dev_Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *dev_lists + HBONDS );
-//        Make_List( system->total_cap, system->total_hbonds, TYP_HBOND, *lists + HBONDS );
-
-        Cuda_Init_HBond_Indices( system );
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d: allocated hbonds: total_hbonds=%d, space=%dMB\n",
-                system->my_rank, system->total_hbonds,
-                (int)(system->total_hbonds * sizeof(hbond_data) / (1024 * 1024)) );
-#endif
-    }
-
-    /* bonds list */
-    Dev_Make_List( system->total_cap, system->total_bonds, TYP_BOND, *dev_lists + BONDS );
-//    Make_List( system->total_cap, system->total_bonds, TYP_BOND, *lists + BONDS );
-
-    Cuda_Init_Bond_Indices( system );
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d: allocated bonds: total_bonds=%d, space=%dMB\n",
-            system->my_rank, total_bonds,
-            (int)(total_bonds * sizeof(bond_data) / (1024 * 1024)) );
-#endif
-
-    /* 3bodies list: since a more accurate estimate of the num.
-     * of three body interactions requires that bond orders have
-     * been computed, delay estimation until for computation */
-
-    return SUCCESS;
-}
-#endif
-
-
 #if defined(PURE_REAX)
 void Initialize( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
@@ -1106,108 +831,6 @@ void Pure_Initialize( reax_system *system, control_params *control,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Initialize( reax_system *system, control_params *control,
-        simulation_data *data, storage *workspace,
-        reax_list **lists, output_controls *out_control,
-        mpi_datatypes *mpi_data )
-{
-    char msg[MAX_STR];
-    real t_start, t_end;
-
-    /* HOST/DEVICE SCRATCH */
-    Cuda_Init_ScratchArea( );
-
-    /* MPI_DATATYPES */
-    if ( Init_MPI_Datatypes( system, workspace, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: init_mpi_datatypes: could not create datatypes\n",
-                 system->my_rank );
-        fprintf( stderr, "p%d: mpi_data couldn't be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    /* SYSTEM */
-    if ( Cuda_Init_System( system, control, data, workspace, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-    /* GRID */
-    dev_alloc_grid( system );
-    Sync_Grid( &system->my_grid, &system->d_my_grid );
-
-    //validate_grid( system );
-
-    /* SIMULATION_DATA */
-    Cuda_Init_Simulation_Data( system, control, data, msg );
-
-    /* WORKSPACE */
-    Cuda_Init_Workspace( system, control, workspace, msg );
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized workspace\n", system->my_rank );
-#endif
-
-    //Sync the taper here from host to device.
-
-    /* CONTROL */
-    dev_alloc_control( control );
-
-    /* LISTS */
-    if ( Cuda_Init_Lists( system, control, data, workspace, lists, mpi_data, msg ) ==
-            FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: system could not be initialized! terminating.\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: initialized lists\n", system->my_rank );
-#endif
-
-    /* OUTPUT Files */
-    if ( Init_Output_Files( system, control, out_control, mpi_data, msg ) == FAILURE )
-    {
-        fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-        fprintf( stderr, "p%d: could not open output files! terminating...\n",
-                 system->my_rank );
-        MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: output files opened\n", system->my_rank );
-#endif
-
-    /* Lookup Tables */
-    if ( control->tabulate )
-    {
-        if ( Init_Lookup_Tables( system, control, dev_workspace->Tap, mpi_data, msg ) == FAILURE )
-        {
-            fprintf( stderr, "p%d: %s\n", system->my_rank, msg );
-            fprintf( stderr, "p%d: couldn't create lookup table! terminating.\n",
-                     system->my_rank );
-            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );
-        }
-
-#if defined(DEBUG)
-        fprintf( stderr, "p%d: initialized lookup tables\n", system->my_rank );
-#endif
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, "p%d: Device Initialization Done \n", system->my_rank );
-#endif
-}
-#endif
-
-
 #elif defined(LAMMPS_REAX)
 void Initialize( reax_system *system, control_params *control,
         simulation_data *data, storage *workspace,
diff --git a/PG-PuReMD/src/init_md.h b/PG-PuReMD/src/init_md.h
index 5a66e4fb..c5222cbd 100644
--- a/PG-PuReMD/src/init_md.h
+++ b/PG-PuReMD/src/init_md.h
@@ -25,14 +25,25 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Generate_Initial_Velocities( reax_system *, real );
+
+int Init_MPI_Datatypes( reax_system *, storage *, mpi_datatypes *, char * );
+
 void Initialize( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls*, mpi_datatypes* );
 
 void Pure_Initialize( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls*, mpi_datatypes* );
 
-void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
-        storage*, reax_list**, output_controls*, mpi_datatypes* );
+void Init_Taper( control_params *,  storage * );
+
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif
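
init_md.h now exports Generate_Initial_Velocities(), Init_MPI_Datatypes(), and Init_Taper(), the host helpers that the deleted Cuda_Init_* routines above called directly; declaring them inside the same extern "C" guard lets the relocated device-side initialization code keep calling them. A hypothetical consumer, mirroring the Generate_Initial_Velocities( system, control->T_init ) call from the removed Cuda_Init_System():

    // hypothetical .cu sketch; the include path is assumed, not taken from the build files
    #include "init_md.h"   // C prototypes, kept unmangled by the extern "C" guard

    void Cuda_Init_System_Sketch( reax_system *system, control_params *control )
    {
        /* reuse the host helper to draw the initial velocity distribution */
        Generate_Initial_Velocities( system, control->T_init );
    }
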
diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c
index 88b406b5..b0200897 100644
--- a/PG-PuReMD/src/integrate.c
+++ b/PG-PuReMD/src/integrate.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "integrate.h"
 
 #include "allocate.h"
@@ -33,14 +35,6 @@
 #include "tool_box.h"
 #include "vector.h"
 
-#ifdef HAVE_CUDA
-  #include "cuda_allocate.h"
-  #include "cuda_integrate.h"
-  #include "cuda_copy.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_reset_tools.h"
-#endif
-
 
 int Velocity_Verlet_NVE( reax_system* system, control_params* control,
         simulation_data *data, storage *workspace, reax_list **lists,
@@ -339,143 +333,6 @@ int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
-        simulation_data *data, storage *workspace, reax_list **lists,
-        output_controls *out_control, mpi_datatypes *mpi_data )
-{
-    int i, steps, renbr, ret;
-    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
-    real inv_m, dt, lambda;
-    rvec dx;
-    reax_atom *atom;
-    int *bond_top, *hb_top;
-    int Htop, num_3body;
-    int total_hbonds, count, total_bonds;
-    int bond_cap, cap_3body;
-    real t_over_start, t_over_elapsed;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
-    MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-    dt = control->dt;
-    steps = data->step - data->prev_steps;
-    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
-    ret = SUCCESS;
-
-    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
-
-    if ( verlet_part1_done == FALSE )
-    {
-        /* velocity verlet, 1st part */
-        bNVT_update_velocity_part1( system, dt );
-        verlet_part1_done = TRUE;
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        if ( renbr )
-        {
-            Update_Grid( system, control, mpi_data->world );
-        }
-
-        Output_Sync_Atoms( system );
-        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
-        Sync_Atoms( system );
-
-        /* synch the Grid to the Device here */
-        Sync_Grid( &system->my_grid, &system->d_my_grid );
-
-        init_blocks( system );
-
-#if defined(__CUDA_DEBUG_LOG__)
-        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
-                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
-#endif
-    }
-    
-    Cuda_Reset( system, control, data, workspace, lists );
-
-    if ( renbr )
-    {
-#if defined(DEBUG)
-        t_over_start  = Get_Time ();
-#endif
-
-        if ( estimate_nbrs_done == 0 )
-        {
-            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
-            ret = Cuda_Estimate_Neighbors( system, data->step );
-            estimate_nbrs_done = 1;
-        }
-
-        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
-        {
-            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
-            estimate_nbrs_done = 2;
-    
-#if defined(DEBUG)
-            t_over_elapsed  = Get_Timing_Info( t_over_start );
-            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
-                    system->my_rank, data->step, t_over_elapsed );
-#endif
-        }
-    }
-
-    if ( ret == SUCCESS )
-    {
-        ret = Cuda_Compute_Forces( system, control, data, workspace,
-                lists, out_control, mpi_data );
-    }
-
-    if ( ret == SUCCESS )
-    {
-        /* velocity verlet, 2nd part */
-        bNVT_update_velocity_part2( system, dt );
-
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        /* temperature scaler */
-        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-        lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-        if ( lambda < MIN_dT )
-        {
-            lambda = MIN_dT;
-        }
-        else if (lambda > MAX_dT )
-        {
-            lambda = MAX_dT;
-        }
-        lambda = SQRT( lambda );
-
-        /* Scale velocities and positions at t+dt */
-        bNVT_scale_velocities( system, lambda );
-
-        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
-
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "p%d @ step%d: scaled velocities\n",
-                 system->my_rank, data->step );
-        MPI_Barrier( MPI_COMM_WORLD );
-#endif
-
-        verlet_part1_done = FALSE;
-        estimate_nbrs_done = 0;
-    }
-
-    return ret;
-}
-#endif
-
-
 /* uses Berendsen-type coupling for both T and P.
  * All box dimensions are scaled by the same amount,
  * there is no change in the angles between axes. */
diff --git a/PG-PuReMD/src/integrate.h b/PG-PuReMD/src/integrate.h
index 63fa9cbf..9a25c761 100644
--- a/PG-PuReMD/src/integrate.h
+++ b/PG-PuReMD/src/integrate.h
@@ -24,6 +24,11 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls*, mpi_datatypes* );
 
@@ -49,9 +54,9 @@ int Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
                 output_controls*, mpi_datatypes* );
 */
 
-//CUDA SPECIFIC FUNCTIONS
-int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
-        simulation_data*, storage*, reax_list**, output_controls*,
-        mpi_datatypes* );
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/io_tools.c b/PG-PuReMD/src/io_tools.c
index c7c0f2fe..131f8a2e 100644
--- a/PG-PuReMD/src/io_tools.c
+++ b/PG-PuReMD/src/io_tools.c
@@ -20,7 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
   #include "io_tools.h"
   #include "basic_comm.h"
@@ -41,6 +41,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 print_interaction Print_Interactions[NUM_INTRS];
 
diff --git a/PG-PuReMD/src/io_tools.h b/PG-PuReMD/src/io_tools.h
index 6ae2d6d8..f83c9686 100644
--- a/PG-PuReMD/src/io_tools.h
+++ b/PG-PuReMD/src/io_tools.h
@@ -25,45 +25,71 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Output_Files( reax_system*, control_params*,
-                       output_controls*, mpi_datatypes*, char* );
+        output_controls*, mpi_datatypes*, char* );
+
 int Close_Output_Files( reax_system*, control_params*,
-                        output_controls*, mpi_datatypes* );
-
-void  Print_Box( simulation_box*, char*, FILE* );
-
-void  Print_Grid( grid*, FILE* );
-void  Print_GCell_Exchange_Bounds( int, neighbor_proc* );
-void  Print_Native_GCells( reax_system* );
-void  Print_All_GCells( reax_system*);
-
-void  Print_Init_Atoms( reax_system*, storage* );
-void  Print_My_Atoms( reax_system* );
-void  Print_My_Ext_Atoms( reax_system* );
-
-void  Print_Far_Neighbors( reax_system*, reax_list**, control_params *);
-void  Print_Sparse_Matrix( reax_system*, sparse_matrix* );
-void  Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* );
-void  Print_Linear_System( reax_system*, control_params*, storage*, int );
-void  Print_LinSys_Soln( reax_system*, real*, real*, real* );
-void  Print_Charges( reax_system* );
-void  Print_HBonds( reax_system*, reax_list**, control_params *, int );
-void  Print_HBond_Indices( reax_system*, reax_list**, control_params *, int );
-void  Print_Bonds( reax_system*, reax_list**, control_params *);
-void  Print_Bond_List2( reax_system*, reax_list*, char* );
-void  Print_Total_Force( reax_system*, simulation_data*, storage* );
-void  Output_Results( reax_system*, control_params*, simulation_data*,
-                      reax_list**, output_controls*, mpi_datatypes* );
+        output_controls*, mpi_datatypes* );
+
+void Print_Box( simulation_box*, char*, FILE* );
+
+void Print_Grid( grid*, FILE* );
+
+void Print_GCell_Exchange_Bounds( int, neighbor_proc* );
+
+void Print_Native_GCells( reax_system* );
+
+void Print_All_GCells( reax_system*);
+
+void Print_Init_Atoms( reax_system*, storage* );
+
+void Print_My_Atoms( reax_system* );
+
+void Print_My_Ext_Atoms( reax_system* );
+
+void Print_Far_Neighbors( reax_system*, reax_list**, control_params *);
+
+void Print_Sparse_Matrix( reax_system*, sparse_matrix* );
+
+void Print_Sparse_Matrix2( reax_system*, sparse_matrix*, char* );
+
+void Print_Linear_System( reax_system*, control_params*, storage*, int );
+
+void Print_LinSys_Soln( reax_system*, real*, real*, real* );
+
+void Print_Charges( reax_system* );
+
+void Print_HBonds( reax_system*, reax_list**, control_params *, int );
+
+void Print_HBond_Indices( reax_system*, reax_list**, control_params *, int );
+
+void Print_Bonds( reax_system*, reax_list**, control_params *);
+
+void Print_Bond_List2( reax_system*, reax_list*, char* );
+
+void Print_Total_Force( reax_system*, simulation_data*, storage* );
+
+void Output_Results( reax_system*, control_params*, simulation_data*,
+        reax_list**, output_controls*, mpi_datatypes* );
 
 #if defined(DEBUG_FOCUS) || defined(TEST_FORCES) || defined(TEST_ENERGY)
 void Debug_Marker_Bonded( output_controls*, int );
+
 void Debug_Marker_Nonbonded( output_controls*, int );
-void  Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                 simulation_data*, output_controls* );
-void  Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                simulation_data*, output_controls* );
-void  Print_Bond_List( reax_system*, control_params*, simulation_data*,
-                       reax_list**, output_controls* );
+
+void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
+        simulation_data*, output_controls* );
+
+void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
+        simulation_data*, output_controls* );
+
+void Print_Bond_List( reax_system*, control_params*, simulation_data*,
+        reax_list**, output_controls* );
+
 /*void Dummy_Printer( reax_system*, control_params*, simulation_data*,
             storage*, reax_list**, output_controls* );
 void Print_Bond_Orders( reax_system*, control_params*, simulation_data*,
@@ -89,23 +115,28 @@ void Print_Total_Force( reax_system*, control_params*, simulation_data*,
             storage*, reax_list**, output_controls* );
 void Compare_Total_Forces( reax_system*, control_params*, simulation_data*,
 storage*, reax_list**, output_controls* );*/
+
 //void  Print_Total_Force( reax_system*, control_params* );
+
 void Print_Force_Files( reax_system*, control_params*, simulation_data*,
-                        storage*, reax_list**, output_controls*,
-                        mpi_datatypes * );
+        storage*, reax_list**, output_controls*, mpi_datatypes * );
+
 //void Init_Force_Test_Functions( );
 
 int fn_qsort_intcmp( const void *, const void * );
 
 void Print_Far_Neighbors_List( reax_system*, reax_list**, control_params*,
-                               simulation_data*, output_controls* );
+        simulation_data*, output_controls* );
 
 void Print_Near_Neighbors_List( reax_system*, reax_list**, control_params*,
-                                simulation_data*, output_controls* );
+        simulation_data*, output_controls* );
 
 void Print_Bond_List( reax_system*, control_params*, simulation_data*,
-                      reax_list**, output_controls*);
+        reax_list**, output_controls*);
+#endif
 
+#ifdef __cplusplus
+}
 #endif
 
 
diff --git a/PG-PuReMD/src/lin_alg.c b/PG-PuReMD/src/lin_alg.c
index bac272a0..e9ce62e7 100644
--- a/PG-PuReMD/src/lin_alg.c
+++ b/PG-PuReMD/src/lin_alg.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "lin_alg.h"
 
 #include "basic_comm.h"
@@ -27,9 +29,7 @@
 #include "vector.h"
 
 #ifdef HAVE_CUDA
-  #include "cuda_lin_alg.h"
-  #include "cuda_utils.h"
-  #include "cuda_validation.h"
+  #include "cuda/cuda_validation.h"
 #endif
 
 #if defined(CG_PERFORMANCE)
@@ -100,13 +100,13 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 #endif
 
 #ifdef HAVE_CUDA
-    check_zeros_host (x, system->N, "x");
+    check_zeros_host( x, system->N, "x" );
 #endif
 
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
 
 #ifdef HAVE_CUDA
-    check_zeros_host (x, system->N, "x");
+    check_zeros_host( x, system->N, "x" );
 #endif
 
     dual_Sparse_MatVec( H, x, workspace->q2, N );
@@ -285,352 +285,6 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H, rvec2
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout,
-        simulation_data *data )
-{
-    int  i, j, n, N, matvecs, scale;
-    rvec2 tmp, alpha, beta;
-    rvec2 my_sum, norm_sqr, b_norm, my_dot;
-    rvec2 sig_old, sig_new;
-    MPI_Comm comm;
-    rvec2 *spad = (rvec2 *) host_scratch;
-    int a;
-
-    n = system->n;
-    N = system->N;
-    comm = mpi_data->world;
-    matvecs = 0;
-    scale = sizeof(rvec2) / sizeof(void);
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        matvecs = 0;
-        t_start = matvec_time = dot_time = 0;
-        t_start = Get_Time( );
-    }
-#endif
-
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Dist( system, mpi_data, workspace->x, mpi_data->mpi_rvec2, scale, rvec2_packer );
-//#endif
-
-//  check_zeros_device( x, system->N, "x" );
-
-    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyDeviceToHost, "CG:x:get" );
-    Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-    copy_host_device( spad, x, sizeof(rvec2) * system->total_cap, cudaMemcpyHostToDevice, "CG:x:put" );
-
-//  check_zeros_device( x, system->N, "x" );
-
-//  compare_rvec2 (workspace->x, x, N, "x");
-//  if (data->step > 0) {
-//      compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
-//      compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
-//
-//      exit (0);
-//  }
-
-
-//#ifdef __CUDA_DEBUG__
-//  dual_Sparse_MatVec( &workspace->H, workspace->x, workspace->q2, N );
-//#endif
-    //originally we were using only H->n which was system->n (init_md.c)
-    //Cuda_Dual_Matvec ( H, x, dev_workspace->q2, H->n, system->total_cap);
-    
-    Cuda_Dual_Matvec ( H, x, dev_workspace->q2, system->N, system->total_cap);
-
-//  compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-//  if (data->step > 0) exit (0);
-
-    // tryQEq
-    //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//  Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-    
-    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-            cudaMemcpyDeviceToHost, "CG:q2:get" );
-    Coll(system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
-    copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-            cudaMemcpyHostToDevice,"CG:q2:put" );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
-//#ifdef __CUDA_DEBUG__
-//  for( j = 0; j < system->n; ++j ) {
-//    // residual
-//    workspace->r2[j][0] = workspace->b[j][0] - workspace->q2[j][0];
-//    workspace->r2[j][1] = workspace->b[j][1] - workspace->q2[j][1];
-//    // apply diagonal pre-conditioner
-//    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//  }
-//#endif
-    
-    Cuda_CG_Diagonal_Preconditioner( dev_workspace, b, system->n );
-
-//  compare_rvec2 (workspace->r2, dev_workspace->r2, n, "r2");
-//  compare_rvec2 (workspace->d2, dev_workspace->d2, n, "d2");
-
-    /* norm of b */
-//#ifdef __CUDA_DEBUG__
-//  my_sum[0] = my_sum[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_sum[0] += SQR( workspace->b[j][0] );
-//    my_sum[1] += SQR( workspace->b[j][1] );
-//  }
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-//#endif
-
-    my_sum[0] = my_sum[1] = 0;
-    Cuda_Norm (b, n, my_sum);
-
-//  fprintf (stderr, "cg: my_sum[ %f, %f] \n", my_sum[0], my_sum[1]);
-
-    MPI_Allreduce( &my_sum, &norm_sqr, 2, MPI_DOUBLE, MPI_SUM, comm );
-    b_norm[0] = SQRT( norm_sqr[0] );
-    b_norm[1] = SQRT( norm_sqr[1] );
-    //fprintf( stderr, "bnorm = %f %f\n", b_norm[0], b_norm[1] );
-
-    /* dot product: r.d */
-//#ifdef __CUDA_DEBUG__
-//  my_dot[0] = my_dot[1] = 0;
-//  for( j = 0; j < n; ++j ) {
-//    my_dot[0] += workspace->r2[j][0] * workspace->d2[j][0];
-//    my_dot[1] += workspace->r2[j][1] * workspace->d2[j][1];
-//  }
-//  fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-    my_dot[0] = my_dot[1] = 0;
-    Cuda_Dot (dev_workspace->r2, dev_workspace->d2, my_dot, n);
-
-// fprintf( stderr, "my_dot: %f %f\n", my_dot[0], my_dot[1] );
-    
-    MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-    //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &dot_time );
-    }
-#endif
-
-    for ( i = 1; i < 300; ++i )
-    {
-        //MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Dist(system,mpi_data,workspace->d2,mpi_data->mpi_rvec2,scale,rvec2_packer);
-//#endif
-        
-        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cg:d2:get" );
-        Dist( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_packer );
-        copy_host_device( spad, dev_workspace->d2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyHostToDevice, "cg:d2:put" );
-
-        //print_device_rvec2 (dev_workspace->d2, N);
-
-//#ifdef __CUDA_DEBUG__
-//    dual_Sparse_MatVec( &workspace->H, workspace->d2, workspace->q2, N );
-//#endif
-        
-        Cuda_Dual_Matvec( H, dev_workspace->d2, dev_workspace->q2, system->N,
-                system->total_cap );
-
-        /*
-        fprintf (stderr, "******************* Device sparse Matrix--------> %d \n", H->n );
-        fprintf (stderr, " ******* HOST SPARSE MATRIX ******** \n");
-        print_sparse_matrix_host (&workspace->H);
-        fprintf (stderr, " ******* HOST Vector ***************\n");
-        print_host_rvec2 (workspace->d2, system->N);
-        fprintf (stderr, " ******* Device SPARSE MATRIX ******** \n");
-        print_sparse_matrix (&dev_workspace->H);
-        fprintf (stderr, " ******* Device Vector ***************\n");
-        print_device_rvec2 (dev_workspace->d2, system->N);
-        */
-        //compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-        // tryQEq
-        // MVAPICH2
-//#ifdef __CUDA_DEBUG__
-//    Coll(system,mpi_data,workspace->q2,mpi_data->mpi_rvec2,scale,rvec2_unpacker);
-//#endif
-
-        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cg:q2:get" );
-        Coll( system, mpi_data, spad, mpi_data->mpi_rvec2, scale, rvec2_unpacker );
-        copy_host_device( spad, dev_workspace->q2, sizeof(rvec2) * system->total_cap,
-                cudaMemcpyHostToDevice, "cg:q2:put" );
-
-//       compare_rvec2 (workspace->q2, dev_workspace->q2, N, "q2");
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
-        /* dot product: d.q */
-//#ifdef __CUDA_DEBUG__
-//    my_dot[0] = my_dot[1] = 0;
-//    for( j = 0; j < n; ++j ) {
-//      my_dot[0] += workspace->d2[j][0] * workspace->q2[j][0];
-//      my_dot[1] += workspace->d2[j][1] * workspace->q2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
-        Cuda_Dot (dev_workspace->d2, dev_workspace->q2, my_dot, n);
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-        MPI_Allreduce( &my_dot, &tmp, 2, MPI_DOUBLE, MPI_SUM, comm );
-        //fprintf( stderr, "tmp: %f %f\n", tmp[0], tmp[1] );
-
-        alpha[0] = sig_new[0] / tmp[0];
-        alpha[1] = sig_new[1] / tmp[1];
-        my_dot[0] = my_dot[1] = 0;
-
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // update x
-//      workspace->x[j][0] += alpha[0] * workspace->d2[j][0];
-//      workspace->x[j][1] += alpha[1] * workspace->d2[j][1];
-//      // update residual
-//      workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0];
-//      workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1];
-//      // apply diagonal pre-conditioner
-//      workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-//      workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-//      // dot product: r.p
-//      my_dot[0] += workspace->r2[j][0] * workspace->p2[j][0];
-//      my_dot[1] += workspace->r2[j][1] * workspace->p2[j][1];
-//    }
-//       fprintf( stderr, "H:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-//#endif
-
-        my_dot[0] = my_dot[1] = 0;
-        Cuda_DualCG_Preconditioner( dev_workspace, x, alpha, system->n, my_dot );
-
-        //fprintf( stderr, "D:my_dot: %f %f\n", my_dot[0], my_dot[1] );
-
-//   compare_rvec2 (workspace->x, dev_workspace->x, N, "x");
-//   compare_rvec2 (workspace->r2, dev_workspace->r2, N, "r2");
-//   compare_rvec2 (workspace->p2, dev_workspace->p2, N, "p2");
-
-        sig_old[0] = sig_new[0];
-        sig_old[1] = sig_new[1];
-        MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
-
-        //fprintf( stderr, "DEVICE:sig_new: %f %f\n", sig_new[0], sig_new[1] );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
-
-        if ( SQRT(sig_new[0]) / b_norm[0] <= tol || SQRT(sig_new[1]) / b_norm[1] <= tol )
-        {
-            break;
-        }
-
-        beta[0] = sig_new[0] / sig_old[0];
-        beta[1] = sig_new[1] / sig_old[1];
-
-//#ifdef __CUDA_DEBUG__
-//    for( j = 0; j < system->n; ++j ) {
-//      // d = p + beta * d
-//      workspace->d2[j][0] = workspace->p2[j][0] + beta[0] * workspace->d2[j][0];
-//      workspace->d2[j][1] = workspace->p2[j][1] + beta[1] * workspace->d2[j][1];
-//    }
-//#endif
-
-        Cuda_Vector_Sum_Rvec2( dev_workspace->d2, dev_workspace->p2, beta,
-                dev_workspace->d2, system->n );
-
-//       compare_rvec2 (workspace->d2, dev_workspace->d2, N, "q2");
-    }
-
-
-    if ( SQRT(sig_new[0]) / b_norm[0] <= tol )
-    {
-        //for( j = 0; j < n; ++j )
-        //  workspace->t[j] = workspace->x[j][1];
-        //fprintf (stderr, "Getting started with Cuda_CG1 \n");
-
-        Cuda_RvecCopy_From( dev_workspace->t, dev_workspace->x, 1, system->n );
-
-        //compare_array (workspace->b_t, dev_workspace->b_t, system->n, "b_t");
-        //compare_array (workspace->t, dev_workspace->t, system->n, "t");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_t, tol, dev_workspace->t,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG1: iterations --> %d \n", matvecs );
-        //for( j = 0; j < n; ++j )
-        //  workspace->x[j][1] = workspace->t[j];
-
-        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->t, 1, system->n );
-    }
-    else if ( SQRT(sig_new[1]) / b_norm[1] <= tol )
-    {
-        //for( j = 0; j < n; ++j )
-        //  workspace->s[j] = workspace->x[j][0];
-
-        Cuda_RvecCopy_From( dev_workspace->s, dev_workspace->x, 0, system->n );
-
-        //compare_array (workspace->s, dev_workspace->s, system->n, "s");
-        //compare_array (workspace->b_s, dev_workspace->b_s, system->n, "b_s");
-
-        //fprintf (stderr, "Getting started with Cuda_CG2 \n");
-
-        matvecs = Cuda_CG( system, workspace, H, dev_workspace->b_s, tol, dev_workspace->s,
-                mpi_data, fout );
-
-        //fprintf (stderr, " Cuda_CG2: iterations --> %d \n", matvecs );
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->x[j][0] = workspace->s[j];
-
-        Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
-    }
-
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
-                system->my_rank, i );
-        fprintf( stderr, "    [INFO] s lin solve error: %f\n", SQRT(sig_new[0]) / b_norm[0] );
-        fprintf( stderr, "    [INFO] t lin solve error: %f\n", SQRT(sig_new[1]) / b_norm[1] );
-    }
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
-                i + 1, matvecs, matvec_time, dot_time );
-    }
-#endif
-
-    return (i + 1) + matvecs;
-}
-#endif
-
-
 void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
 {
     int  i, j, k, si;
@@ -745,153 +399,6 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
 }
 
 
-#ifdef HAVE_CUDA
-int Cuda_CG( reax_system *system, storage *workspace, sparse_matrix *H, real
-        *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
-{
-    int  i, j, scale;
-    real tmp, alpha, beta, b_norm;
-    real sig_old, sig_new, sig0;
-    real *spad = (real *) host_scratch;
-
-    scale = sizeof(real) / sizeof(void);
-
-    /* x is on the device */
-    //MVAPICH2
-    memset( spad, 0, sizeof(real) * system->total_cap );
-    copy_host_device( spad, x, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:x:get" );
-    Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
-
-    //MVAPICH2
-    copy_host_device( spad, x, sizeof(real) * system->total_cap,
-            cudaMemcpyHostToDevice, "cuda_cg:x:put" );
-    Cuda_Matvec( H, x, dev_workspace->q, system->N, system->total_cap );
-
-    // tryQEq
-    // MVAPICH2
-    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
-    Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
-
-    //MVAPICH2
-    copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-            cudaMemcpyHostToDevice, "cuda_cg:q:put" );
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &matvec_time );
-    }
-#endif
-
-    Cuda_Vector_Sum( dev_workspace->r , 1.,  b, -1., dev_workspace->q,
-            system->n );
-    //for( j = 0; j < system->n; ++j )
-    //  workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
-    Cuda_CG_Preconditioner( dev_workspace->d, dev_workspace->r,
-            dev_workspace->Hdia_inv, system->n );
-
-    //TODO do the parallel_norm on the device for the local sum
-    copy_host_device( spad, b, sizeof(real) * system->n,
-            cudaMemcpyDeviceToHost, "cuda_cg:b:get" );
-    b_norm = Parallel_Norm( spad, system->n, mpi_data->world );
-
-    //TODO do the parallel dot on the device for the local sum
-    copy_host_device( spad, dev_workspace->r, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
-    copy_host_device( spad + system->total_cap, dev_workspace->d, sizeof(real) * system->total_cap,
-            cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
-    sig_new = Parallel_Dot( spad, spad + system->total_cap, system->n,
-            mpi_data->world );
-
-    sig0 = sig_new;
-
-#if defined(CG_PERFORMANCE)
-    if ( system->my_rank == MASTER_NODE )
-    {
-        Update_Timing_Info( &t_start, &dot_time );
-    }
-#endif
-
-    for ( i = 1; i < 300 && SQRT(sig_new) / b_norm > tol; ++i )
-    {
-        //MVAPICH2
-        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
-        Dist( system, mpi_data, spad, MPI_DOUBLE, scale, real_packer );
-        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->total_cap,
-                cudaMemcpyHostToDevice, "cuda_cg:d:put" );
-
-        Cuda_Matvec( H, dev_workspace->d, dev_workspace->q, system->N, system->total_cap );
-
-        //tryQEq
-        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
-        Coll( system, mpi_data, spad, MPI_DOUBLE, scale, real_unpacker );
-        copy_host_device( spad, dev_workspace->q, sizeof(real) * system->total_cap,
-                cudaMemcpyHostToDevice, "cuda_cg:q:get" );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &matvec_time );
-        }
-#endif
-
-        //TODO do the parallel dot on the device for the local sum
-        copy_host_device( spad, dev_workspace->d, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:d:get" );
-        copy_host_device( spad + system->n, dev_workspace->q, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:q:get" );
-        tmp = Parallel_Dot( spad, spad + system->n, system->n, mpi_data->world );
-
-        alpha = sig_new / tmp;
-        //Cuda_Vector_Add( x, alpha, dev_workspace->d, system->n );
-        Cuda_Vector_Sum( x, alpha, dev_workspace->d, 1.0, x, system->n );
-
-        //Cuda_Vector_Add( workspace->r, -alpha, workspace->q, system->n );
-        Cuda_Vector_Sum( dev_workspace->r, -alpha, dev_workspace->q, 1.0,
-                dev_workspace->r, system->n );
-        /* pre-conditioning */
-        //for( j = 0; j < system->n; ++j )
-        //  workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-        Cuda_CG_Preconditioner( dev_workspace->p, dev_workspace->r,
-                dev_workspace->Hdia_inv, system->n );
-
-        sig_old = sig_new;
-
-        //TODO do the parallel dot on the device for the local sum
-        copy_host_device( spad, dev_workspace->r, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:r:get" );
-        copy_host_device( spad + system->n, dev_workspace->p, sizeof(real) * system->n,
-                cudaMemcpyDeviceToHost, "cuda_cg:p:get" );
-        sig_new = Parallel_Dot( spad , spad + system->n, system->n, mpi_data->world );
-        //fprintf (stderr, "Device: sig_new: %f \n", sig_new );
-
-        beta = sig_new / sig_old;
-        Cuda_Vector_Sum( dev_workspace->d, 1., dev_workspace->p, beta,
-                dev_workspace->d, system->n );
-
-#if defined(CG_PERFORMANCE)
-        if ( system->my_rank == MASTER_NODE )
-        {
-            Update_Timing_Info( &t_start, &dot_time );
-        }
-#endif
-    }
-
-    if ( i >= 300 )
-    {
-        fprintf( stderr, "CG convergence failed!\n" );
-        return i;
-    }
-
-    return i;
-}
-#endif
-
-
 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H, real
         *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
 {
diff --git a/PG-PuReMD/src/lin_alg.h b/PG-PuReMD/src/lin_alg.h
index f401fb2d..3663978e 100644
--- a/PG-PuReMD/src/lin_alg.h
+++ b/PG-PuReMD/src/lin_alg.h
@@ -24,23 +24,32 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int GMRES( reax_system*, storage*, sparse_matrix*,
-           real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
 int GMRES_HouseHolder( reax_system*, storage*, sparse_matrix*,
-                       real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
 int dual_CG( reax_system*, storage*, sparse_matrix*,
-             rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
+        rvec2*, real, rvec2*, mpi_datatypes*, FILE*, simulation_data *);
+
 int CG( reax_system*, storage*, sparse_matrix*,
         real*, real, real*, mpi_datatypes*, FILE* );
+
 int PCG( reax_system*, storage*, sparse_matrix*, real*, real,
-         sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
+        sparse_matrix*, sparse_matrix*, real*, mpi_datatypes*, FILE* );
+
 int sCG( reax_system*, storage*, sparse_matrix*,
-         real*, real, real*, mpi_datatypes*, FILE* );
+        real*, real, real*, mpi_datatypes*, FILE* );
+
+#ifdef __cplusplus
+}
+#endif
 
-//CUDA Functions
-int Cuda_dual_CG( reax_system*, storage*, sparse_matrix*,
-                  rvec2*, real, rvec2*, mpi_datatypes*, FILE* , simulation_data *);
-int Cuda_CG( reax_system*, storage*, sparse_matrix*,
-             real*, real, real*, mpi_datatypes*, FILE* );
 
 #endif
diff --git a/PG-PuReMD/src/list.c b/PG-PuReMD/src/list.c
index 05213cb3..69736afb 100644
--- a/PG-PuReMD/src/list.c
+++ b/PG-PuReMD/src/list.c
@@ -22,11 +22,11 @@
 #include "reax_types.h"
 
 #if defined(PURE_REAX)
-#include "list.h"
-#include "tool_box.h"
+  #include "list.h"
+  #include "tool_box.h"
 #elif defined(LAMMPS_REAX)
-#include "reax_list.h"
-#include "reax_tool_box.h"
+  #include "reax_list.h"
+  #include "reax_tool_box.h"
 #endif
 
 
diff --git a/PG-PuReMD/src/list.h b/PG-PuReMD/src/list.h
index 1f29f5f8..df6ec82f 100644
--- a/PG-PuReMD/src/list.h
+++ b/PG-PuReMD/src/list.h
@@ -24,17 +24,21 @@
 
 #include "reax_types.h"
 
+
 #ifdef _cplusplus
 extern "C" {
 #endif
 
-
 void Print_List( reax_list* );
 
 void Make_List( int, int, int, reax_list* );
 
 void Delete_List( reax_list* );
 
+#ifdef _cplusplus
+}
+#endif
+
 #if defined(LAMMPS_REAX) || defined(PURE_REAX)
 static inline int Num_Entries( int i, reax_list *l )
 {
@@ -60,12 +64,7 @@ static inline void Set_End_Index( int i, int val, reax_list *l )
 {
     l->end_index[i] = val;
 }
-
 #endif
 
 
-#ifdef _cplusplus
-}
-#endif
-
 #endif
diff --git a/PG-PuReMD/src/lookup.c b/PG-PuReMD/src/lookup.c
index 2c6652f9..b071ea89 100644
--- a/PG-PuReMD/src/lookup.c
+++ b/PG-PuReMD/src/lookup.c
@@ -21,12 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
-#ifdef HAVE_CUDA
-  #include "cuda_lookup.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "lookup.h"
   #include "nonbonded.h"
@@ -37,6 +31,12 @@
   #include "reax_tool_box.h"
 #endif
 
+#include "index_utils.h"
+
+#ifdef HAVE_CUDA
+  #include "cuda/cuda_lookup.h"
+#endif
+
 
 /* Fills solution into x. Warning: will modify c and d! */
 void Tridiagonal_Solve( const real *a, const real *b,
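
lookup.c shows the other half of the reorganization: device headers are now referenced through the cuda/ path prefix, only when HAVE_CUDA is defined, and after the host headers have been included. A sketch of the pattern for a hypothetical host module bar.c (the bar names are placeholders):

    /* bar.c -- hypothetical host module with an optional device counterpart */
    #include "reax_types.h"

    #include "bar.h"

    #include "tool_box.h"

    #ifdef HAVE_CUDA
      #include "cuda/cuda_bar.h"   /* device-side declarations now live under src/cuda/ */
    #endif
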
diff --git a/PG-PuReMD/src/lookup.h b/PG-PuReMD/src/lookup.h
index f6e45bd1..4db34ce0 100644
--- a/PG-PuReMD/src/lookup.h
+++ b/PG-PuReMD/src/lookup.h
@@ -26,7 +26,17 @@
 
 //extern LR_lookup_table **LR;
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Lookup_Tables( reax_system*, control_params*, real *,
-                        mpi_datatypes*, char* );
+        mpi_datatypes*, char* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/multi_body.c b/PG-PuReMD/src/multi_body.c
index b480d3bb..aab4957d 100644
--- a/PG-PuReMD/src/multi_body.c
+++ b/PG-PuReMD/src/multi_body.c
@@ -32,6 +32,7 @@
   #include "reax_list.h"
   #include "reax_vector.h"
 #endif
+
 #include "index_utils.h"
 
 
diff --git a/PG-PuReMD/src/multi_body.h b/PG-PuReMD/src/multi_body.h
index aaed59e5..9cc865b4 100644
--- a/PG-PuReMD/src/multi_body.h
+++ b/PG-PuReMD/src/multi_body.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Atom_Energy( reax_system*, control_params*, simulation_data*,
-                  storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/neighbors.c b/PG-PuReMD/src/neighbors.c
index 753ecc36..e938329a 100644
--- a/PG-PuReMD/src/neighbors.c
+++ b/PG-PuReMD/src/neighbors.c
@@ -19,14 +19,16 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "neighbors.h"
+
+#include "index_utils.h"
 #include "io_tools.h"
 #include "list.h"
 #include "tool_box.h"
 #include "vector.h"
 
-#include "index_utils.h"
-
 
 int compare_far_nbrs( const void *p1, const void *p2 )
 {
diff --git a/PG-PuReMD/src/neighbors.h b/PG-PuReMD/src/neighbors.h
index 0a1e3daf..37c3642b 100644
--- a/PG-PuReMD/src/neighbors.h
+++ b/PG-PuReMD/src/neighbors.h
@@ -31,8 +31,18 @@
                      int, int*, int*, int*, int,
                      int, int, real, rvec, ivec );*/
 
-void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*,
-                              reax_list** );
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Generate_Neighbor_Lists( reax_system*, simulation_data*, storage*, reax_list** );
+
 int Estimate_NumNeighbors( reax_system*, reax_list** );
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/nonbonded.c b/PG-PuReMD/src/nonbonded.c
index 8edd2b11..e073ec62 100644
--- a/PG-PuReMD/src/nonbonded.c
+++ b/PG-PuReMD/src/nonbonded.c
@@ -20,7 +20,7 @@
   ----------------------------------------------------------------------*/
 
 #include "reax_types.h"
-#include "index_utils.h"
+
 #if defined(PURE_REAX)
   #include "nonbonded.h"
   #include "bond_orders.h"
@@ -34,10 +34,12 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void vdW_Coulomb_Energy( reax_system *system, control_params *control,
-                         simulation_data *data, storage *workspace,
-                         reax_list **lists, output_controls *out_control )
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control )
 {
     int i, j, pj, natoms;
     int start_i, end_i, orig_i, orig_j;
diff --git a/PG-PuReMD/src/nonbonded.h b/PG-PuReMD/src/nonbonded.h
index 81613be5..45137bf8 100644
--- a/PG-PuReMD/src/nonbonded.h
+++ b/PG-PuReMD/src/nonbonded.h
@@ -24,14 +24,24 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
-                         storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
 
 void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*,
-                                   simulation_data*, storage*,
-                                   reax_list**, output_controls* );
+        simulation_data*, storage*, reax_list**, output_controls* );
 
 void Compute_Polarization_Energy( reax_system*, simulation_data* );
 
 void LR_vdW_Coulomb( reax_system*, real *, int, int, real, LR_data* );
+
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif
diff --git a/PG-PuReMD/src/parallelreax.c b/PG-PuReMD/src/parallelreax.c
index 4d677687..30c23722 100644
--- a/PG-PuReMD/src/parallelreax.c
+++ b/PG-PuReMD/src/parallelreax.c
@@ -40,13 +40,13 @@
 #include "vector.h"
 
 #ifdef HAVE_CUDA
-  #include "cuda_copy.h"
-  #include "cuda_environment.h"
-  #include "cuda_neighbors.h"
-  #include "cuda_post_evolve.h"
-  #include "cuda_reset_tools.h"
-  #include "cuda_utils.h"
-  #include "cuda_validation.h"
+  #include "cuda/cuda_copy.h"
+  #include "cuda/cuda_environment.h"
+  #include "cuda/cuda_neighbors.h"
+  #include "cuda/cuda_post_evolve.h"
+  #include "cuda/cuda_reset_tools.h"
+  #include "cuda/cuda_utils.h"
+  #include "cuda/cuda_validation.h"
 #endif
 
 evolve_function Evolve;
@@ -156,25 +156,6 @@ int Cuda_Post_Evolve( reax_system* system, control_params* control,
 #endif
 
 
-#ifdef HAVE_CUDA
-void init_blocks( reax_system *system )
-{
-    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->n );
-    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
-
-    compute_blocks( &BLOCKS_N, &BLOCK_SIZE, system->N );
-    compute_nearest_pow_2( BLOCKS_N, &BLOCKS_POW_2_N );
-
-    compute_matvec_blocks( &MATVEC_BLOCKS, system->N );
-
-#if defined(__CUDA_DEBUG_LOG__)
-    fprintf( stderr, " MATVEC_BLOCKS: %d BLOCKSIZE: %d  - N:%d \n",
-            MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, system->N );
-#endif
-}
-#endif
-
-
 static void usage( char* argv[] )
 {
     fprintf( stderr, "usage: ./%s geometry ffield control\n", argv[0] );
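init_blocks(), removed here, derived CUDA launch geometry (block counts, their next power of two, and matvec blocks) from the local and global atom counts n and N; the helpers it called presumably remain with the relocated CUDA sources. As an illustration of the arithmetic such helpers perform, a hypothetical sketch (not the project's API):

    /* Hypothetical sketch: ceiling-divide to cover n work items, then round
     * the block count up to the next power of two for reduction-style kernels. */
    static void compute_launch_params( int n, int block_size,
            int *blocks, int *blocks_pow_2 )
    {
        int b, p;

        b = (n + block_size - 1) / block_size;

        for ( p = 1; p < b; p <<= 1 )
            ;

        *blocks = b;
        *blocks_pow_2 = p;
    }
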
diff --git a/PG-PuReMD/src/random.c b/PG-PuReMD/src/random.c
index 2811a6b5..ffe55458 100644
--- a/PG-PuReMD/src/random.c
+++ b/PG-PuReMD/src/random.c
@@ -19,6 +19,8 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "random.h"
 
 /* System random number generator used linear congruance method with
diff --git a/PG-PuReMD/src/random.h b/PG-PuReMD/src/random.h
index a3ce3526..66a5d59d 100644
--- a/PG-PuReMD/src/random.h
+++ b/PG-PuReMD/src/random.h
@@ -24,19 +24,28 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-double Random(double);
+double Random( double );
 
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
-void Randomize();
+void Randomize( );
 
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
-double GRandom(double, double);
+double GRandom( double, double );
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif
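The three prototypes above form a small seed-then-draw interface. A usage sketch based on the declarations and their comments (the numeric arguments are arbitrary examples):

    double u, g;

    Randomize( );              /* seed from the current time, once at startup */
    u = Random( 10.0 );        /* pseudo-random real in [0, 10) */
    g = GRandom( 0.0, 1.0 );   /* Gaussian sample with mean 0.0, sigma 1.0 */
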
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index c39277b8..38810bd6 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -96,6 +96,14 @@
 #define FABS   fabs
 #define FMOD   fmod
 
+/* transcendental constant pi */
+#if defined(M_PI)
+  /* GNU C library (libc), defined in math.h */
+  #define PI (M_PI)
+#else
+  #define PI            3.14159265
+#endif
+
 #define SQR(x)        ((x)*(x))
 #define CUBE(x)       ((x)*(x)*(x))
 #define DEG2RAD(a)    ((a)*PI/180.0)
@@ -104,13 +112,6 @@
 #define MIN(x,y)      (((x) < (y)) ? (x) : (y))
 #define MAX3(x,y,z)   MAX( MAX(x,y), z)
 
-/* transcendental constant pi */
-#if defined(M_PI)
-  /* GNU C library (libc), defined in math.h */
-  #define PI (M_PI)
-#else
-  #define PI            3.14159265
-#endif
 /* ??? */
 #define C_ele          332.06371
 /* ??? */
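Hoisting the PI block places the constant beside the angle macros that reference it; the fallback is needed because M_PI is a POSIX/BSD extension of math.h rather than part of ISO C. In isolation the pattern is:

    #include <math.h>          /* provides M_PI on POSIX/BSD toolchains */

    #if defined(M_PI)
      #define PI (M_PI)        /* prefer the library's full-precision value */
    #else
      #define PI 3.14159265    /* portable fallback when M_PI is unavailable */
    #endif

    #define DEG2RAD(a) ((a) * PI / 180.0)
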
diff --git a/PG-PuReMD/src/reset_tools.c b/PG-PuReMD/src/reset_tools.c
index a605cc79..c3778145 100644
--- a/PG-PuReMD/src/reset_tools.c
+++ b/PG-PuReMD/src/reset_tools.c
@@ -21,8 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
-
 #if defined(PURE_REAX)
   #include "reset_tools.h"
   #include "list.h"
@@ -35,6 +33,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 
 void Reset_Atoms( reax_system* system, control_params *control )
 {
diff --git a/PG-PuReMD/src/reset_tools.h b/PG-PuReMD/src/reset_tools.h
index 34f38760..001b7f57 100644
--- a/PG-PuReMD/src/reset_tools.h
+++ b/PG-PuReMD/src/reset_tools.h
@@ -24,11 +24,11 @@
 
 #include "reax_types.h"
 
+
 #ifdef __cplusplus
 extern "C"  {
 #endif
 
-
 void Reset_Pressures( simulation_data* );
 
 void Reset_Simulation_Data( simulation_data* );
@@ -49,9 +49,9 @@ void Reset( reax_system*, control_params*, simulation_data*, storage*, reax_list
 void Reset_Test_Forces( reax_system*, storage* );
 #endif
 
-
 #ifdef __cplusplus
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/restart.c b/PG-PuReMD/src/restart.c
index 967e025d..6b8ddcdf 100644
--- a/PG-PuReMD/src/restart.c
+++ b/PG-PuReMD/src/restart.c
@@ -19,7 +19,10 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
+#include "reax_types.h"
+
 #include "restart.h"
+
 #include "allocate.h"
 #include "box.h"
 #include "tool_box.h"
diff --git a/PG-PuReMD/src/restart.h b/PG-PuReMD/src/restart.h
index 39a5dcd5..3d13a5a1 100644
--- a/PG-PuReMD/src/restart.h
+++ b/PG-PuReMD/src/restart.h
@@ -24,6 +24,7 @@
 
 #include "reax_types.h"
 
+
 #define RESTART_HEADER "%8d%12d%8.3f%8.3f%8.3f%8.3f%8.3f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n%15.5f%15.5f%15.5f\n"
 #define RESTART_HEADER_LINE_LEN 200
 /* step, system->bigN, data->therm.T, data->therm.xi,
@@ -39,16 +40,26 @@
 #define READ_RESTART_HEADER " %d %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf"
 #define READ_RESTART_LINE " %d %d %s %lf %lf %lf %lf %lf %lf"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Write_Binary_Restart( reax_system*, control_params*,
-                           simulation_data*, output_controls*, mpi_datatypes* );
+        simulation_data*, output_controls*, mpi_datatypes* );
 
 void Write_Restart( reax_system*, control_params*,
-                    simulation_data*, output_controls*, mpi_datatypes* );
+        simulation_data*, output_controls*, mpi_datatypes* );
 
 void Read_Binary_Restart( char*, reax_system*, control_params*,
-                          simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
 
 void Read_Restart( char*, reax_system*, control_params*,
-                   simulation_data*, storage*, mpi_datatypes* );
+        simulation_data*, storage*, mpi_datatypes* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/system_props.c b/PG-PuReMD/src/system_props.c
index e2852a4b..ea4465c5 100644
--- a/PG-PuReMD/src/system_props.c
+++ b/PG-PuReMD/src/system_props.c
@@ -21,10 +21,6 @@
 
 #include "reax_types.h"
 
-#ifdef HAVE_CUDA
-  #include "cuda_system_props.h"
-#endif
-
 #if defined(PURE_REAX)
   #include "system_props.h"
   #include "tool_box.h"
@@ -35,6 +31,10 @@
   #include "reax_vector.h"
 #endif
 
+#ifdef HAVE_CUDA
+  #include "cuda/cuda_system_props.h"
+#endif
+
 
 void Temperature_Control( control_params *control, simulation_data *data )
 {
@@ -83,29 +83,6 @@ void Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
         data->my_en.e_kin += 0.5 * rvec_Dot( p, system->my_atoms[i].v );
     }
 
-    MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
-                   1, MPI_DOUBLE, MPI_SUM, comm );
-
-    data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
-
-    // avoid T being an absolute zero, might cause F.P.E!
-    if ( FABS(data->therm.T) < ALMOST_ZERO )
-        data->therm.T = ALMOST_ZERO;
-}
-
-
-#ifdef HAVE_CUDA
-void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
-        MPI_Comm comm )
-{
-    int i;
-    rvec p;
-    real m;
-
-    data->my_en.e_kin = 0.0;
-
-    dev_compute_kinetic_energy( system, data, &data->my_en.e_kin );
-
     MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
             1, MPI_DOUBLE, MPI_SUM, comm );
 
@@ -117,7 +94,6 @@ void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
         data->therm.T = ALMOST_ZERO;
     }
 }
-#endif
 
 
 void Compute_System_Energy( reax_system *system, simulation_data *data,
@@ -130,7 +106,7 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
 
 #ifdef HAVE_CUDA
     //Cuda Wrapper here
-    dev_sync_simulation_data ( data );
+    dev_sync_simulation_data( data );
 #endif
 
     my_en[0] = data->my_en.e_bond;
@@ -205,23 +181,6 @@ void Compute_Total_Mass( reax_system *system, simulation_data *data,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
-        MPI_Comm comm  )
-{
-    int  i;
-    real tmp;
-
-    //compute local total mass of the system
-    dev_compute_total_mass( system, &tmp );
-
-    MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
-
-    data->inv_M = 1. / data->M;
-}
-#endif
-
-
 void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         mpi_datatypes *mpi_data, MPI_Comm comm )
 {
@@ -342,112 +301,6 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 }
 
 
-#ifdef HAVE_CUDA
-void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
-        mpi_datatypes *mpi_data, MPI_Comm comm )
-{
-    int i;
-    real m, det; //xx, xy, xz, yy, yz, zz;
-    real tmp_mat[6], tot_mat[6];
-    rvec my_xcm, my_vcm, my_amcm, my_avcm;
-    rvec tvec, diff;
-    rtensor mat, inv;
-
-    rvec_MakeZero( my_xcm );  // position of CoM
-    rvec_MakeZero( my_vcm );  // velocity of CoM
-    rvec_MakeZero( my_amcm ); // angular momentum of CoM
-    rvec_MakeZero( my_avcm ); // angular velocity of CoM
-
-    /* Compute the position, vel. and ang. momentum about the centre of mass */
-    dev_compute_momentum ( system, my_xcm, my_vcm, my_amcm );
-
-    MPI_Allreduce( my_xcm, data->xcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-    MPI_Allreduce( my_vcm, data->vcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-    MPI_Allreduce( my_amcm, data->amcm, 3, MPI_DOUBLE, MPI_SUM, comm );
-
-    rvec_Scale( data->xcm, data->inv_M, data->xcm );
-    rvec_Scale( data->vcm, data->inv_M, data->vcm );
-    rvec_Cross( tvec, data->xcm, data->vcm );
-    rvec_ScaledAdd( data->amcm, -data->M, tvec );
-    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-
-    /* Calculate and then invert the inertial tensor */
-    for ( i = 0; i < 6; ++i )
-    {
-        tmp_mat[i] = 0;
-    }
-
-    dev_compute_inertial_tensor( system, tmp_mat, my_xcm );
-
-    MPI_Reduce( tmp_mat, tot_mat, 6, MPI_DOUBLE, MPI_SUM, MASTER_NODE, comm );
-
-    if ( system->my_rank == MASTER_NODE )
-    {
-        mat[0][0] = tot_mat[3] + tot_mat[5];  // yy + zz;
-        mat[0][1] = mat[1][0] = -tot_mat[1];  // -xy;
-        mat[0][2] = mat[2][0] = -tot_mat[2];  // -xz;
-        mat[1][1] = tot_mat[0] + tot_mat[5];  // xx + zz;
-        mat[2][1] = mat[1][2] = -tot_mat[4];  // -yz;
-        mat[2][2] = tot_mat[0] + tot_mat[3];  // xx + yy;
-
-        /* invert the inertial tensor */
-        det = ( mat[0][0] * mat[1][1] * mat[2][2] +
-                mat[0][1] * mat[1][2] * mat[2][0] +
-                mat[0][2] * mat[1][0] * mat[2][1] ) -
-              ( mat[0][0] * mat[1][2] * mat[2][1] +
-                mat[0][1] * mat[1][0] * mat[2][2] +
-                mat[0][2] * mat[1][1] * mat[2][0] );
-
-        inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-        inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-        inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-        inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-        inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-        inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-        inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-        inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-        inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-
-        if ( det > ALMOST_ZERO )
-        {
-            rtensor_Scale( inv, 1. / det, inv );
-        }
-        else
-        {
-            rtensor_MakeZero( inv );
-        }
-
-        /* Compute the angular velocity about the centre of mass */
-        rtensor_MatVec( data->avcm, inv, data->amcm );
-    }
-
-    MPI_Bcast( data->avcm, 3, MPI_DOUBLE, MASTER_NODE, comm );
-
-    /* Compute the rotational energy */
-    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-
-#if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",
-             data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n",
-             data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n",
-             data->amcm[0], data->amcm[1], data->amcm[2] );
-    /* fprintf( stderr, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2],
-       mat[1][0], mat[1][1], mat[1][2],
-       mat[2][0], mat[2][1], mat[2][2] );
-       fprintf( stderr, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2],
-       inv[1][0], inv[1][1], inv[1][2],
-       inv[2][0], inv[2][1], inv[2][2] ); */
-    fprintf( stderr, "avcm: %24.15e %24.15e %24.15e\n",
-             data->avcm[0], data->avcm[1], data->avcm[2] );
-#endif
-}
-#endif
-
-
 /* IMPORTANT: This function assumes that current kinetic energy
  * the system is already computed
  *
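For reference, the host kinetic-energy path kept above sums one-half p_i . v_i per local atom, reduces the sum with MPI_Allreduce, and converts it to a temperature; in the code's notation (K_B the Boltzmann constant, N_f the degrees of freedom):

    E_{kin} = \sum_i \tfrac{1}{2}\, \mathbf{p}_i \cdot \mathbf{v}_i ,
    \qquad
    T = \frac{2\, E_{kin}}{N_f \, k_B} ,

with T clamped away from zero afterwards to avoid a floating-point exception downstream.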
diff --git a/PG-PuReMD/src/system_props.h b/PG-PuReMD/src/system_props.h
index 5efff3c5..f04a9590 100644
--- a/PG-PuReMD/src/system_props.h
+++ b/PG-PuReMD/src/system_props.h
@@ -24,6 +24,11 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C"  {
+#endif
+
 void Temperature_Control( control_params*, simulation_data* );
 
 void Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
@@ -33,16 +38,16 @@ void Compute_System_Energy( reax_system*, simulation_data*, MPI_Comm );
 void Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
 
 void Compute_Center_of_Mass( reax_system*, simulation_data*,
-                             mpi_datatypes*, MPI_Comm );
+        mpi_datatypes*, MPI_Comm );
 
 void Compute_Pressure( reax_system*, control_params*,
-                       simulation_data*, mpi_datatypes* );
+        simulation_data*, mpi_datatypes* );
+
 //void Compute_Pressure( reax_system*, simulation_data* );
 
-//CUDA Functions
-void Cuda_Compute_Total_Mass( reax_system*, simulation_data*, MPI_Comm );
-void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
-void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
-                                  mpi_datatypes*, MPI_Comm );
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/tool_box.h b/PG-PuReMD/src/tool_box.h
index ecaee197..a1f55910 100644
--- a/PG-PuReMD/src/tool_box.h
+++ b/PG-PuReMD/src/tool_box.h
@@ -29,7 +29,6 @@
 extern "C" {
 #endif
 
-
 /* from comm_tools.h */
 int SumScan( int, int, int, MPI_Comm );
 
@@ -76,7 +75,6 @@ void *scalloc( size_t, size_t, const char* );
 
 void sfree( void*, const char* );
 
-
 #ifdef __cplusplus
 }
 #endif
@@ -227,4 +225,5 @@ static inline real DistSqr_to_Special_Point( rvec cp, rvec x )
 }
 #endif
 
+
 #endif
diff --git a/PG-PuReMD/src/torsion_angles.c b/PG-PuReMD/src/torsion_angles.c
index 58e71f4e..29cfb444 100644
--- a/PG-PuReMD/src/torsion_angles.c
+++ b/PG-PuReMD/src/torsion_angles.c
@@ -21,7 +21,6 @@
 
 #include "reax_types.h"
 
-#include "index_utils.h"
 #if defined(PURE_REAX)
   #include "torsion_angles.h"
   #include "bond_orders.h"
@@ -36,6 +35,8 @@
   #include "reax_vector.h"
 #endif
 
+#include "index_utils.h"
+
 #define MIN_SINE 1e-10
 
 
diff --git a/PG-PuReMD/src/torsion_angles.h b/PG-PuReMD/src/torsion_angles.h
index d0762a4e..454f0679 100644
--- a/PG-PuReMD/src/torsion_angles.h
+++ b/PG-PuReMD/src/torsion_angles.h
@@ -24,7 +24,17 @@
 
 #include "reax_types.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Torsion_Angles( reax_system*, control_params*, simulation_data*,
-                     storage*, reax_list**, output_controls* );
+        storage*, reax_list**, output_controls* );
+
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/traj.c b/PG-PuReMD/src/traj.c
index d561a45f..b7ba1112 100644
--- a/PG-PuReMD/src/traj.c
+++ b/PG-PuReMD/src/traj.c
@@ -32,7 +32,7 @@
 #endif
 
 #ifdef HAVE_CUDA
-  #include "cuda_copy.h"
+  #include "cuda/cuda_copy.h"
 #endif
 
 
diff --git a/PG-PuReMD/src/traj.h b/PG-PuReMD/src/traj.h
index 8f09c4a7..13435ecb 100644
--- a/PG-PuReMD/src/traj.h
+++ b/PG-PuReMD/src/traj.h
@@ -22,10 +22,8 @@
 #ifndef __TRAJ_H__
 #define __TRAJ_H__
 
-
 #include "reax_types.h"
 
-
 #define MAX_TRJ_LINE_LEN     120
 #define MAX_TRJ_BUFFER_SIZE  (MAX_TRJ_LINE_LEN * 100)
 
@@ -80,6 +78,10 @@ enum ANGLE_LINE_OPTS
 };
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 int Init_Traj( reax_system*, control_params*, output_controls*, mpi_datatypes*, char* );
 
 int End_Traj( int, output_controls* );
@@ -87,5 +89,9 @@ int End_Traj( int, output_controls* );
 int Append_Frame( reax_system*, control_params*, simulation_data*, reax_list**,
         output_controls*, mpi_datatypes* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/valence_angles.h b/PG-PuReMD/src/valence_angles.h
index c7a56eaa..1958b0cb 100644
--- a/PG-PuReMD/src/valence_angles.h
+++ b/PG-PuReMD/src/valence_angles.h
@@ -25,6 +25,10 @@
 #include "reax_types.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void Valence_Angles( reax_system*, control_params*, simulation_data*,
         storage*, reax_list**, output_controls* );
 
@@ -32,5 +36,9 @@ void Calculate_Theta( rvec, real, rvec, real, real*, real* );
 
 void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
 
+#ifdef __cplusplus
+}
+#endif
+
 
 #endif
diff --git a/PG-PuReMD/src/vector.h b/PG-PuReMD/src/vector.h
index adfe7da2..14250909 100644
--- a/PG-PuReMD/src/vector.h
+++ b/PG-PuReMD/src/vector.h
@@ -136,6 +136,7 @@ CUDA_HOST_DEVICE static inline void rvec_Copy( rvec dest, rvec src )
     dest[2] = src[2];
 }
 
+
 CUDA_HOST_DEVICE static inline void rvec_Scale( rvec ret, real c, rvec v )
 {
     ret[0] = c * v[0];
@@ -497,8 +498,8 @@ CUDA_HOST_DEVICE static inline void rtensor_Transpose( rtensor ret, rtensor t )
 CUDA_HOST_DEVICE static inline real rtensor_Det( rtensor t )
 {
     return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-             t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-             t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+            t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+            t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
 }
 
 
-- 
GitLab