diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index 1a9148165035654b79216d6f008ba12a92cfbdf5..f4da052ff99380ba05833075878a643fbdaca999 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -18,29 +18,33 @@ NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict
 #NVCCFLAGS += --ptxas-options -v
 
 bin_PROGRAMS = bin/puremd-gpu
-bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/restart.c src/param.c src/pdb_tools.c \
-	src/GMRES.cu src/QEq.cu src/allocate.cu src/bond_orders.cu \
-	src/box.cu src/forces.cu src/four_body_interactions.cu \
+bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
+	src/restart.c src/param.c src/pdb_tools.c src/box.c \
+	src/GMRES.cu src/QEq.cu src/allocate.c src/bond_orders.cu \
+	src/forces.cu src/four_body_interactions.cu \
 	src/grid.cu src/init_md.cu src/integrate.cu src/list.cu \
 	src/lookup.cu src/neighbors.cu \
 	src/reset_utils.cu src/single_body_interactions.cu \
-	src/system_props.cu src/three_body_interactions.cu \
-	src/traj.cu src/two_body_interactions.cu src/vector.cu \
-	src/testmd.cu \
+	src/system_props.c src/three_body_interactions.cu \
+	src/traj.c src/two_body_interactions.cu src/vector.c \
 	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/reduction.cu \
-	src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu
-
+	src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu \
+        src/cuda_allocate.cu \
+        src/cuda_system_props.cu \
+	src/testmd.c
 include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
-        src/restart.h src/param.h src/pdb_tools.h \
+        src/restart.h src/param.h src/pdb_tools.h src/box.h \
 	src/GMRES.h src/QEq.h src/allocate.h src/bond_orders.h \
-	src/box.h src/forces.h src/four_body_interactions.h \
+	src/forces.h src/four_body_interactions.h \
 	src/grid.h src/init_md.h src/integrate.h src/list.h \
 	src/lookup.h src/neighbors.h \
 	src/reset_utils.h src/single_body_interactions.h \
 	src/system_props.h src/three_body_interactions.h \
 	src/traj.h src/two_body_interactions.h src/vector.h \
 	src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/reduction.h \
-	src/center_mass.h src/helpers.h src/validation.h src/matvec.h
+	src/center_mass.h src/helpers.h src/validation.h src/matvec.h \
+        src/cuda_allocate.h \
+        src/cuda_system_props.h
 
 # dummy source to cause C linking
 nodist_EXTRA_bin_puremd_gpu_SOURCES = src/dummy.c
diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c
new file mode 100644
index 0000000000000000000000000000000000000000..b776a6a783b23d39b5843bff1f66e5c8d19aa8aa
--- /dev/null
+++ b/PuReMD-GPU/src/allocate.c
@@ -0,0 +1,277 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "allocate.h"
+
+#include "list.h"
+
+
+void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
+{
+    Delete_List( far_nbrs, TYP_HOST );
+    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_HOST )){
+        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+        exit( INIT_ERR );
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
+            num_intrs, far_nbrs->num_intrs );  
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
+            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
+#endif
+}
+
+
+int Allocate_Matrix( sparse_matrix *H, int n, int m )
+{
+    H->n = n;
+    H->m = m;
+    if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
+        return 0;
+
+    if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
+        return 0;
+
+    if( (H->entries = 
+                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
+        return 0;
+
+    return 1;
+}
+
+
+void Deallocate_Matrix( sparse_matrix *H )
+{
+    free(H->start);
+    free(H->entries);
+    free(H->end);
+}
+
+
+int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
+{
+    Deallocate_Matrix( H );
+    if( !Allocate_Matrix( H, n, m ) ) {
+        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
+        exit( 1 );
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
+            name, n, m );
+    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
+            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
+#endif
+    return 1;
+}
+
+
+int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
+        list *hbonds )
+{
+    int i, num_hbonds;
+
+    num_hbonds = 0;
+    /* find starting indexes for each H and the total number of hbonds */
+    for( i = 1; i < n; ++i )
+        hb_top[i] += hb_top[i-1];
+    num_hbonds = hb_top[n-1];
+
+    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds, TYP_HOST ) ) {
+        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    for( i = 0; i < n; ++i )
+        if( h_index[i] == 0 ){
+            Set_Start_Index( 0, 0, hbonds ); 
+            Set_End_Index( 0, 0, hbonds ); 
+        }
+        else if( h_index[i] > 0 ){
+            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
+            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
+        }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
+    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
+            num_hbonds * sizeof(hbond_data) / (1024*1024) );
+#endif
+    return 1;
+}
+
+
+int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
+{
+    int i;
+    int *hb_top;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating hbonds\n" );
+#endif
+    hb_top = (int *)calloc( n, sizeof(int) );
+    for( i = 0; i < n; ++i )
+        if( h_index[i] >= 0 )
+            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
+
+    Delete_List( hbonds, TYP_HOST );
+
+    Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
+
+    free( hb_top );
+
+    return 1;
+}
+
+
+int Allocate_Bond_List( int n, int *bond_top, list *bonds )
+{
+    int i, num_bonds;
+
+    num_bonds = 0;
+    /* find starting indexes for each atom and the total number of bonds */
+    for( i = 1; i < n; ++i )
+        bond_top[i] += bond_top[i-1];
+    num_bonds = bond_top[n-1];
+
+    if( !Make_List(n, num_bonds, TYP_BOND, bonds, TYP_HOST ) ) {
+        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    Set_Start_Index( 0, 0, bonds ); 
+    Set_End_Index( 0, 0, bonds ); 
+    for( i = 1; i < n; ++i ) {
+        Set_Start_Index( i, bond_top[i-1], bonds ); 
+        Set_End_Index( i, bond_top[i-1], bonds ); 
+    }
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
+            num_bonds * sizeof(bond_data) / (1024*1024) );
+#endif
+    return 1;
+}
+
+
+int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
+{
+    int i;
+    int *bond_top;
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating bonds\n" );
+#endif
+    bond_top = (int *)calloc( n, sizeof(int) );
+    *est_3body = 0;
+    for( i = 0; i < n; ++i ){
+        *est_3body += SQR( Num_Entries( i, bonds ) );
+        bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
+    }
+
+    Delete_List( bonds, TYP_HOST );
+
+    Allocate_Bond_List( n, bond_top, bonds );
+    *num_bonds = bond_top[n-1];
+
+    free( bond_top );
+
+    return 1;
+}
+
+
+void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
+        int nbr_flag )
+{
+    int num_bonds, est_3body;
+    reallocate_data *realloc;
+    grid *g;
+
+    realloc = &(workspace->realloc);
+    g = &(system->g);
+
+    if( realloc->num_far > 0 && nbr_flag ) {
+        fprintf (stderr, " Reallocating neighbors \n");
+        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
+                system->N, realloc->num_far * SAFE_ZONE );
+        realloc->num_far = -1;
+    }
+
+    if( realloc->Htop > 0 ){
+        fprintf (stderr, " Reallocating Matrix \n");
+        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
+        realloc->Htop = -1;
+
+        Deallocate_Matrix( &workspace->L );
+        Deallocate_Matrix( &workspace->U );
+    }
+
+    if( realloc->hbonds > 0 ){
+        fprintf (stderr, " Reallocating hbonds \n");
+        Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
+                (*lists)+HBONDS );
+        realloc->hbonds = -1;
+    }
+
+    num_bonds = est_3body = -1;
+    if( realloc->bonds > 0 ){
+        fprintf (stderr, " Reallocating bonds \n");
+        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
+        realloc->bonds = -1;
+        realloc->num_3body = MAX( realloc->num_3body, est_3body );
+    }
+
+    if( realloc->num_3body > 0 ) {
+        fprintf (stderr, " Reallocating 3Body \n");
+        Delete_List( (*lists)+THREE_BODIES, TYP_HOST );
+
+        if( num_bonds == -1 )
+            num_bonds = ((*lists)+BONDS)->num_intrs;
+        realloc->num_3body *= SAFE_ZONE;
+
+        if( !Make_List( num_bonds, realloc->num_3body,
+                    TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST ) ) {
+            fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+            exit( INIT_ERR );
+        }
+        realloc->num_3body = -1;
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "reallocating 3 bodies\n" );
+        fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
+        fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
+        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
+                realloc->num_3body*sizeof(three_body_interaction_data)/
+                (1024*1024) );
+#endif
+    }
+
+    if( realloc->gcell_atoms > -1 ){
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
+#endif
+
+        free (g->atoms);
+        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
+                sizeof (int) * workspace->realloc.gcell_atoms);
+        realloc->gcell_atoms = -1;
+    }
+}
diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h
index 7ab146bbb763073377b2ffb16905b6694a85765d..4ec4b541ec2f764cbbf583c4262b2793253a4e99 100644
--- a/PuReMD-GPU/src/allocate.h
+++ b/PuReMD-GPU/src/allocate.h
@@ -32,13 +32,5 @@ int Allocate_HBond_List( int, int, int*, int*, list* );
 
 int Allocate_Bond_List( int, int*, list* );
 
-//Cuda Functions
-int Cuda_Allocate_Matrix( sparse_matrix*, int, int );
-int Cuda_Allocate_HBond_List( int, int, int*, int*, list* );
-int Cuda_Allocate_Bond_List( int, int*, list* );
-void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int );
-
-GLOBAL void Init_HBond_Indexes ( int *, int *, list , int  );
-GLOBAL void Init_Bond_Indexes ( int *, list , int  );
 
 #endif
diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.cu
index d39f1b4158f1bbf3ae8ad6bf86ef06bee972832f..bb72417b05808ec0006104a4713f7bffa226a066 100644
--- a/PuReMD-GPU/src/bond_orders.cu
+++ b/PuReMD-GPU/src/bond_orders.cu
@@ -36,6 +36,7 @@ inline real Cf45( real p1, real p2 )
         ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) );
 }
 
+
 #ifdef TEST_FORCES
 void Get_dBO( reax_system *system, list **lists, 
         int i, int pj, real C, rvec *v )
@@ -66,7 +67,8 @@ void Get_dBOpinpi2( reax_system *system, list **lists,
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
 
-    for( k = start_pj; k < end_pj; ++k ) {
+    for( k = start_pj; k < end_pj; ++k )
+    {
         dbo_k = &(dBOs->select.dbo_list[k]);
         rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi );
         rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 );
@@ -179,7 +181,6 @@ void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C )
 }
 
 
-
 HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, 
         list p_bonds, list p_dBOs, int *top )
 {
@@ -367,7 +368,6 @@ HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace,
 #endif
 
 
-
 void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, 
         simulation_data *data, static_storage *workspace, 
         list **lists )
@@ -521,10 +521,10 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
        temp[0], temp[1], temp[2] ); */
 }
 
+
 /////////////////////////////////////////////////////////////
 //Cuda Functions
 /////////////////////////////////////////////////////////////
-
 HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, 
         simulation_data *data, static_storage *workspace, 
         list *bonds )
@@ -646,6 +646,7 @@ HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms,
     rvec_Add( data->ext_press, ext_press );
 }
 
+
 /////////////////////////////////////////////////////////////
 //Cuda Functions
 /////////////////////////////////////////////////////////////
@@ -761,6 +762,7 @@ void Add_dBond_to_Forces( int i, int pj, reax_system *system,
     /*3rd, dBOpi2*/
 }
 
+
 HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, 
         static_storage *workspace, list *bonds )
 {
@@ -890,6 +892,7 @@ HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms,
     }
 }
 
+
 HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds)
 {
     int pk;
@@ -910,6 +913,7 @@ HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list
     }
 }
 
+
 /* Locate j on i's list.
    This function assumes that j is there for sure!
    And this is the case given our method of neighbor generation*/
@@ -1377,7 +1381,6 @@ GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *atoms, global_paramete
 /* A very important and crucial assumption here is that each segment
    belonging to a different atom in nbrhoods->nbr_list is sorted in its own.
    This can either be done in the general coordinator function or here */
-
 GLOBAL void Cuda_Calculate_Bond_Orders (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
         two_body_parameters *tbp, static_storage workspace, list bonds,
         list dDeltas, list dBOs, int num_atom_types, int N )
@@ -1398,7 +1401,6 @@ GLOBAL void Cuda_Calculate_Bond_Orders (  reax_atom *atoms, global_parameters g_
     bond_order_data *bo_ij, *bo_ji;
     single_body_parameters *sbp_i, *sbp_j;
 
-
 #if defined(TEST_FORCES)
     int  k, pk, start_j, end_j;
     int  top_dbo=0, top_dDelta=0;
@@ -1433,7 +1435,6 @@ GLOBAL void Cuda_Calculate_Bond_Orders (  reax_atom *atoms, global_parameters g_
 
     // fprintf( stderr, "done with uncorrected bond orders\n" );
 
-
     /* Corrected Bond Order calculations */
     //for( i = 0; i < system->N; ++i ) {
     type_i = atoms[i].type;
@@ -1778,6 +1779,7 @@ workspace.dDelta_lp_temp[j] = workspace.Clp[j];
 #endif
 }
 
+
 GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage workspace, list bonds, int N )
 {
     int i, j, pj;
@@ -1812,6 +1814,7 @@ GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage workspace, list bonds,
     }
 }
 
+
 GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
         static_storage workspace, int N )
 {
@@ -1867,8 +1870,8 @@ GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *atoms, global_p
 
 }
 
-//Import from the forces file. 
 
+//Import from the forces file. 
 GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, 
         static_storage workspace, list p_bonds, int ensemble, int N)
 {
@@ -1889,6 +1892,7 @@ GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data,
     }
 }
 
+
 GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, 
         static_storage workspace, list p_bonds, int ensemble, int N)
 {
diff --git a/PuReMD-GPU/src/box.cu b/PuReMD-GPU/src/box.c
similarity index 100%
rename from PuReMD-GPU/src/box.cu
rename to PuReMD-GPU/src/box.c
diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h
index ed8cc9f4e1900a71f5a6b319f98e7d425400f928..4d97e4242ffbfc9e0711710ae1de4a1d479cccea 100644
--- a/PuReMD-GPU/src/box.h
+++ b/PuReMD-GPU/src/box.h
@@ -21,8 +21,10 @@
 #ifndef __BOX_H__
 #define __BOX_H__
 
+
 #include "mytypes.h"
 
+
 /* Initializes box from CRYST1 line of PDB */
 void Init_Box_From_CRYST(real, real, real, real, real, real,
                          simulation_box*/*, int*/);
@@ -98,7 +100,6 @@ void Print_Box_Information( simulation_box*, FILE* );
 
 //CUDA Device Functions
 //HOST_DEVICE inline void Inc_on_T3( rvec, rvec, simulation_box* );
-
 HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
 {
     int i;
diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/cuda_allocate.cu
similarity index 66%
rename from PuReMD-GPU/src/allocate.cu
rename to PuReMD-GPU/src/cuda_allocate.cu
index bae0dde872fa34ccf102076e099b348a29f52685..5f2b322bd3dafe1e901833cdfc5ae2cf808ec7c5 100644
--- a/PuReMD-GPU/src/allocate.cu
+++ b/PuReMD-GPU/src/cuda_allocate.cu
@@ -18,27 +18,17 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "allocate.h"
+#include "cuda_allocate.h"
+
 #include "list.h"
 
 #include "cuda_utils.h"
 #include "reduction.h"
 
-void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
-{
-    Delete_List( far_nbrs, TYP_HOST );
-    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_HOST )){
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        exit( INIT_ERR );
-    }
 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
-            num_intrs, far_nbrs->num_intrs );  
-    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
-#endif
-}
+GLOBAL void Init_HBond_Indexes ( int *, int *, list , int  );
+GLOBAL void Init_Bond_Indexes ( int *, list , int  );
+
 
 void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
@@ -57,23 +47,6 @@ void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 }
 
 
-int Allocate_Matrix( sparse_matrix *H, int n, int m )
-{
-    H->n = n;
-    H->m = m;
-    if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
-
-    if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
-
-    if( (H->entries = 
-                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
-        return 0;
-
-    return 1;
-}
-
 int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m )
 {
     H->n = n;
@@ -87,13 +60,6 @@ int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m )
 }
 
 
-void Deallocate_Matrix( sparse_matrix *H )
-{
-    free(H->start);
-    free(H->entries);
-    free(H->end);
-}
-
 void Cuda_Deallocate_Matrix( sparse_matrix *H )
 {
     cuda_free(H->start, RES_SPARSE_MATRIX_INDEX);
@@ -106,23 +72,6 @@ void Cuda_Deallocate_Matrix( sparse_matrix *H )
 }
 
 
-int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
-{
-    Deallocate_Matrix( H );
-    if( !Allocate_Matrix( H, n, m ) ) {
-        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
-        exit( 1 );
-    }
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
-            name, n, m );
-    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
-            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
-#endif
-    return 1;
-}
-
 int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
 {
     Cuda_Deallocate_Matrix( H );
@@ -142,56 +91,6 @@ int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
 }
 
 
-int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
-        list *hbonds )
-{
-    int i, num_hbonds;
-
-    num_hbonds = 0;
-    /* find starting indexes for each H and the total number of hbonds */
-    for( i = 1; i < n; ++i )
-        hb_top[i] += hb_top[i-1];
-    num_hbonds = hb_top[n-1];
-
-    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds, TYP_HOST ) ) {
-        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-        exit( INIT_ERR );
-    }
-
-    for( i = 0; i < n; ++i )
-        if( h_index[i] == 0 ){
-            Set_Start_Index( 0, 0, hbonds ); 
-            Set_End_Index( 0, 0, hbonds ); 
-        }
-        else if( h_index[i] > 0 ){
-            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
-            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
-        }
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
-    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-            num_hbonds * sizeof(hbond_data) / (1024*1024) );
-#endif
-    return 1;
-}
-
-GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N )
-{
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (index >= N) return;
-
-    if( h_index[index] == 0 ){
-        Set_Start_Index( 0, 0, &hbonds ); 
-        Set_End_Index( 0, 0, &hbonds ); 
-    }
-    else if( h_index[index] > 0 ){
-        Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); 
-        Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); 
-    }
-}
-
 int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list *hbonds )
 {
     int i, num_hbonds;
@@ -225,27 +124,6 @@ int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list
     return 1;
 }
 
-int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
-{
-    int i;
-    int *hb_top;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "reallocating hbonds\n" );
-#endif
-    hb_top = (int *)calloc( n, sizeof(int) );
-    for( i = 0; i < n; ++i )
-        if( h_index[i] >= 0 )
-            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
-
-    Delete_List( hbonds, TYP_HOST );
-
-    Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
-
-    free( hb_top );
-
-    return 1;
-}
 
 int Cuda_Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 {
@@ -281,21 +159,6 @@ int Cuda_Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
     return 1;
 }
 
-GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N )
-{
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (index >= N) return;
-
-    if( index == 0 ){
-        Set_Start_Index( 0, 0, &bonds ); 
-        Set_End_Index( 0, 0, &bonds ); 
-    }
-    else if( index > 0 ){
-        Set_Start_Index( index, b_top[index-1], &bonds ); 
-        Set_End_Index( index, b_top[index-1], &bonds ); 
-    }
-}
 
 int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds )
 {
@@ -326,93 +189,6 @@ int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds )
 }
 
 
-int Allocate_Bond_List( int n, int *bond_top, list *bonds )
-{
-    int i, num_bonds;
-
-    num_bonds = 0;
-    /* find starting indexes for each atom and the total number of bonds */
-    for( i = 1; i < n; ++i )
-        bond_top[i] += bond_top[i-1];
-    num_bonds = bond_top[n-1];
-
-    if( !Make_List(n, num_bonds, TYP_BOND, bonds, TYP_HOST ) ) {
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        exit( INIT_ERR );
-    }
-
-    Set_Start_Index( 0, 0, bonds ); 
-    Set_End_Index( 0, 0, bonds ); 
-    for( i = 1; i < n; ++i ) {
-        Set_Start_Index( i, bond_top[i-1], bonds ); 
-        Set_End_Index( i, bond_top[i-1], bonds ); 
-    }
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
-    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-            num_bonds * sizeof(bond_data) / (1024*1024) );
-#endif
-    return 1;
-}
-
-
-int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
-{
-    int i;
-    int *bond_top;
-
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "reallocating bonds\n" );
-#endif
-    bond_top = (int *)calloc( n, sizeof(int) );
-    *est_3body = 0;
-    for( i = 0; i < n; ++i ){
-        *est_3body += SQR( Num_Entries( i, bonds ) );
-        bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
-    }
-
-    Delete_List( bonds, TYP_HOST );
-
-    Allocate_Bond_List( n, bond_top, bonds );
-    *num_bonds = bond_top[n-1];
-
-    free( bond_top );
-
-    return 1;
-}
-
-void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n)
-{
-    extern __shared__ int sh_input[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0;
-
-    if(i < n)
-    {
-        x = SQR (Num_Entries( i, &bonds ) );
-        bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS );
-    }
-    sh_input[threadIdx.x] = x;
-    __syncthreads();
-
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if(threadIdx.x < offset)
-        {   
-            sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
-        }   
-
-        __syncthreads();
-    }
-
-    if(threadIdx.x == 0)
-    {
-        per_block_results[blockIdx.x] = sh_input[0];
-    }
-}
-
-
 int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body )
 {
     int i;
@@ -450,6 +226,7 @@ int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body )
     return i;
 }
 
+
 int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count )
 {
     int i;
@@ -542,83 +319,6 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH );
  */
 
 
-void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
-        int nbr_flag )
-{
-    int num_bonds, est_3body;
-    reallocate_data *realloc;
-    grid *g;
-
-    realloc = &(workspace->realloc);
-    g = &(system->g);
-
-    if( realloc->num_far > 0 && nbr_flag ) {
-        fprintf (stderr, " Reallocating neighbors \n");
-        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
-                system->N, realloc->num_far * SAFE_ZONE );
-        realloc->num_far = -1;
-    }
-
-    if( realloc->Htop > 0 ){
-        fprintf (stderr, " Reallocating Matrix \n");
-        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
-        realloc->Htop = -1;
-
-        Deallocate_Matrix( &workspace->L );
-        Deallocate_Matrix( &workspace->U );
-    }
-
-    if( realloc->hbonds > 0 ){
-        fprintf (stderr, " Reallocating hbonds \n");
-        Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-                (*lists)+HBONDS );
-        realloc->hbonds = -1;
-    }
-
-    num_bonds = est_3body = -1;
-    if( realloc->bonds > 0 ){
-        fprintf (stderr, " Reallocating bonds \n");
-        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
-        realloc->bonds = -1;
-        realloc->num_3body = MAX( realloc->num_3body, est_3body );
-    }
-
-    if( realloc->num_3body > 0 ) {
-        fprintf (stderr, " Reallocating 3Body \n");
-        Delete_List( (*lists)+THREE_BODIES, TYP_HOST );
-
-        if( num_bonds == -1 )
-            num_bonds = ((*lists)+BONDS)->num_intrs;
-        realloc->num_3body *= SAFE_ZONE;
-
-        if( !Make_List( num_bonds, realloc->num_3body,
-                    TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST ) ) {
-            fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-            exit( INIT_ERR );
-        }
-        realloc->num_3body = -1;
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "reallocating 3 bodies\n" );
-        fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
-        fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
-        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
-                realloc->num_3body*sizeof(three_body_interaction_data)/
-                (1024*1024) );
-#endif
-    }
-
-    if( realloc->gcell_atoms > -1 ){
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
-#endif
-
-        free (g->atoms);
-        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
-                sizeof (int) * workspace->realloc.gcell_atoms);
-        realloc->gcell_atoms = -1;
-    }
-}
-
 void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *lists, 
         int nbr_flag, int step )
 {
@@ -724,3 +424,68 @@ void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *list
         realloc->gcell_atoms = -1;
     }
 }
+
+
+GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N )
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (index >= N) return;
+
+    if( h_index[index] == 0 ){
+        Set_Start_Index( 0, 0, &hbonds ); 
+        Set_End_Index( 0, 0, &hbonds ); 
+    }
+    else if( h_index[index] > 0 ){
+        Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); 
+        Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); 
+    }
+}
+
+
+GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N )
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (index >= N) return;
+
+    if( index == 0 ){
+        Set_Start_Index( 0, 0, &bonds ); 
+        Set_End_Index( 0, 0, &bonds ); 
+    }
+    else if( index > 0 ){
+        Set_Start_Index( index, b_top[index-1], &bonds ); 
+        Set_End_Index( index, b_top[index-1], &bonds ); 
+    }
+}
+
+
+void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n)
+{
+    extern __shared__ int sh_input[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = SQR (Num_Entries( i, &bonds ) );
+        bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS );
+    }
+    sh_input[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {   
+            sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
+        }   
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sh_input[0];
+    }
+}
diff --git a/PuReMD-GPU/src/cuda_allocate.h b/PuReMD-GPU/src/cuda_allocate.h
new file mode 100644
index 0000000000000000000000000000000000000000..1de435a0f70a3cca0f3a39d63d3ddccf685774a1
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_allocate.h
@@ -0,0 +1,31 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_ALLOCATE_H_
+#define __CUDA_ALLOCATE_H_
+
+#include "mytypes.h"
+
+int Cuda_Allocate_Matrix( sparse_matrix*, int, int );
+int Cuda_Allocate_HBond_List( int, int, int*, int*, list* );
+int Cuda_Allocate_Bond_List( int, int*, list* );
+void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int );
+
+#endif
diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h
index adbf73ce456238256c7b76be1545fb69051a9baa..292d561311a455287f9c107e4cef61a6ea61fa45 100644
--- a/PuReMD-GPU/src/cuda_helpers.h
+++ b/PuReMD-GPU/src/cuda_helpers.h
@@ -25,7 +25,7 @@
 #include "mytypes.h"
 
 
-DEVICE static inline int cuda_strcmp(char *a, char *b, int len)
+DEVICE inline int cuda_strcmp(char *a, char *b, int len)
 {
     char *src, *dst;
 
@@ -52,8 +52,7 @@ DEVICE static inline int cuda_strcmp(char *a, char *b, int len)
 }
 
 
-#if __CUDA_ARCH__ < 600
-DEVICE static inline real atomicAdd(double* address, double val)
+DEVICE inline real myAtomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull =
         (unsigned long long int*)address;
@@ -67,22 +66,21 @@ DEVICE static inline real atomicAdd(double* address, double val)
     while (assumed != old);
     return __longlong_as_double(old);
 }
-#endif
 
 
-DEVICE static inline void atomic_rvecAdd( rvec ret, rvec v )
+DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
 {
-    atomicAdd( (double*)&ret[0], (double)v[0] );
-    atomicAdd( (double*)&ret[1], (double)v[1] );
-    atomicAdd( (double*)&ret[2], (double)v[2] );
+    MYATOMICADD( (double*)&ret[0], (double)v[0] );
+    MYATOMICADD( (double*)&ret[1], (double)v[1] );
+    MYATOMICADD( (double*)&ret[2], (double)v[2] );
 }
 
 
-DEVICE static inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
+DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
 {
-    atomicAdd( (double*)&ret[0], (double)(c * v[0]) );
-    atomicAdd( (double*)&ret[1], (double)(c * v[1]) );
-    atomicAdd( (double*)&ret[2], (double)(c * v[2]) );
+    MYATOMICADD( (double*)&ret[0], (double)(c * v[0]) );
+    MYATOMICADD( (double*)&ret[1], (double)(c * v[1]) );
+    MYATOMICADD( (double*)&ret[2], (double)(c * v[2]) );
 }
 
 
diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/cuda_system_props.cu
similarity index 56%
rename from PuReMD-GPU/src/system_props.cu
rename to PuReMD-GPU/src/cuda_system_props.cu
index e0bb2626e86f0e637bd15dba8dbaf8d86ba26116..dfbc58920afa60ebf19ab3ef979206ae19844843 100644
--- a/PuReMD-GPU/src/system_props.cu
+++ b/PuReMD-GPU/src/cuda_system_props.cu
@@ -18,57 +18,21 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "system_props.h"
+#include "cuda_system_props.h"
+
 #include "box.h"
 #include "vector.h"
 
-#include "cuda_utils.h"
+#include "center_mass.h"
 #include "cuda_copy.h"
+#include "cuda_utils.h"
 #include "reduction.h"
-#include "center_mass.h"
-#include "validation.h"
-
-
-real Get_Time( )
-{
-    struct timeval tim;
-
-    gettimeofday(&tim, NULL );
-    return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
-}
-
-
-real Get_Timing_Info( real t_start )
-{
-    struct timeval tim;
-    real t_end;
-
-    gettimeofday(&tim, NULL );
-    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
-    return (t_end - t_start);
-}
 
 
-void Temperature_Control( control_params *control, simulation_data *data, 
-        output_controls *out_control )
-{
-    real tmp;
-
-    if( control->T_mode == 1 ) { // step-wise temperature control
-        if( (data->step - data->prev_steps) % 
-                ((int)(control->T_freq / control->dt)) == 0 ) {
-            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
-                control->T += control->T_rate;
-            else control->T = control->T_final;     
-        }
-    }
-    else if( control->T_mode == 2 ) { // constant slope control
-        tmp = control->T_rate * control->dt / control->T_freq;
+GLOBAL void k_Compute_Total_Mass(single_body_parameters *, reax_atom *, real *, size_t );
+GLOBAL void k_Compute_Kinetic_Energy(single_body_parameters *, reax_atom *, unsigned int , real *);
+GLOBAL void k_Kinetic_Energy_Reduction(simulation_data *, real *, int);
 
-        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
-            control->T += tmp;       
-    }
-}
 
 void prep_dev_system (reax_system *system) 
 {
@@ -77,21 +41,6 @@ void prep_dev_system (reax_system *system)
 }
 
 
-void Compute_Total_Mass( reax_system *system, simulation_data *data )
-{
-    int i;
-    int blocks;
-    int block_size;
-    real    *partial_sums = 0;
-
-    data->M = 0;
-
-    for( i = 0; i < system->N; i++ ) 
-        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
-
-    data->inv_M = 1. / data->M;    
-}
-
 void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
 {
     real    *partial_sums = (real *) scratch;
@@ -133,158 +82,6 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
 }
 
 
-GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
-{
-    extern __shared__ real sdata[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0; 
-
-    if(i < n) 
-        x = sbp [ atoms[ i ].type ].mass;
-
-    sdata[threadIdx.x] = x; 
-    __syncthreads();
-
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) 
-    {  
-        if(threadIdx.x < offset)
-        {  
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }  
-        __syncthreads();
-    }  
-
-    if(threadIdx.x == 0) 
-    {  
-        per_block_results[blockIdx.x] = sdata[0];
-    }
-}
-
-
-void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
-        FILE *fout )
-{
-    int i;
-    real m, xx, xy, xz, yy, yz, zz, det;
-    rvec tvec, diff;
-    rtensor mat, inv;
-
-    int blocks;
-    int block_size;
-    rvec *l_xcm, *l_vcm, *l_amcm;
-    real t_start, t_end;
-
-    rvec_MakeZero( data->xcm );  // position of CoM
-    rvec_MakeZero( data->vcm );  // velocity of CoM
-    rvec_MakeZero( data->amcm ); // angular momentum of CoM
-    rvec_MakeZero( data->avcm ); // angular velocity of CoM
-
-    /* Compute the position, velocity and angular momentum about the CoM */
-    for( i = 0; i < system->N; ++i ) {
-        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
-
-        rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
-        rvec_ScaledAdd( data->vcm, m, system->atoms[i].v );
-
-        rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
-        rvec_ScaledAdd( data->amcm, m, tvec );
-
-        /*fprintf( fout,"%3d  %g %g %g\n",
-          i+1, 
-          system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
-          fprintf( fout, "vcm:  %g %g %g\n", 
-          data->vcm[0], data->vcm[1], data->vcm[2] );  
-         */
-    }
-
-    rvec_Scale( data->xcm, data->inv_M, data->xcm );
-    rvec_Scale( data->vcm, data->inv_M, data->vcm );
-
-    rvec_Cross( tvec, data->xcm, data->vcm );
-    rvec_ScaledAdd( data->amcm, -data->M, tvec );
-
-    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-
-    /* Calculate and then invert the inertial tensor */
-    xx = xy = xz = yy = yz = zz = 0;
-
-    for( i = 0; i < system->N; ++i ) {
-        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
-
-        rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
-        xx += diff[0] * diff[0] * m;
-        xy += diff[0] * diff[1] * m;
-        xz += diff[0] * diff[2] * m;
-        yy += diff[1] * diff[1] * m;
-        yz += diff[1] * diff[2] * m;
-        zz += diff[2] * diff[2] * m;      
-    }
-
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " xx: %f \n", xx);
-    fprintf (stderr, " xy: %f \n", xy);
-    fprintf (stderr, " xz: %f \n", xz);
-    fprintf (stderr, " yy: %f \n", yy);
-    fprintf (stderr, " yz: %f \n", yz);
-    fprintf (stderr, " zz: %f \n", zz);
-#endif
-
-    mat[0][0] = yy + zz;     
-    mat[0][1] = mat[1][0] = -xy;
-    mat[0][2] = mat[2][0] = -xz;
-    mat[1][1] = xx + zz;
-    mat[2][1] = mat[1][2] = -yz;
-    mat[2][2] = xx + yy;
-
-    /* invert the inertial tensor */
-    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
-            mat[0][1] * mat[1][2] * mat[2][0] + 
-            mat[0][2] * mat[1][0] * mat[2][1] ) -
-        ( mat[0][0] * mat[1][2] * mat[2][1] + 
-          mat[0][1] * mat[1][0] * mat[2][2] + 
-          mat[0][2] * mat[1][1] * mat[2][0] );
-
-    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-
-    if( fabs(det) > ALMOST_ZERO )
-        rtensor_Scale( inv, 1./det, inv );
-    else 
-        rtensor_MakeZero( inv );
-
-    /* Compute the angular velocity about the centre of mass */
-    rtensor_MatVec( data->avcm, inv, data->amcm );  
-    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-
-#if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
-            data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
-            data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
-            data->amcm[0], data->amcm[1], data->amcm[2] );
-    /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2], 
-       mat[1][0], mat[1][1], mat[1][2], 
-       mat[2][0], mat[2][1], mat[2][2] );
-       fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2], 
-       inv[1][0], inv[1][1], inv[1][2], 
-       inv[2][0], inv[2][1], inv[2][2] );
-       fflush( fout ); */
-    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
-            data->avcm[0], data->avcm[1], data->avcm[2] );
-#endif
-}
-
-
 void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
         FILE *fout )
 {
@@ -489,34 +286,51 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 }
 
 
-
-void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
+void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data)
 {
-    int i;
-    rvec p;
-    real m;
+    real *results = (real *) scratch;
+    cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
+    k_Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+        (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results);
+    cudaThreadSynchronize (); 
+    cudaCheckError ();
 
-    data->E_Kin = 0.0;
+    k_Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+        ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2);
+    cudaThreadSynchronize (); 
+    cudaCheckError ();
+}
 
-    for (i=0; i < system->N; i++) {
-        m = system->reaxprm.sbp[system->atoms[i].type].mass;
 
-        rvec_Scale( p, m, system->atoms[i].v );
-        data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v );
+GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
+{
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0; 
 
-        /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n",
-           i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-           system->reaxprm.sbp[system->atoms[i].type].mass); */
-    }
+    if(i < n) 
+        x = sbp [ atoms[ i ].type ].mass;
 
-    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+    sdata[threadIdx.x] = x; 
+    __syncthreads();
 
-    if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
-        data->therm.T = ALMOST_ZERO;
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) 
+    {  
+        if(threadIdx.x < offset)
+        {  
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }  
+        __syncthreads();
+    }  
+
+    if(threadIdx.x == 0) 
+    {  
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 }
 
 
-GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
+GLOBAL void k_Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
         unsigned int N, real *output)
 {
     extern __shared__ real sh_ekin[];
@@ -547,8 +361,8 @@ GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atom
     }
 }
 
-GLOBAL void Kinetic_Energy_Reduction (simulation_data *data,
-        real *input, int n)
+
+GLOBAL void k_Kinetic_Energy_Reduction (simulation_data *data, real *input, int n)
 {
     extern __shared__ real sdata[];
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -582,20 +396,6 @@ GLOBAL void Kinetic_Energy_Reduction (simulation_data *data,
     }
 }
 
-void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data)
-{
-    real *results = (real *) scratch;
-    cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
-    Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-        (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results);
-    cudaThreadSynchronize (); 
-    cudaCheckError ();
-
-    Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
-        ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2);
-    cudaThreadSynchronize (); 
-    cudaCheckError ();
-}
 
 /*
    GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
@@ -660,117 +460,3 @@ data->therm.T = ALMOST_ZERO;
  */
 
 
-/* IMPORTANT: This function assumes that current kinetic energy and 
- *  the center of mass of the system is already computed before. 
- *
- * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
- *  to be added when there are long-range interactions or long-range 
- *  corrections to short-range interactions present.
- *  We may want to add that for more accuracy. 
- */
-void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
-        simulation_data* data, 
-        output_controls *out_control )
-{
-    int i;
-    reax_atom *p_atom;
-    rvec tx;
-    rvec tmp;
-    simulation_box *box = &(system->box);
-
-    /* Calculate internal pressure */
-    rvec_MakeZero( data->int_press );
-
-    // 0: both int and ext, 1: ext only, 2: int only
-    if( control->press_mode == 0 || control->press_mode == 2 ) {
-        for( i = 0; i < system->N; ++i ) {
-            p_atom = &( system->atoms[i] );
-
-            /* transform x into unitbox coordinates */
-            Transform_to_UnitBox( p_atom->x, box, 1, tx );
-
-            /* this atom's contribution to internal pressure */
-            rvec_Multiply( tmp, p_atom->f, tx );
-            rvec_Add( data->int_press, tmp );
-
-            if( out_control->debug_level > 0 ) {
-                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
-                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
-                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
-                        data->int_press[0],data->int_press[1],data->int_press[2]);
-            }
-        }
-    }
-
-    /* kinetic contribution */
-    data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
-
-    /* Calculate total pressure in each direction */  
-    data->tot_press[0] = data->kin_press - 
-        ((data->int_press[0] + data->ext_press[0]) /
-         (box->box_norms[1] * box->box_norms[2] * P_CONV));
-
-    data->tot_press[1] = data->kin_press - 
-        ((data->int_press[1] + data->ext_press[1])/
-         (box->box_norms[0] * box->box_norms[2] * P_CONV));
-
-    data->tot_press[2] = data->kin_press - 
-        ((data->int_press[2] + data->ext_press[2])/
-         (box->box_norms[0] * box->box_norms[1] * P_CONV));
-
-    /* Average pressure for the whole box */
-    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
-}
-
-
-void Compute_Pressure_Isotropic_Klein( reax_system* system, 
-        simulation_data* data )
-{
-    int i;
-    reax_atom *p_atom;
-    rvec dx;
-
-    // IMPORTANT: This function assumes that current kinetic energy and 
-    // the center of mass of the system is already computed before.
-    data->iso_bar.P = 2.0 * data->E_Kin;
-
-    for( i = 0; i < system->N; ++i )
-    {
-        p_atom = &( system->atoms[i] );
-        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
-        data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
-    }
-
-    data->iso_bar.P /= (3.0 * system->box.volume);
-
-    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
-    // to be added when there are long-range interactions or long-range 
-    // corrections to short-range interactions present.
-    // We may want to add that for more accuracy.
-}
-
-
-void Compute_Pressure( reax_system* system, simulation_data* data, 
-        static_storage *workspace )
-{
-    int i;
-    reax_atom *p_atom;
-    rtensor temp;
-
-    rtensor_MakeZero( data->flex_bar.P );
-
-    for( i = 0; i < system->N; ++i ) {
-        p_atom = &( system->atoms[i] );
-        // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
-        rvec_OuterProduct( temp, p_atom->v, p_atom->v );
-        rtensor_ScaledAdd( data->flex_bar.P, 
-                system->reaxprm.sbp[ p_atom->type ].mass, temp );
-        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
-        rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
-    }
-
-    rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P );
-    data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0;
-}
diff --git a/PuReMD-GPU/src/cuda_system_props.h b/PuReMD-GPU/src/cuda_system_props.h
new file mode 100644
index 0000000000000000000000000000000000000000..28aea0136a54342722a3607cab4ce562a6176a40
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_system_props.h
@@ -0,0 +1,32 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_SYSTEM_PROP_H_
+#define __CUDA_SYSTEM_PROP_H_
+
+#include "mytypes.h"
+
+void prep_dev_system (reax_system *system);
+
+void Cuda_Compute_Total_Mass( reax_system*, simulation_data* );
+void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* );
+void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* );
+
+#endif
diff --git a/PuReMD-GPU/src/forces.cu b/PuReMD-GPU/src/forces.cu
index e8e1e2917b34eefa913f25cab67b1123d08f892e..a2ad8ea7cf10222b1f0e4627ee287f701fa20a1d 100644
--- a/PuReMD-GPU/src/forces.cu
+++ b/PuReMD-GPU/src/forces.cu
@@ -36,7 +36,6 @@
 #include "cuda_init.h"
 #include "reduction.h"
 //#include "matrix.h"
-
 #include "validation.h"
 
 #include "cudaProfiler.h"
@@ -2634,7 +2633,7 @@ GLOBAL void Estimate_Storage_Sizes     (reax_atom *atoms,
 
         if( nbr_pj->d <= control->r_cut ) {
             //++(*Htop);
-            atomicAdd (Htop, 1);
+            atomicAdd(Htop, 1);
 
             /* hydrogen bond lists */ 
             //TODO - CHANGE ORIGINAL
@@ -2643,11 +2642,11 @@ GLOBAL void Estimate_Storage_Sizes     (reax_atom *atoms,
                 jhb = sbp_j->p_hbond;
                 if( ihb == 1 && jhb == 2 )
                     //++hb_top[i];
-                    atomicAdd (&hb_top[i], 1);
+                    atomicAdd(&hb_top[i], 1);
                 else if( ihb == 2 && jhb == 1 )
                     //++hb_top[j];
-                    //atomicAdd (&hb_top[j], 1);
-                    atomicAdd (&hb_top[i], 1);
+                    //atomicAdd(&hb_top[j], 1);
+                    atomicAdd(&hb_top[i], 1);
             }
             //TODO -- CHANGE ORIGINAL
 
@@ -2685,8 +2684,8 @@ GLOBAL void Estimate_Storage_Sizes     (reax_atom *atoms,
                 if( BO >= control->bo_cut ) {
                     //++bond_top[i];
                     //++bond_top[j];
-                    atomicAdd (&bond_top[i], 1);
-                    atomicAdd (&bond_top[j], 1);
+                    atomicAdd(&bond_top[i], 1);
+                    atomicAdd(&bond_top[j], 1);
                 }
             }
         }
diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/four_body_interactions.cu
index af134bab14c0f1b5c9dbd1a51d0a98f4d285a93a..35a907f8018a8c17ba3de613c6d18c15a2f42a10 100644
--- a/PuReMD-GPU/src/four_body_interactions.cu
+++ b/PuReMD-GPU/src/four_body_interactions.cu
@@ -889,7 +889,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
 
                                 //PERFORMANCE IMPACT
                                 e_tor = fn10 * sin_ijk * sin_jkl * CV;
-                                //atomicAdd (&data->E_Tor ,e_tor );
+                                //MYATOMICADD(&data->E_Tor ,e_tor );
                                 E_Tor [j] += e_tor;
                                 //sh_tor [threadIdx.x] += e_tor;
 
@@ -933,7 +933,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
                                 //PERFORMANCE IMPACT
                                 e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-                                //atomicAdd (&data->E_Con ,e_con );
+                                //MYATOMICADD(&data->E_Con ,e_con );
                                 E_Con [j] += e_con ;
                                 //sh_con [threadIdx.x] += e_con;
 
@@ -971,12 +971,12 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 /* forces */
                                 //PERFORMANCE IMPACT
                                 /*
-                                   atomicAdd ( &bo_jk->Cdbopi, CEtors2 );
-                                   atomicAdd ( &workspace->CdDelta[j], CEtors3 );
-                                   atomicAdd ( &workspace->CdDelta[k], CEtors3 );
-                                   atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) );
-                                   atomicAdd ( &bo_jk->Cdbo, (CEtors5 + CEconj2) );
-                                   atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) );
+                                   MYATOMICADD( &bo_jk->Cdbopi, CEtors2 );
+                                   MYATOMICADD( &workspace->CdDelta[j], CEtors3 );
+                                   MYATOMICADD( &workspace->CdDelta[k], CEtors3 );
+                                   MYATOMICADD( &bo_ij->Cdbo, (CEtors4 + CEconj1) );
+                                   MYATOMICADD( &bo_jk->Cdbo, (CEtors5 + CEconj2) );
+                                   MYATOMICADD( &bo_kl->Cdbo, (CEtors6 + CEconj3) );
                                  */
 
                                 //PERFORMANCE IMPACT
@@ -987,7 +987,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 bo_jk->Cdbo += CEtors5 + CEconj2;
 
                                 //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
-                                atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 );
+                                MYATOMICADD(&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 );
                                 //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
 
                                 if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) {
diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu
index 87b84a76f2ac7ed95b6cec45ea3c319d8033fcb6..92691bbb58a9468d50027dcf7f93d47653f86ef0 100644
--- a/PuReMD-GPU/src/init_md.cu
+++ b/PuReMD-GPU/src/init_md.cu
@@ -34,17 +34,20 @@
 #include "traj.h"
 #include "vector.h"
 
-
 #include "cuda_init.h"
 #include "cuda_copy.h"
 #include "cuda_utils.h"
 #include "helpers.h"
 #include "reduction.h"
 
-#include     "index_utils.h"
+#include "cuda_allocate.h"
+#include "cuda_system_props.h"
+
+#include "index_utils.h"
 
 #include "validation.h"
 
+
 void Generate_Initial_Velocities(reax_system *system, real T )
 {
     int i;
@@ -967,7 +970,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
             fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
 #endif
 
-            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS), TYP_HOST ) {
+            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS, TYP_HOST ) ) {
                 fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
                 exit( INIT_ERR );
             }
@@ -1036,7 +1039,7 @@ void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *
 #endif
 
             /* 3bodies list */
-            if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES), TYP_HOST) {
+            if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES, TYP_HOST )) {
                 fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
                 exit( INIT_ERR );
             }
diff --git a/PuReMD-GPU/src/integrate.cu b/PuReMD-GPU/src/integrate.cu
index d079028653d79dfeb5117fa6d95f33b700dee5b1..c54603b56b777ef38cfdcf9d0f71f52cf633befa 100644
--- a/PuReMD-GPU/src/integrate.cu
+++ b/PuReMD-GPU/src/integrate.cu
@@ -36,6 +36,9 @@
 #include "reduction.h"
 #include "validation.h"
 
+#include "cuda_allocate.h"
+#include "cuda_system_props.h"
+
 
 void Velocity_Verlet_NVE(reax_system* system, control_params* control, 
         simulation_data *data, static_storage *workspace, 
diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h
index 91b9438d7c8938c79f8979e2a4080d4720266790..eacde6b90e20a601e0b52f26620617629f2a3ffd 100644
--- a/PuReMD-GPU/src/mytypes.h
+++ b/PuReMD-GPU/src/mytypes.h
@@ -31,6 +31,11 @@
     #include <cuda.h>
     #include <cublas_v2.h>
     #include <cusparse_v2.h>
+    #if __CUDA_ARCH__ < 600
+      #define MYATOMICADD myAtomicAdd
+    #else
+      #define MYATOMICADD atomicAdd
+    #endif
   #endif
 #else
   #ifndef __MYTYPES_H_
diff --git a/PuReMD-GPU/src/neighbors.cu b/PuReMD-GPU/src/neighbors.cu
index 90779538353a05eca7bb04215ebd33b3ffd81e35..0f902de6783fe8d8d57956343485e9ee0f6f0c7a 100644
--- a/PuReMD-GPU/src/neighbors.cu
+++ b/PuReMD-GPU/src/neighbors.cu
@@ -782,7 +782,7 @@ int Estimate_NumNeighbors( reax_system *system, control_params *control,
                     }
 
                     //__syncthreads ();
-                    //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1);
+                    //MYATOMICADD( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1);
                     //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ;
 
                     if (nbrgen)
diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h
index f7edb397293676c6e5a7a7d34ba5d12b8f3dab4a..f5685b8b435a9bf0378c3c352162a915896460fd 100644
--- a/PuReMD-GPU/src/random.h
+++ b/PuReMD-GPU/src/random.h
@@ -23,10 +23,6 @@
 
 #include "mytypes.h"
 
-HOST_DEVICE inline double Random(double);
-HOST_DEVICE inline void Randomize();
-HOST_DEVICE inline double GRandom(double , double );
-
 
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
@@ -37,6 +33,7 @@ HOST_DEVICE inline double Random(double range)
     return (random() * range) / 2147483647L;
 }
 
+
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
@@ -45,6 +42,7 @@ HOST_DEVICE inline void Randomize()
     srandom(time(NULL));
 }
 
+
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
 HOST_DEVICE inline double GRandom(double mean, double sigma)
diff --git a/PuReMD-GPU/src/single_body_interactions.cu b/PuReMD-GPU/src/single_body_interactions.cu
index 3c6c08822fb056c18c78bf4db5f7aa334496ea34..9b8438a6437e6f3fb303aed57913cc77ef8e8f5a 100644
--- a/PuReMD-GPU/src/single_body_interactions.cu
+++ b/PuReMD-GPU/src/single_body_interactions.cu
@@ -374,7 +374,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
 
         //PERFORMANCE IMPACT
-        atomicAdd (&data->E_Lp, e_lp);
+        MYATOMICADD(&data->E_Lp, e_lp);
 
         dElp = p_lp2 * inv_expvd2 + 
             75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
@@ -382,7 +382,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
         //PERFORMANCE IMPACT
         //workspace->CdDelta[i] += CElp;      // lp - 1st term
-        atomicAdd (&workspace->CdDelta[i], CElp);
+        MYATOMICADD(&workspace->CdDelta[i], CElp);
 
 
 #ifdef TEST_ENERGY
@@ -416,7 +416,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
                             //PERFORMANCE IMPACT
                             e_lph = p_lp3 * SQR(vov3-3.0);
-                            atomicAdd (&data->E_Lp, e_lph );
+                            MYATOMICADD(&data->E_Lp, e_lph );
                             //estrain(i) += e_lph;
 
                             deahu2dbo = 2.*p_lp3*(vov3 - 3.);
@@ -426,7 +426,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
 
                             //PERFORMANCE IMPACT
-                            atomicAdd (&workspace->CdDelta[i], deahu2dsbo);
+                            MYATOMICADD(&workspace->CdDelta[i], deahu2dsbo);
 #ifdef TEST_ENERGY
                             //TODO
                             //fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
@@ -500,7 +500,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     //PERFORMANCE IMPACT
     //data->E_Ov += e_ov = sum_ovun1 * CEover1;
     e_ov = sum_ovun1 * CEover1;
-    atomicAdd (&data->E_Ov, e_ov ); 
+    MYATOMICADD(&data->E_Ov, e_ov ); 
 
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
         ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
@@ -523,7 +523,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
     //PERFORMANCE IMPACT
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
-    atomicAdd (&data->E_Un, e_un );
+    MYATOMICADD(&data->E_Un, e_un );
 
     CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
             p_ovun2 * e_un * exp_ovun2n);
@@ -537,8 +537,8 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
     // forces 
     //PERFORMANCE IMPACT
-    atomicAdd (&workspace->CdDelta[i] , CEover3);   // OvCoor - 2nd term
-    atomicAdd (&workspace->CdDelta[i], CEunder3);  // UnCoor - 1st term
+    MYATOMICADD(&workspace->CdDelta[i] , CEover3);   // OvCoor - 2nd term
+    MYATOMICADD(&workspace->CdDelta[i], CEunder3);  // UnCoor - 1st term
 
 #ifdef TEST_FORCES
     //TODO
@@ -559,7 +559,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
 
         //PERFORMANCE IMPACT
-        atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a
+        MYATOMICADD(&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a
 
         bo_ij->Cdbopi += CEover4 * 
             (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
@@ -568,7 +568,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
 
         //PERFORMANCE IMPACT
-        atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) );   // UnCoor - 2a
+        MYATOMICADD(&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) );   // UnCoor - 2a
 
         bo_ij->Cdbopi += CEunder4 * 
             (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
@@ -705,7 +705,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
     // calculate the energy 
     e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-    //atomicAdd (&data->E_Lp, e_lp );
+    //MYATOMICADD(&data->E_Lp, e_lp );
     E_Lp [ i ] = e_lp;
 
     dElp = p_lp2 * inv_expvd2 + 
@@ -732,7 +732,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
     e_lph = p_lp3 * SQR(vov3-3.0);
     E_Lp [i] += e_lph;
-    //atomicAdd (&data->E_Lp, e_lph );
+    //MYATOMICADD(&data->E_Lp, e_lph );
     //estrain(i) += e_lph;
 
     deahu2dbo = 2.*p_lp3*(vov3 - 3.);
@@ -790,7 +790,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
     e_ov = sum_ovun1 * CEover1;
     E_Ov [ i ] = e_ov;
-    //atomicAdd ( &data->E_Ov, e_ov );
+    //MYATOMICADD( &data->E_Ov, e_ov );
 
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
         ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
@@ -813,7 +813,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
     E_Un [i] = e_un;
-    //atomicAdd ( &data->E_Un, e_un );
+    //MYATOMICADD( &data->E_Un, e_un );
 
     CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
             p_ovun2 * e_un * exp_ovun2n);
@@ -903,7 +903,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g
 
     // calculate the energy 
     e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-    //atomicAdd (&data->E_Lp, e_lp );
+    //MYATOMICADD(&data->E_Lp, e_lp );
     E_Lp [ i ] = e_lp;
 
     dElp = p_lp2 * inv_expvd2 + 
@@ -930,7 +930,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g
 
                         e_lph = p_lp3 * SQR(vov3-3.0);
                         E_Lp [i] += e_lph;
-                        //atomicAdd (&data->E_Lp, e_lph );
+                        //MYATOMICADD(&data->E_Lp, e_lph );
                         //estrain(i) += e_lph;
 
                         deahu2dbo = 2.*p_lp3*(vov3 - 3.);
diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c
new file mode 100644
index 0000000000000000000000000000000000000000..491b0d93f22aab1cdda29311643434f42694398a
--- /dev/null
+++ b/PuReMD-GPU/src/system_props.c
@@ -0,0 +1,348 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "system_props.h"
+
+#include "box.h"
+#include "vector.h"
+
+
+real Get_Time( )
+{
+    struct timeval tim;
+
+    gettimeofday(&tim, NULL );
+    return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
+}
+
+
+real Get_Timing_Info( real t_start )
+{
+    struct timeval tim;
+    real t_end;
+
+    gettimeofday(&tim, NULL );
+    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
+    return (t_end - t_start);
+}
+
+
+void Temperature_Control( control_params *control, simulation_data *data, 
+        output_controls *out_control )
+{
+    real tmp;
+
+    if( control->T_mode == 1 ) { // step-wise temperature control
+        if( (data->step - data->prev_steps) % 
+                ((int)(control->T_freq / control->dt)) == 0 ) {
+            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+                control->T += control->T_rate;
+            else control->T = control->T_final;     
+        }
+    }
+    else if( control->T_mode == 2 ) { // constant slope control
+        tmp = control->T_rate * control->dt / control->T_freq;
+
+        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+            control->T += tmp;       
+    }
+}
+
+
+void Compute_Total_Mass( reax_system *system, simulation_data *data )
+{
+    int i;
+    int blocks;
+    int block_size;
+    real    *partial_sums = 0;
+
+    data->M = 0;
+
+    for( i = 0; i < system->N; i++ ) 
+        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
+
+    data->inv_M = 1. / data->M;    
+}
+
+
+void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
+        FILE *fout )
+{
+    int i;
+    real m, xx, xy, xz, yy, yz, zz, det;
+    rvec tvec, diff;
+    rtensor mat, inv;
+
+    int blocks;
+    int block_size;
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    real t_start, t_end;
+
+    rvec_MakeZero( data->xcm );  // position of CoM
+    rvec_MakeZero( data->vcm );  // velocity of CoM
+    rvec_MakeZero( data->amcm ); // angular momentum of CoM
+    rvec_MakeZero( data->avcm ); // angular velocity of CoM
+
+    /* Compute the position, velocity and angular momentum about the CoM */
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+
+        rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
+        rvec_ScaledAdd( data->vcm, m, system->atoms[i].v );
+
+        rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
+        rvec_ScaledAdd( data->amcm, m, tvec );
+
+        /*fprintf( fout,"%3d  %g %g %g\n",
+          i+1, 
+          system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
+          fprintf( fout, "vcm:  %g %g %g\n", 
+          data->vcm[0], data->vcm[1], data->vcm[2] );  
+         */
+    }
+
+    rvec_Scale( data->xcm, data->inv_M, data->xcm );
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec );
+
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+
+    /* Calculate and then invert the inertial tensor */
+    xx = xy = xz = yy = yz = zz = 0;
+
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+
+        rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
+        xx += diff[0] * diff[0] * m;
+        xy += diff[0] * diff[1] * m;
+        xz += diff[0] * diff[2] * m;
+        yy += diff[1] * diff[1] * m;
+        yz += diff[1] * diff[2] * m;
+        zz += diff[2] * diff[2] * m;      
+    }
+
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " xx: %f \n", xx);
+    fprintf (stderr, " xy: %f \n", xy);
+    fprintf (stderr, " xz: %f \n", xz);
+    fprintf (stderr, " yy: %f \n", yy);
+    fprintf (stderr, " yz: %f \n", yz);
+    fprintf (stderr, " zz: %f \n", zz);
+#endif
+
+    mat[0][0] = yy + zz;     
+    mat[0][1] = mat[1][0] = -xy;
+    mat[0][2] = mat[2][0] = -xz;
+    mat[1][1] = xx + zz;
+    mat[2][1] = mat[1][2] = -yz;
+    mat[2][2] = xx + yy;
+
+    /* invert the inertial tensor */
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
+            mat[0][1] * mat[1][2] * mat[2][0] + 
+            mat[0][2] * mat[1][0] * mat[2][1] ) -
+        ( mat[0][0] * mat[1][2] * mat[2][1] + 
+          mat[0][1] * mat[1][0] * mat[2][2] + 
+          mat[0][2] * mat[1][1] * mat[2][0] );
+
+    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+    if( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1./det, inv );
+    else 
+        rtensor_MakeZero( inv );
+
+    /* Compute the angular velocity about the centre of mass */
+    rtensor_MatVec( data->avcm, inv, data->amcm );  
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+
+#if defined(DEBUG)
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
+            data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
+            data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
+            data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2], 
+       mat[1][0], mat[1][1], mat[1][2], 
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2], 
+       inv[1][0], inv[1][1], inv[1][2], 
+       inv[2][0], inv[2][1], inv[2][2] );
+       fflush( fout ); */
+    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
+            data->avcm[0], data->avcm[1], data->avcm[2] );
+#endif
+}
+
+
+void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
+{
+    int i;
+    rvec p;
+    real m;
+
+    data->E_Kin = 0.0;
+
+    for (i=0; i < system->N; i++) {
+        m = system->reaxprm.sbp[system->atoms[i].type].mass;
+
+        rvec_Scale( p, m, system->atoms[i].v );
+        data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v );
+
+        /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n",
+           i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
+           system->reaxprm.sbp[system->atoms[i].type].mass); */
+    }
+
+    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+
+    if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
+        data->therm.T = ALMOST_ZERO;
+}
+
+
+/* IMPORTANT: This function assumes that current kinetic energy and 
+ *  the center of mass of the system is already computed before. 
+ *
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
+ *  to be added when there are long-range interactions or long-range 
+ *  corrections to short-range interactions present.
+ *  We may want to add that for more accuracy. 
+ */
+void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
+        simulation_data* data, 
+        output_controls *out_control )
+{
+    int i;
+    reax_atom *p_atom;
+    rvec tx;
+    rvec tmp;
+    simulation_box *box = &(system->box);
+
+    /* Calculate internal pressure */
+    rvec_MakeZero( data->int_press );
+
+    // 0: both int and ext, 1: ext only, 2: int only
+    if( control->press_mode == 0 || control->press_mode == 2 ) {
+        for( i = 0; i < system->N; ++i ) {
+            p_atom = &( system->atoms[i] );
+
+            /* transform x into unitbox coordinates */
+            Transform_to_UnitBox( p_atom->x, box, 1, tx );
+
+            /* this atom's contribution to internal pressure */
+            rvec_Multiply( tmp, p_atom->f, tx );
+            rvec_Add( data->int_press, tmp );
+
+            if( out_control->debug_level > 0 ) {
+                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
+                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
+                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
+                        data->int_press[0],data->int_press[1],data->int_press[2]);
+            }
+        }
+    }
+
+    /* kinetic contribution */
+    data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
+
+    /* Calculate total pressure in each direction */  
+    data->tot_press[0] = data->kin_press - 
+        ((data->int_press[0] + data->ext_press[0]) /
+         (box->box_norms[1] * box->box_norms[2] * P_CONV));
+
+    data->tot_press[1] = data->kin_press - 
+        ((data->int_press[1] + data->ext_press[1])/
+         (box->box_norms[0] * box->box_norms[2] * P_CONV));
+
+    data->tot_press[2] = data->kin_press - 
+        ((data->int_press[2] + data->ext_press[2])/
+         (box->box_norms[0] * box->box_norms[1] * P_CONV));
+
+    /* Average pressure for the whole box */
+    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
+}
+
+
+void Compute_Pressure_Isotropic_Klein( reax_system* system, 
+        simulation_data* data )
+{
+    int i;
+    reax_atom *p_atom;
+    rvec dx;
+
+    // IMPORTANT: This function assumes that current kinetic energy and 
+    // the center of mass of the system is already computed before.
+    data->iso_bar.P = 2.0 * data->E_Kin;
+
+    for( i = 0; i < system->N; ++i )
+    {
+        p_atom = &( system->atoms[i] );
+        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
+        data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
+    }
+
+    data->iso_bar.P /= (3.0 * system->box.volume);
+
+    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
+    // to be added when there are long-range interactions or long-range 
+    // corrections to short-range interactions present.
+    // We may want to add that for more accuracy.
+}
+
+
+void Compute_Pressure( reax_system* system, simulation_data* data, 
+        static_storage *workspace )
+{
+    int i;
+    reax_atom *p_atom;
+    rtensor temp;
+
+    rtensor_MakeZero( data->flex_bar.P );
+
+    for( i = 0; i < system->N; ++i ) {
+        p_atom = &( system->atoms[i] );
+        // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
+        rvec_OuterProduct( temp, p_atom->v, p_atom->v );
+        rtensor_ScaledAdd( data->flex_bar.P, 
+                system->reaxprm.sbp[ p_atom->type ].mass, temp );
+        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
+        rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
+    }
+
+    rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P );
+    data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0;
+}
diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h
index d152c25403832f5a778104ed114a9b3bcd8d9f73..90f43d763fc2ca57fad32ff99f5538aa08092f4f 100644
--- a/PuReMD-GPU/src/system_props.h
+++ b/PuReMD-GPU/src/system_props.h
@@ -30,21 +30,14 @@ real Get_Timing_Info( real );
 void Temperature_Control( control_params*, simulation_data*, output_controls* );
 
 void Compute_Total_Mass( reax_system*, simulation_data* );
-void Cuda_Compute_Total_Mass( reax_system*, simulation_data* );
 
 void Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* );
-void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* );
 
 void Compute_Kinetic_Energy( reax_system*, simulation_data* );
-void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* );
 
 void Compute_Pressure( reax_system*, simulation_data*, static_storage* );
 
 void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* );
 
-void prep_dev_system (reax_system *system);
-GLOBAL void k_Compute_Total_Mass (single_body_parameters *, reax_atom *, real *, size_t );
-//GLOBAL void Compute_Kinetic_Energy (single_body_parameters *, reax_atom *, unsigned int , simulation_data *, real *);
-
 
 #endif
diff --git a/PuReMD-GPU/src/testmd.cu b/PuReMD-GPU/src/testmd.c
similarity index 99%
rename from PuReMD-GPU/src/testmd.cu
rename to PuReMD-GPU/src/testmd.c
index 93f286cc2c0f99cc4aa9788195881dd7ffa15f9f..f2f27d19ff3158ffd4ea6a74bbb41f176e78d9bb 100644
--- a/PuReMD-GPU/src/testmd.cu
+++ b/PuReMD-GPU/src/testmd.c
@@ -39,6 +39,7 @@
 #include "cuda_copy.h"
 #include "validation.h"
 
+#include "cuda_system_props.h"
 
 
 interaction_function Interaction_Functions[NO_OF_INTERACTIONS];
diff --git a/PuReMD-GPU/src/three_body_interactions.cu b/PuReMD-GPU/src/three_body_interactions.cu
index c2eed63bc52c8e5f3db040d74d9d8d083478ec82..1edd41aeacb6e3a895cb4aa900be66104b743fea 100644
--- a/PuReMD-GPU/src/three_body_interactions.cu
+++ b/PuReMD-GPU/src/three_body_interactions.cu
@@ -871,7 +871,7 @@ where i < k */
 
                             e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
                             //PERFORMANCE IMPACT
-                            //atomicAdd (&data->E_Ang, e_ang);
+                            //MYATOMICADD(&data->E_Ang, e_ang);
                             E_Ang [j] += e_ang;
                             /* END ANGLE ENERGY*/
 
@@ -895,7 +895,7 @@ where i < k */
 
                             e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
                             //PERFORMANCE IMPACT
-                            //atomicAdd (&data->E_Pen, e_pen);
+                            //MYATOMICADD(&data->E_Pen, e_pen);
                             E_Pen [j] += e_pen;
 
 
@@ -921,7 +921,7 @@ where i < k */
                                 EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
 
                             //PERFORMANCE IMPACT
-                            //atomicAdd (&data->E_Coa, e_coa);
+                            //MYATOMICADD(&data->E_Coa, e_coa);
                             E_Coa [j] += e_coa;
 
                             CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
@@ -933,19 +933,19 @@ where i < k */
 
                             /* FORCES */
                             /*
-                               atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) );
-                               atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) );
-                               atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) );
-                               atomicAdd (&workspace->CdDelta[i], CEcoa4 );
-                               atomicAdd (&workspace->CdDelta[k], CEcoa5 );              
+                               MYATOMICADD(&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) );
+                               MYATOMICADD(&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) );
+                               MYATOMICADD(&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) );
+                               MYATOMICADD(&workspace->CdDelta[i], CEcoa4 );
+                               MYATOMICADD(&workspace->CdDelta[k], CEcoa5 );              
                              */
 
                             bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ;
                             bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ;
                             workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ;
-                            //atomicAdd (&workspace->CdDelta[i], CEcoa4 );
+                            //MYATOMICADD(&workspace->CdDelta[i], CEcoa4 );
                             pbond_ij->CdDelta_ij += CEcoa4 ;
-                            //atomicAdd (&workspace->CdDelta[k], CEcoa5 );              
+                            //MYATOMICADD(&workspace->CdDelta[k], CEcoa5 );              
                             pbond_jk->CdDelta_ij += CEcoa5;
 
                             for( t = start_j; t < end_j; ++t ) {
@@ -960,9 +960,9 @@ where i < k */
                                 //    (CEval6 * pBOjt7) );
 
                                 /*
-                                   atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) );
-                                   atomicAdd (&bo_jt->Cdbopi, CEval5 );
-                                   atomicAdd (&bo_jt->Cdbopi2, CEval5 );
+                                   MYATOMICADD(&bo_jt->Cdbo, (CEval6 * pBOjt7) );
+                                   MYATOMICADD(&bo_jt->Cdbopi, CEval5 );
+                                   MYATOMICADD(&bo_jt->Cdbopi2, CEval5 );
                                  */
                                 bo_jt->Cdbo        += (CEval6 * pBOjt7) ;
                                 bo_jt->Cdbopi    += CEval5 ;
@@ -1670,7 +1670,7 @@ GLOBAL void Hydrogen_Bonds (    reax_atom *atoms,
 
                     //PERFORMANCE IMPACT
                     e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                    //atomicAdd ( &data->E_HB, e_hb );
+                    //MYATOMICADD( &data->E_HB, e_hb );
                     //E_HB [j] += e_hb;
                     sh_hb [threadIdx.x] += e_hb;
 
@@ -2050,7 +2050,7 @@ GLOBAL void Hydrogen_Bonds (    reax_atom *atoms,
 
                         //PERFORMANCE IMPACT
                         e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                        //atomicAdd ( &data->E_HB, e_hb );
+                        //MYATOMICADD( &data->E_HB, e_hb );
                         //E_HB [j] += e_hb;
                         sh_hb [threadIdx.x] += e_hb;
 
diff --git a/PuReMD-GPU/src/traj.cu b/PuReMD-GPU/src/traj.c
similarity index 99%
rename from PuReMD-GPU/src/traj.cu
rename to PuReMD-GPU/src/traj.c
index 97496e7f7b8dc4e9add824ee198d0c2907180a98..afe3b1d2c628cbd6f6802b43ee829ba42b057fe0 100644
--- a/PuReMD-GPU/src/traj.cu
+++ b/PuReMD-GPU/src/traj.c
@@ -20,7 +20,10 @@
 
 #include "traj.h"
 #include "list.h"
-#include "cuda_copy.h"
+
+#ifdef __PRINT_CPU_RESULTS__
+  #include "cuda_copy.h"
+#endif
 
 /************************************************/
 /*      CUSTOM FORMAT ROUTINES                  */
@@ -207,7 +210,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     if( write_bonds )
     {
 
-#ifndef __PRINT_CPU_RESULTS__
+#ifdef __PRINT_CPU_RESULTS__
         //fprintf (stderr, "Synching bonds from device for printing ....\n");
         Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
 #endif
@@ -239,7 +242,7 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     num_thb_intrs = 0;
     if( write_angles ) {
 
-#ifndef __PRINT_CPU_RESULTS__
+#ifdef __PRINT_CPU_RESULTS__
         //fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
         Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
         if ( !write_bonds) {
diff --git a/PuReMD-GPU/src/traj.h b/PuReMD-GPU/src/traj.h
index d8c1792d0f6941d6881939d559bd9003b286bfa1..35d92602eee7c2d0b5ee83889623df2cb2106c71 100644
--- a/PuReMD-GPU/src/traj.h
+++ b/PuReMD-GPU/src/traj.h
@@ -22,8 +22,8 @@
 #define __TRAJ_H__
 
 #include "mytypes.h"
-#include "zlib.h"
 
+#include <zlib.h>
 
 #define BLOCK_MARK "REAX_BLOCK_MARK "
 #define BLOCK_MARK_LEN 16
@@ -73,12 +73,14 @@
 #define SIZE_INFO_LINE3 "%-10d %-10d %-10d\n"
 #define SIZE_INFO_LEN3 33
 
+
 enum ATOM_LINE_OPTS {OPT_NOATOM = 0, OPT_ATOM_BASIC = 4, OPT_ATOM_wF = 5,
                      OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7
                     };
 enum BOND_LINE_OPTS {OPT_NOBOND, OPT_BOND_BASIC, OPT_BOND_FULL};
 enum ANGLE_LINE_OPTS {OPT_NOANGLE, OPT_ANGLE_BASIC};
 
+
 struct
 {
     int no_of_sub_blocks;
@@ -89,11 +91,11 @@ struct
 
 typedef struct __block block;
 
+
 int Write_Block( gzFile, block* );
 int Read_Next_Block( gzFile, block*, int* );
 int Skip_Next_Block( gzFile, int*);
 
-
 /*
   Format for trajectory file
 
@@ -141,8 +143,6 @@ int Skip_Next_Block( gzFile, int*);
   No. of torsion entries (int)
   Torsion info lines as per torsion format.
 */
-
-
 int Write_Custom_Header( reax_system*, control_params*,
                          static_storage*, output_controls* );
 int Write_xyz_Header   ( reax_system*, control_params*,
diff --git a/PuReMD-GPU/src/two_body_interactions.cu b/PuReMD-GPU/src/two_body_interactions.cu
index f53b0cfb0fb1c23626032e7a257fce5b1447953e..60cfbd0b90389d54918ef9b009cfac1b9399ce3e 100644
--- a/PuReMD-GPU/src/two_body_interactions.cu
+++ b/PuReMD-GPU/src/two_body_interactions.cu
@@ -221,7 +221,7 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params,
                 -twbp->De_pp * bo_ij->BO_pi2;
 
             //PERFORMANCE IMAPCT
-            //atomicAdd (&data->E_BE, ebond);
+            //MYATOMICADD(&data->E_BE, ebond);
             //TODO
             //E_BE [ i ] += ebond/2.0;
             E_BE [ i ] += ebond;
@@ -274,7 +274,7 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params,
                     //estrain(j2) = estrain(j2) + 0.50*estriph;
 
                     //PERFORMANCE IMPACT
-                    //atomicAdd (&data->E_BE, estriph);
+                    //MYATOMICADD(&data->E_BE, estriph);
                     E_BE [ i] += estriph;
                     //data->E_BE += estriph;
 
@@ -289,7 +289,7 @@ GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params,
 
                     //PERFORMANCE IMAPCT
                     workspace->CdDelta[i] += decobdboua;
-                    //atomicAdd (&workspace->CdDelta[j], decobdboub);
+                    //MYATOMICADD(&workspace->CdDelta[j], decobdboub);
                     //CdDelta [ i * N + i ] += decobdboua;
                     //CdDelta [ i * N + j ] += decobdboua;
                     //workspace->CdDelta [i] += decobdboua;
diff --git a/PuReMD-GPU/src/vector.cu b/PuReMD-GPU/src/vector.c
similarity index 100%
rename from PuReMD-GPU/src/vector.cu
rename to PuReMD-GPU/src/vector.c
diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h
index 336534784e50fd5552aaf27ff0c9531c3b97a02a..ee1375ee0816bcefbf1f335bfdaeccf8fcfed3f3 100644
--- a/PuReMD-GPU/src/vector.h
+++ b/PuReMD-GPU/src/vector.h
@@ -22,8 +22,10 @@
 #define __VECTOR_H_
 
 #include "mytypes.h"
+
 #include "random.h"
 
+
 int  Vector_isZero( real*, int );
 void Vector_MakeZero( real*, int );
 void Vector_Copy( real*, real*, int );
@@ -33,8 +35,6 @@ void Vector_Copy( real*, real*, int );
 void Vector_Print( FILE*, char*, real*, int );
 real Norm( real*, int );
 
-HOST_DEVICE inline real Dot( real*, real*, int );
-
 void rvec_Sum( rvec, rvec, rvec );
 real rvec_ScaledDot( real, rvec, real, rvec );
 void rvec_Multiply( rvec, rvec, rvec );
@@ -44,19 +44,6 @@ void rvec_Invert( rvec, rvec );
 void rvec_OuterProduct( rtensor, rvec, rvec );
 int  rvec_isZero( rvec );
 
-HOST_DEVICE inline real rvec_Dot( rvec, rvec );
-HOST_DEVICE inline void rvec_Scale( rvec, real, rvec );
-HOST_DEVICE inline real rvec_Norm_Sqr( rvec );
-HOST_DEVICE inline void rvec_Random( rvec );
-HOST_DEVICE inline void rvec_MakeZero( rvec );
-HOST_DEVICE inline void rvec_Add( rvec, rvec );
-HOST_DEVICE inline void rvec_Copy( rvec, rvec );
-HOST_DEVICE inline void rvec_Cross( rvec, rvec, rvec );
-HOST_DEVICE inline void rvec_ScaledAdd( rvec, real, rvec );
-HOST_DEVICE inline void rvec_ScaledSum( rvec, real, rvec, real, rvec );
-HOST_DEVICE inline void rvec_iMultiply( rvec, ivec, rvec );
-HOST_DEVICE inline real rvec_Norm( rvec );
-
 void rtensor_MakeZero( rtensor );
 void rtensor_Multiply( rtensor, rtensor, rtensor );
 void rtensor_MatVec( rvec, rtensor, rvec );
@@ -80,15 +67,6 @@ void ivec_MakeZero( ivec );
 void ivec_rScale( ivec, real, rvec );
 
 
-HOST_DEVICE inline void ivec_Copy( ivec, ivec );
-HOST_DEVICE inline void ivec_Scale( ivec, real, ivec );
-HOST_DEVICE inline void ivec_Sum( ivec, ivec, ivec );
-
-/*
- * Code which is common to multiple HOST and DEVICE
- *
- */
-
 HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
 {
     real ret = 0;
@@ -100,17 +78,15 @@ HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
 }
 
 
-
-
 /////////////////////////////
 //rvec functions
 /////////////////////////////
-
 HOST_DEVICE inline void rvec_MakeZero( rvec v )
 {
     v[0] = v[1] = v[2] = ZERO;
 }
 
+
 HOST_DEVICE inline void rvec_Add( rvec ret, rvec v )
 {
     ret[0] += v[0];
@@ -118,11 +94,13 @@ HOST_DEVICE inline void rvec_Add( rvec ret, rvec v )
     ret[2] += v[2];
 }
 
+
 HOST_DEVICE inline void rvec_Copy( rvec dest, rvec src )
 {
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
 }
 
+
 HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
 {
     ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
@@ -130,11 +108,13 @@ HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
     ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
 }
 
+
 HOST_DEVICE inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
 {
     ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
 }
 
+
 HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
 {
     ret[0] = c1 * v1[0] + c2 * v2[0];
@@ -142,6 +122,7 @@ HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rv
     ret[2] = c1 * v1[2] + c2 * v2[2];
 }
 
+
 HOST_DEVICE inline void rvec_Random( rvec v )
 {
     v[0] = Random(2.0) - 1.0;
@@ -149,21 +130,25 @@ HOST_DEVICE inline void rvec_Random( rvec v )
     v[2] = Random(2.0) - 1.0;
 }
 
+
 HOST_DEVICE inline real rvec_Norm_Sqr( rvec v )
 {
     return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
 }
 
+
 HOST_DEVICE inline void rvec_Scale( rvec ret, real c, rvec v )
 {
     ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
 }
 
+
 HOST_DEVICE inline real rvec_Dot( rvec v1, rvec v2 )
 {
     return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
 }
 
+
 HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
 {
     r[0] = v1[0] * v2[0];
@@ -171,23 +156,22 @@ HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
     r[2] = v1[2] * v2[2];
 }
 
+
 HOST_DEVICE inline real rvec_Norm( rvec v )
 {
     return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
 }
 
 
-
 /////////////////
 //ivec functions
 /////////////////
-
-
 HOST_DEVICE inline void ivec_Copy( ivec dest , ivec src )
 {
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
 }
 
+
 HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src )
 {
     dest[0] = C * src[0];
@@ -195,6 +179,7 @@ HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src )
     dest[2] = C * src[2];
 }
 
+
 HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 {
     dest[0] = v1[0] + v2[0];
@@ -203,7 +188,6 @@ HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 }
 
 
-
 /////////////////
 //vector functions
 /////////////////
@@ -213,12 +197,14 @@ HOST_DEVICE inline void Vector_Sum( real* dest, real c, real* v, real d, real* y
         dest[k] = c * v[k] + d * y[k];
 }
 
+
 HOST_DEVICE inline void Vector_Scale( real* dest, real c, real* v, int k )
 {
     for (k--; k >= 0; k--)
         dest[k] = c * v[k];
 }
 
+
 HOST_DEVICE inline void Vector_Add( real* dest, real c, real* v, int k )
 {
     for (k--; k >= 0; k--)