diff --git a/.gitignore b/.gitignore
index 78b4824986d629e57c707d0a78fb375efb441428..fd8f967c419e269fb5a95b5521100fb17e46ed6a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
+# PuReMD
 *.log
+*.prs
 *.pot
 *.trj
 
diff --git a/PG-PuReMD/Makefile.am b/PG-PuReMD/Makefile.am
index 3b051035263dfedbade53966fa45ee9d62da7775..149e16de9b1d84f601590884e39926dc708e02de 100644
--- a/PG-PuReMD/Makefile.am
+++ b/PG-PuReMD/Makefile.am
@@ -35,7 +35,7 @@ include_HEADERS = src/reax_types.h src/index_utils.h \
 
 if USE_CUDA
 bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cuda/cuda_environment.cu \
-      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu \
+      src/cuda/cuda_system_props.cu src/cuda/cuda_reduction.cu src/cuda/cuda_box.cu \
       src/cuda/cuda_copy.cu src/cuda/cuda_reset_tools.cu src/cuda/cuda_list.cu \
       src/cuda/cuda_neighbors.cu src/cuda/cuda_bond_orders.cu src/cuda/cuda_bonds.cu \
       src/cuda/cuda_multi_body.cu src/cuda/cuda_valence_angles.cu \
@@ -45,7 +45,7 @@ bin_pg_puremd_SOURCES += src/cuda/cuda_utils.cu src/cuda/cuda_allocate.cu src/cu
       src/cuda/cuda_init_md.cu src/cuda/cuda_validation.cu src/cuda/cuda_lookup.cu
 include_HEADERS += src/cuda/cuda_helpers.h src/cuda/cuda_shuffle.h \
       src/cuda/cuda_utils.h src/cuda/cuda_allocate.h src/cuda/cuda_environment.h \
-      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h \
+      src/cuda/cuda_system_props.h src/cuda/cuda_reduction.h src/cuda/cuda_box.h \
       src/cuda/cuda_copy.h src/cuda/cuda_reset_tools.h src/cuda/cuda_list.h \
       src/cuda/cuda_neighbors.h src/cuda/cuda_bond_orders.h src/cuda/cuda_bonds.h \
       src/cuda/cuda_multi_body.h src/cuda/cuda_valence_angles.h \
diff --git a/PG-PuReMD/src/box.h b/PG-PuReMD/src/box.h
index 00e51d063a1126c662f717df938787178ff77549..fa92c72275d2acfad2258a09cc288e0203a8f5bf 100644
--- a/PG-PuReMD/src/box.h
+++ b/PG-PuReMD/src/box.h
@@ -29,14 +29,16 @@
 extern "C" {
 #endif
 
+void Make_Consistent( simulation_box* );
+
 /* initializes simulation boxes */
 void Setup_Big_Box( real, real, real, real, real, real, simulation_box* );
 
 void Init_Box( rtensor, simulation_box* );
 
-//void Setup_My_Box( reax_system*, control_params* );
+void Setup_My_Box( reax_system*, control_params* );
 
-//void Setup_My_Ext_Box( reax_system*, control_params* );
+void Setup_My_Ext_Box( reax_system*, control_params* );
 
 void Setup_Environment( reax_system*, control_params*, mpi_datatypes* );
 
diff --git a/PG-PuReMD/src/cuda/cuda_allocate.cu b/PG-PuReMD/src/cuda/cuda_allocate.cu
index 5c722e56d816a420a93f3ac03c1994f58a1d53c5..e85b4eaee10deabf763901c6b67358cde9a8eaf2 100644
--- a/PG-PuReMD/src/cuda/cuda_allocate.cu
+++ b/PG-PuReMD/src/cuda/cuda_allocate.cu
@@ -80,46 +80,46 @@ void dev_alloc_grid( reax_system *system )
     cuda_malloc( (void **) &device->rel_box, sizeof(ivec) * total,
             TRUE, "dev_alloc_grid::grid->rel_box" );
 
-    /*
-       int block_size = 512;
-       int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); 
-
-       Init_Nbrs <<<blocks, block_size>>>
-       (nbrs_x, host->max_nbrs );
-       cudaThreadSynchronize (); 
-       cudaCheckError ();
-
-       cuda_malloc ((void **)& device->cells, 
-       sizeof (grid_cell) * total, 
-       TRUE, "grid:cells");
-       fprintf (stderr, " Device cells address --> %ld \n", device->cells );
-       cuda_malloc ((void **) &device->order, sizeof (ivec) * (host->total + 1), TRUE, "grid:order");
-
-       local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0;
-       fprintf (stderr, "Total cells to be allocated -- > %d \n", total );
-       for (int i = 0; i < total; i++) {
-    //fprintf (stderr, "Address of the local atom -> %ld  \n", &local_cell);
-
-    cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * host->max_atoms, 
-    TRUE, "alloc:grid:cells:atoms");
-    //fprintf (stderr, "Allocated address of the atoms --> %ld  (%d)\n", local_cell.atoms, host->max_atoms );
-
-    cuda_malloc ((void **) &local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, 
-    TRUE, "alloc:grid:cells:nbrs_x" );
-    copy_device (local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof (ivec), "grid:nbrs_x");    
-    //fprintf (stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x);
-
-    cuda_malloc ((void **) &local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, 
-    TRUE, "alloc:grid:cells:nbrs_cp" );
-    //fprintf (stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp);
-
-    //cuda_malloc ((void **) &local_cell.nbrs, sizeof (grid_cell *) * host->max_nbrs , 
-    //                TRUE, "alloc:grid:cells:nbrs" );
-    //fprintf (stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs);
-
-    copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-alloc");
-    }
-     */
+//    int block_size = 512;
+//    int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); 
+//
+//    Init_Nbrs <<< blocks, block_size >>>
+//        ( nbrs_x, host->max_nbrs );
+//    cudaThreadSynchronize( );
+//    cudaCheckError( );
+//
+//    cuda_malloc( (void **)& device->cells, sizeof(grid_cell) * total,
+//            TRUE, "grid:cells");
+//    fprintf( stderr, " Device cells address --> %ld \n", device->cells );
+//    cuda_malloc( (void **) &device->order,
+//            sizeof(ivec) * (host->total + 1), TRUE, "grid:order" );
+//
+//    local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0;
+//    fprintf( stderr, "Total cells to be allocated -- > %d \n", total );
+//    for (int i = 0; i < total; i++)
+//    {
+//        //fprintf( stderr, "Address of the local atom -> %ld  \n", &local_cell );
+//
+//        cuda_malloc( (void **) &local_cell.atoms, sizeof(int) * host->max_atoms,
+//                TRUE, "alloc:grid:cells:atoms" );
+//        //fprintf( stderr, "Allocated address of the atoms --> %ld  (%d)\n", local_cell.atoms, host->max_atoms );
+//
+//        cuda_malloc( (void **) &local_cell.nbrs_x, sizeof(ivec) * host->max_nbrs,
+//                TRUE, "alloc:grid:cells:nbrs_x" );
+//        copy_device( local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof(ivec), "grid:nbrs_x" );
+//        //fprintf( stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x );
+//
+//        cuda_malloc( (void **) &local_cell.nbrs_cp, sizeof(rvec) * host->max_nbrs,
+//                TRUE, "alloc:grid:cells:nbrs_cp" );
+//        //fprintf( stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp );
+//
+//        //cuda_malloc( (void **) &local_cell.nbrs, sizeof(grid_cell *) * host->max_nbrs,
+//        //                TRUE, "alloc:grid:cells:nbrs" );
+//        //fprintf( stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs );
+//
+//        copy_host_device( &local_cell, &device->cells[i], sizeof(grid_cell),
+//                cudaMemcpyHostToDevice, "grid:cell-alloc" );
+//    }
 }
 
 
@@ -757,7 +757,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
     if ( renbr && realloc->gcell_atoms > -1 )
     {
 #if defined(DEBUG_FOCUS)
-        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
+        fprintf( stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms );
 #endif
 
         for ( i = g->native_str[0]; i < g->native_end[0]; i++ )
@@ -769,7 +769,7 @@ void Cuda_ReAllocate( reax_system *system, control_params *control,
                     // reallocate g->atoms
                     sfree( g->cells[ index_grid_3d(i,j,k,g) ].atoms, "g:atoms" );
                     g->cells[ index_grid_3d(i,j,k,g) ].atoms = (int*)
-                            scalloc(realloc->gcell_atoms, sizeof(int), "g:atoms");
+                            scalloc( realloc->gcell_atoms, sizeof(int), "g:atoms" );
                 }
             }
         }
diff --git a/PG-PuReMD/src/cuda/cuda_box.cu b/PG-PuReMD/src/cuda/cuda_box.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2d5f47566f2543a21f6cf4598f30b527913a1bf6
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_box.cu
@@ -0,0 +1,118 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#include "cuda_box.h"
+
+#include "cuda_integrate.h"
+#include "cuda_system_props.h"
+#include "cuda_utils.h"
+
+#include "../box.h"
+#include "../comm_tools.h"
+
+
+void Cuda_Scale_Box( reax_system *system, control_params *control,
+        simulation_data *data, mpi_datatypes *mpi_data )
+{
+    int d;
+    real dt, lambda;
+    rvec mu = {0.0, 0.0, 0.0};
+
+    dt = control->dt;
+
+    /* pressure scaler */
+    if ( control->ensemble == iNPT )
+    {
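+        /* isotropic Berendsen barostat: scaling factor
+         * mu = [1 + (dt / tau_P) * (P - P_target)]^(1/3), clamped to [MIN_dV, MAX_dV] */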
+        mu[0] = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]),
+                1.0 / 3.0 );
+
+        if ( mu[0] < MIN_dV )
+        {
+            mu[0] = MIN_dV;
+        }
+        else if ( mu[0] > MAX_dV )
+        {
+            mu[0] = MAX_dV;
+        }
+
+        mu[1] = mu[0];
+        mu[2] = mu[1];
+    }
+    else if ( control->ensemble == sNPT )
+    {
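+        /* semi-isotropic case: independent scaling factor for each box dimension */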
+        for ( d = 0; d < 3; ++d )
+        {
+            mu[d] = POW( 1.0 + (dt / control->Tau_P[d]) * (data->tot_press[d] - control->P[d]),
+                    1.0 / 3.0 );
+
+            if ( mu[d] < MIN_dV )
+            {
+                mu[d] = MIN_dV;
+            }
+            else if ( mu[d] > MAX_dV )
+            {
+                mu[d] = MAX_dV;
+            }
+        }
+    }
+
+    /* temperature scaler */
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if ( lambda < MIN_dT )
+    {
+        lambda = MIN_dT;
+    }
+    else if ( lambda > MAX_dT )
+    {
+        lambda = MAX_dT;
+    }
+    lambda = SQRT( lambda );
+
+    /* Scale velocities and positions at t+dt */
+    bNVP_scale_velocities( system, lambda, mu );
+
+    Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG)
+    fprintf( stderr, "damping - " );
+#endif
+
+    /* update box & grid */
+    system->big_box.box[0][0] *= mu[0];
+    system->big_box.box[1][1] *= mu[1];
+    system->big_box.box[2][2] *= mu[2];
+
+    Make_Consistent( &(system->big_box) );
+    Setup_My_Box( system, control );
+    Setup_My_Ext_Box( system, control );
+    Update_Comm( system );
+
+    copy_host_device( &system->big_box, &system->d_big_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::system->big_box" );
+    copy_host_device( &system->my_box, &system->d_my_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::system->my_box" );
+    copy_host_device( &system->my_ext_box, &system->d_my_ext_box,
+            sizeof(simulation_box), cudaMemcpyHostToDevice, "Cuda_Scale_Box::system->my_ext_box" );
+
+#if defined(DEBUG)
+    fprintf( stderr, "box & grid updated - " );
+#endif
+}
diff --git a/PG-PuReMD/src/cuda/cuda_box.h b/PG-PuReMD/src/cuda/cuda_box.h
new file mode 100644
index 0000000000000000000000000000000000000000..6db597385884cbaf4efe81bb83674c5ab2e48786
--- /dev/null
+++ b/PG-PuReMD/src/cuda/cuda_box.h
@@ -0,0 +1,40 @@
+/*----------------------------------------------------------------------
+  PuReMD - Purdue ReaxFF Molecular Dynamics Program
+
+  Copyright (2010) Purdue University
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Joseph Fogarty, jcfogart@mail.usf.edu
+  Sagar Pandit, pandit@usf.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+
+#ifndef __CUDA_BOX_H__
+#define __CUDA_BOX_H__
+
+#include "../reax_types.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void Cuda_Scale_Box( reax_system *, control_params *,
+        simulation_data *, mpi_datatypes * );
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/PG-PuReMD/src/cuda/cuda_charges.cu b/PG-PuReMD/src/cuda/cuda_charges.cu
index ada6bf2f75c7960e1b00e820bf8387cbd98199d2..72018b1d936a472e2c78d78784306283f5c33ffb 100644
--- a/PG-PuReMD/src/cuda/cuda_charges.cu
+++ b/PG-PuReMD/src/cuda/cuda_charges.cu
@@ -220,7 +220,7 @@ void Cuda_Calculate_Charges( reax_system *system, storage *workspace,
     my_sum[1] = 0.0;
     scale = sizeof(real) / sizeof(void);
     q = (real *) host_scratch;
-    memset( q, 0, system->N * sizeof (real));
+    memset( q, 0, system->N * sizeof(real) );
 
     cuda_charges_x( system, my_sum );
 
@@ -270,9 +270,9 @@ void Cuda_QEq( reax_system *system, control_params *control, simulation_data
 #endif
 
     //MATRIX CHANGES
-    s_matvecs = Cuda_dual_CG(system, workspace, &dev_workspace->H,
+    s_matvecs = Cuda_dual_CG( system, workspace, &dev_workspace->H,
             dev_workspace->b, control->q_err, dev_workspace->x, mpi_data,
-            out_control->log, data);
+            out_control->log, data );
     t_matvecs = 0;
     //fprintf (stderr, "Device: First CG complated with iterations: %d \n", s_matvecs);
 
diff --git a/PG-PuReMD/src/cuda/cuda_copy.cu b/PG-PuReMD/src/cuda/cuda_copy.cu
index 42055875624ce003a3f2b4ba6661d3bc1e08eb8e..1098d197ffdce8c39a51fe4ff5033eacd13beeb4 100644
--- a/PG-PuReMD/src/cuda/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda/cuda_copy.cu
@@ -32,18 +32,18 @@ void Sync_Grid( grid *host, grid *device )
     ivec_Copy( device->ghost_bond_span, host->ghost_bond_span );
 
     copy_host_device( host->str, device->str, sizeof(int) * total,
-            cudaMemcpyHostToDevice, "grid:str");
+            cudaMemcpyHostToDevice, "grid:str" );
     copy_host_device( host->end, device->end, sizeof(int) * total,
-            cudaMemcpyHostToDevice, "grid:end");
+            cudaMemcpyHostToDevice, "grid:end" );
     copy_host_device( host->cutoff, device->cutoff, sizeof(real) * total,
-            cudaMemcpyHostToDevice, "grid:cutoff");
+            cudaMemcpyHostToDevice, "grid:cutoff" );
     copy_host_device( host->nbrs_x, device->nbrs_x, sizeof(ivec) * total *
-            host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
+            host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x" );
     copy_host_device( host->nbrs_cp, device->nbrs_cp, sizeof(rvec) * total *
-            host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
+            host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp" );
 
     copy_host_device( host->rel_box, device->rel_box, sizeof(ivec) * total,
-            cudaMemcpyHostToDevice, "grid:rel_box");
+            cudaMemcpyHostToDevice, "grid:rel_box" );
 
     device->max_nbrs = host->max_nbrs;
 }
diff --git a/PG-PuReMD/src/cuda/cuda_forces.cu b/PG-PuReMD/src/cuda/cuda_forces.cu
index a790b1a8c7fd570723c27c985c458886f0dcb478..553acbd06d26052379bd6a35817bd51683c4a1b9 100644
--- a/PG-PuReMD/src/cuda/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda/cuda_forces.cu
@@ -1554,12 +1554,12 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
 
         /* reduction for ext_pres */
         rvec_spad = (rvec *) (spad + 6 * system->N);
-        k_reduction_rvec <<<BLOCKS_N, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
+        k_reduction_rvec <<< BLOCKS_N, BLOCK_SIZE, sizeof(rvec) * BLOCK_SIZE >>>
             ( rvec_spad, rvec_spad + system->N,  system->N );
         cudaThreadSynchronize( );
         cudaCheckError( );
 
-        k_reduction_rvec <<<1, BLOCKS_POW_2_N, sizeof(rvec) * BLOCKS_POW_2_N >>>
+        k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof(rvec) * BLOCKS_POW_2_N >>>
             ( rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N );
         cudaThreadSynchronize ();
         cudaCheckError( );
@@ -1692,7 +1692,7 @@ int Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control,
                 ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS) );
 //            Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_blocks, HB_POST_PROC_BLOCK_SIZE, 
 //                    HB_POST_PROC_BLOCK_SIZE * sizeof(rvec) >>>
-                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS), system->N );
+//                ( system->d_my_atoms, *dev_workspace, *(*dev_lists + HBONDS), system->N );
             cudaThreadSynchronize( );
             cudaCheckError( );
 
diff --git a/PG-PuReMD/src/cuda/cuda_init_md.cu b/PG-PuReMD/src/cuda/cuda_init_md.cu
index fb1ac0df5abdccc5c021ee594caf4c34fe3dcd22..2bb2f89e7b76315462927b659c00efe8102b56c0 100644
--- a/PG-PuReMD/src/cuda/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda/cuda_init_md.cu
@@ -17,7 +17,6 @@
   #include "../comm_tools.h"
   #include "../grid.h"
   #include "../init_md.h"
-  #include "../integrate.h"
   #include "../io_tools.h"
 #ifdef __cplusplus
 extern "C" {
@@ -35,7 +34,6 @@ extern "C" {
   #include "../reax_comm_tools.h"
   #include "../reax_grid.h"
   #include "../reax_init_md.h"
-  #include "../reax_integrate.h"
   #include "../reax_io_tools.h"
   #include "../reax_list.h"
   #include "../reax_lookup.h"
@@ -138,7 +136,7 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
     {
     case NVE:
         data->N_f = 3 * system->bigN;
-        Cuda_Evolve = Velocity_Verlet_NVE;
+        Cuda_Evolve = Cuda_Velocity_Verlet_NVE;
         control->virial = 0;
         break;
 
@@ -151,7 +149,7 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
     case nhNVT:
         fprintf( stderr, "[WARNING] Nose-Hoover NVT is still under testing.\n" );
         data->N_f = 3 * system->bigN + 1;
-        Cuda_Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
         control->virial = 0;
         if ( !control->restart || (control->restart && control->random_vel) )
         {
@@ -165,7 +163,7 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
 
     case sNPT: /* Semi-Isotropic NPT */
         data->N_f = 3 * system->bigN + 4;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NPT;
         control->virial = 1;
         if ( !control->restart )
         {
@@ -175,7 +173,7 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
 
     case iNPT: /* Isotropic NPT */
         data->N_f = 3 * system->bigN + 2;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NPT;
         control->virial = 1;
         if ( !control->restart )
         {
@@ -185,7 +183,7 @@ void Cuda_Init_Simulation_Data( reax_system *system, control_params *control,
 
     case NPT: /* Anisotropic NPT */
         data->N_f = 3 * system->bigN + 9;
-        Cuda_Evolve = Velocity_Verlet_Berendsen_NPT;
+        Cuda_Evolve = Cuda_Velocity_Verlet_Berendsen_NPT;
         control->virial = 1;
 
         fprintf( stderr, "p%d: init_simulation_data: option not yet implemented\n",
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.cu b/PG-PuReMD/src/cuda/cuda_integrate.cu
index dcb972921f735d419e6528a390ed990699c57d2c..772032632b51031dd0abe8bec187959a1080ad4b 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda/cuda_integrate.cu
@@ -2,10 +2,12 @@
 #include "cuda_integrate.h"
 
 #include "cuda_allocate.h"
+#include "cuda_box.h"
 #include "cuda_forces.h"
 #include "cuda_integrate.h"
 #include "cuda_copy.h"
 #include "cuda_neighbors.h"
+#include "cuda_reduction.h"
 #include "cuda_reset_tools.h"
 #include "cuda_system_props.h"
 #include "cuda_utils.h"
@@ -21,7 +23,9 @@ CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms,
     real inv_m;
     rvec dx;
     reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if ( i >= n )
     {
@@ -39,14 +43,15 @@ CUDA_GLOBAL void k_update_velocity_1( reax_atom *my_atoms,
 }
 
 
-void bNVT_update_velocity_part1( reax_system *system, real dt )
+void update_velocity_part1( reax_system *system, real dt )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE + 
         ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+        ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
@@ -57,7 +62,9 @@ CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms,
 {
     reax_atom *atom;
     real inv_m;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if ( i >= n )
     {
@@ -72,20 +79,140 @@ CUDA_GLOBAL void k_update_velocity_2( reax_atom *my_atoms,
 }
 
 
-void bNVT_update_velocity_part2( reax_system *system, real dt )
+void update_velocity_part2( reax_system *system, real dt )
 {
     int blocks;
 
     blocks = system->n / DEF_BLOCK_SIZE + 
         ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
     k_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+        ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_nhNVT_update_velocity_1( reax_atom *my_atoms, 
+        single_body_parameters *sbp, real dt, int n )
+{
+    real inv_m;
+    rvec dx;
+    reax_atom *atom;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* velocity verlet, 1st part */
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    rvec_Add( atom->x, dx );
+    rvec_Copy( atom->f_old, atom->f );
+}
+
+
+void nhNVT_update_velocity_part1( reax_system *system, real dt )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_nhNVT_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, dt, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+CUDA_GLOBAL void k_nhNVT_update_velocity_2( reax_atom *my_atoms, rvec * v_const,
+        single_body_parameters *sbp, real dt, real v_xi, int n )
+{
+    reax_atom *atom;
+    real inv_m;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* velocity verlet, 2nd part */
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute v(t + dt) */
+    rvec_Scale( v_const[i], 1.0 - 0.5 * dt * v_xi, atom->v );
+    rvec_ScaledAdd( v_const[i], 0.5 * dt * inv_m * -F_CONV, atom->f_old );
+    rvec_ScaledAdd( v_const[i], 0.5 * dt * inv_m * -F_CONV, atom->f );
+}
+
+
+void nhNVT_update_velocity_part2( reax_system *system, storage *workspace, real dt, real v_xi )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_nhNVT_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp, dt, v_xi, system->n );
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
 
 
-CUDA_GLOBAL void k_scale_velocities( reax_atom *my_atoms, real lambda, int n )
+CUDA_GLOBAL void k_nhNVT_update_velocity_3( reax_atom *my_atoms, rvec *v_const,
+        single_body_parameters *sbp, real dt, real v_xi_old, real * my_ekin, int n )
+{
+    reax_atom *atom;
+    real coef_v;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
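+    /* corrector: v(t+dt) = v_const / (1 + 0.5 * dt * v_xi_old);
+     * record this atom's kinetic energy for the reduction that follows */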
+    atom = &(my_atoms[i]);
+    coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+    rvec_Scale( atom->v, coef_v, v_const[i] );
+    my_ekin[i] = (0.5 * sbp[atom->type].mass * rvec_Dot(atom->v, atom->v));
+}
+
+
+real nhNVT_update_velocity_part3( reax_system *system, storage *workspace,
+       real dt, real v_xi_old, real * d_my_ekin, real * d_total_my_ekin )
+{
+    int blocks;
+    real my_ekin;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_nhNVT_update_velocity_3 <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, workspace->v_const, system->reax_param.d_sbp, dt, v_xi_old, d_my_ekin, system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+
+    Cuda_Reduction_Sum( d_my_ekin, d_total_my_ekin, system->n );
+
+    copy_host_device( &my_ekin, d_total_my_ekin, sizeof(real),
+            cudaMemcpyDeviceToHost, "nhNVT_update_velocity_part3::d_total_my_ekin" );
+
+    return my_ekin;
+}
+
+
+CUDA_GLOBAL void k_bNVT_scale_velocities( reax_atom *my_atoms, real lambda, int n )
 {
     reax_atom *atom;
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -107,27 +234,311 @@ void bNVT_scale_velocities( reax_system *system, real lambda )
 
     blocks = system->n / DEF_BLOCK_SIZE + 
         ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-    k_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
-        (system->d_my_atoms, lambda, system->n);
+
+    k_bNVT_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, lambda, system->n );
     cudaThreadSynchronize( );
     cudaCheckError( );
 }
 
 
+CUDA_GLOBAL void k_bNVP_scale_velocities( reax_atom *my_atoms, real lambda,
+        real mu0, real mu1, real mu2, int n )
+{
+    reax_atom *atom;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    /* Scale velocities and positions at t+dt */
+    atom = &(my_atoms[i]);
+    rvec_Scale( atom->v, lambda, atom->v );
+//    rvec_Multiply( atom->x, mu, atom->x );
+    atom->x[0] = mu0 * atom->x[0];
+    atom->x[1] = mu1 * atom->x[1];
+    atom->x[2] = mu2 * atom->x[2];
+}
+
+
+void bNVP_scale_velocities( reax_system *system, real lambda, rvec mu )
+{
+    int blocks;
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    k_bNVP_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, lambda, mu[0], mu[1], mu[2], system->n );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+}
+
+
+int Cuda_Velocity_Verlet_NVE( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int steps, renbr, ret;
+    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    real dt;
+#if defined(DEBUG)
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step %d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    if ( verlet_part1_done == FALSE )
+    {
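+        /* velocity verlet, 1st part */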
+        update_velocity_part1( system, dt );
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* synch the Grid to the Device here */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+
+#if defined(__CUDA_DEBUG_LOG__)
+        fprintf( stderr, "p:%d - Matvec BLocks: %d, blocksize: %d \n",
+                system->my_rank, MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
+#endif
+    }
+
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if ( renbr )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+
+        if ( estimate_nbrs_done == 0 )
+        {
+            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
+            ret = Cuda_Estimate_Neighbors( system, data->step );
+            estimate_nbrs_done = 1;
+        }
+
+        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        {
+            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            estimate_nbrs_done = 2;
+    
+#if defined(DEBUG)
+            t_over_elapsed = Get_Timing_Info( t_over_start );
+            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                    system->my_rank, data->step, t_over_elapsed );
+#endif
+        }
+    }
+
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
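+        /* velocity verlet, 2nd part */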
+        update_velocity_part2( system, dt );
+
+        verlet_part1_done = FALSE;
+        estimate_nbrs_done = 0;
+    }
+    
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
+
+
+int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system* system,
+        control_params* control, simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int itr, steps, renbr, ret;
+    real *d_my_ekin, *d_total_my_ekin;
+    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    real dt, dt_sqr;
+    real my_ekin, new_ekin;
+    real G_xi_new, v_xi_new, v_xi_old;
+    thermostat *therm;
+#if defined(DEBUG)
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    dt_sqr = SQR(dt);
+    therm = &( data->therm );
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    if ( verlet_part1_done == FALSE )
+    {
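+        /* velocity verlet, 1st part */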
+        nhNVT_update_velocity_part1( system, dt );
+    
+        /* Compute xi(t + dt) */
+        therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* synch the Grid to the Device here */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+    }
+
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if ( renbr )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+
+        if ( estimate_nbrs_done == 0 )
+        {
+            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
+            ret = Cuda_Estimate_Neighbors( system, data->step );
+            fprintf( stderr, "Cuda_Estimate_Neighbors (%d)\n", ret );
+            estimate_nbrs_done = 1;
+        }
+
+        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        {
+            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            estimate_nbrs_done = 2;
+    
+#if defined(DEBUG)
+            t_over_elapsed = Get_Timing_Info( t_over_start );
+            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                    system->my_rank, data->step, t_over_elapsed );
+#endif
+        }
+    }
+
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+        fprintf( stderr, "Cuda_Compute_Forces (%d)\n", ret );
+    }
+
+    if ( ret == SUCCESS )
+    {
+        /* Compute iteration constants for each atom's velocity */
+        nhNVT_update_velocity_part2( system, dev_workspace, dt, therm->v_xi );
+    
+        v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+        my_ekin = G_xi_new = v_xi_old = 0;
+        itr = 0;
+
+        cuda_malloc( (void **) &d_my_ekin, sizeof(real) * system->n, FALSE,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_my_ekin" );
+        cuda_malloc( (void **) &d_total_my_ekin, sizeof(real), FALSE,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_total_my_ekin" );
+
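+        /* iterate until the thermostat velocity v_xi converges */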
+        do
+        {
+            itr++;
+    
+            /* new values become old in this iteration */
+            v_xi_old = v_xi_new;
+    
+            my_ekin = nhNVT_update_velocity_part3( system, dev_workspace, dt, v_xi_old,
+                    d_my_ekin, d_total_my_ekin );
+    
+            MPI_Allreduce( &my_ekin, &new_ekin, 1, MPI_DOUBLE, MPI_SUM,
+                    mpi_data->comm_mesh3D );
+    
+            G_xi_new = control->Tau_T * ( 2.0 * new_ekin - data->N_f * K_B * control->T );
+            v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+        }
+        while ( FABS(v_xi_new - v_xi_old) > 1e-5 );
+        therm->v_xi_old = therm->v_xi;
+        therm->v_xi = v_xi_new;
+        therm->G_xi = G_xi_new;
+
+        cuda_free( d_total_my_ekin,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_total_my_ekin" );
+        cuda_free( d_my_ekin,
+                "Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein::d_my_ekin" );
+
+        verlet_part1_done = FALSE;
+        estimate_nbrs_done = 0;
+    }
+    
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
+
+
+/* uses Berendsen-type coupling for T only:
+ * velocities are rescaled by a uniform factor toward the
+ * target temperature; the box dimensions are left unchanged. */
 int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
         simulation_data *data, storage *workspace, reax_list **lists,
         output_controls *out_control, mpi_datatypes *mpi_data )
 {
-    int i, steps, renbr, ret;
+    int steps, renbr, ret;
     static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
-    real inv_m, dt, lambda;
-    rvec dx;
-    reax_atom *atom;
-    int *bond_top, *hb_top;
-    int Htop, num_3body;
-    int total_hbonds, count, total_bonds;
-    int bond_cap, cap_3body;
+    real dt, lambda;
+#if defined(DEBUG)
     real t_over_start, t_over_elapsed;
+#endif
 
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d @ step%d\n", system->my_rank, data->step );
@@ -144,7 +555,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
     if ( verlet_part1_done == FALSE )
     {
         /* velocity verlet, 1st part */
-        bNVT_update_velocity_part1( system, dt );
+        update_velocity_part1( system, dt );
         verlet_part1_done = TRUE;
 
 #if defined(DEBUG_FOCUS)
@@ -161,7 +572,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
         Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
         Sync_Atoms( system );
 
-        /* synch the Grid to the Device here */
+        /* sync grid to device */
         Sync_Grid( &system->my_grid, &system->d_my_grid );
 
         init_blocks( system );
@@ -209,7 +620,7 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
     if ( ret == SUCCESS )
     {
         /* velocity verlet, 2nd part */
-        bNVT_update_velocity_part2( system, dt );
+        update_velocity_part2( system, dt );
 
 #if defined(DEBUG_FOCUS)
         fprintf(stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step);
@@ -247,3 +658,109 @@ int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* con
 
     return ret;
 }
+
+
+/* uses Berendsen-type coupling for both T and P.
+ * All box dimensions are scaled by the same amount,
+ * there is no change in the angles between axes. */
+int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system* system, control_params* control,
+        simulation_data *data, storage *workspace, reax_list **lists,
+        output_controls *out_control, mpi_datatypes *mpi_data )
+{
+    int steps, renbr, ret;
+    static int verlet_part1_done = FALSE, estimate_nbrs_done = 0;
+    real dt;
+#if defined(DEBUG)
+    real t_over_start, t_over_elapsed;
+#endif
+
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step %d\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    ret = SUCCESS;
+
+    Cuda_ReAllocate( system, control, data, workspace, lists, mpi_data );
+
+    if ( verlet_part1_done == FALSE )
+    {
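+        /* velocity verlet, 1st part */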
+        update_velocity_part1( system, dt );
+        verlet_part1_done = TRUE;
+
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "p%d @ step%d: verlet1 done\n", system->my_rank, data->step );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        if ( renbr )
+        {
+            Update_Grid( system, control, mpi_data->world );
+        }
+
+        Output_Sync_Atoms( system );
+        Comm_Atoms( system, control, data, workspace, lists, mpi_data, renbr );
+        Sync_Atoms( system );
+
+        /* sync grid to device */
+        Sync_Grid( &system->my_grid, &system->d_my_grid );
+
+        init_blocks( system );
+    }
+
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if ( renbr )
+    {
+#if defined(DEBUG)
+        t_over_start  = Get_Time( );
+#endif
+
+        if ( estimate_nbrs_done == 0 )
+        {
+            //TODO: move far_nbrs reallocation checks outside of renbr frequency check
+            ret = Cuda_Estimate_Neighbors( system, data->step );
+            estimate_nbrs_done = 1;
+        }
+
+        if ( ret == SUCCESS && estimate_nbrs_done == 1 )
+        {
+            Cuda_Generate_Neighbor_Lists( system, data, workspace, lists );
+            estimate_nbrs_done = 2;
+    
+#if defined(DEBUG)
+            t_over_elapsed = Get_Timing_Info( t_over_start );
+            fprintf( stderr, "p%d --> Overhead (Step-%d) %f \n",
+                    system->my_rank, data->step, t_over_elapsed );
+#endif
+        }
+    }
+
+    if ( ret == SUCCESS )
+    {
+        ret = Cuda_Compute_Forces( system, control, data, workspace,
+                lists, out_control, mpi_data );
+    }
+
+    if ( ret == SUCCESS )
+    {
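+        /* velocity verlet, 2nd part */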
+        update_velocity_part2( system, dt );
+
+        verlet_part1_done = FALSE;
+        estimate_nbrs_done = 0;
+
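+        /* Berendsen P/T coupling: update kinetic energy and pressure, then rescale the box */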
+        Cuda_Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );
+        Cuda_Compute_Pressure( system, control, data, mpi_data );
+        Cuda_Scale_Box( system, control, data, mpi_data );
+    }
+    
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "p%d @ step%d: verlet2 done\n", system->my_rank, data->step );
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+    return ret;
+}
diff --git a/PG-PuReMD/src/cuda/cuda_integrate.h b/PG-PuReMD/src/cuda/cuda_integrate.h
index 2797b3e33397685fc1cfdc05dedead43fb98fe74..9d893419abd1c0c4397641cbad9fd0f42f888696 100644
--- a/PG-PuReMD/src/cuda/cuda_integrate.h
+++ b/PG-PuReMD/src/cuda/cuda_integrate.h
@@ -29,16 +29,24 @@
 extern "C" {
 #endif
 
-void bNVT_update_velocity_part1( reax_system *, real );
+void bNVP_scale_velocities( reax_system *, real, rvec );
 
-void bNVT_update_velocity_part2( reax_system *, real );
+int Cuda_Velocity_Verlet_NVE( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
 
-void bNVT_scale_velocities( reax_system *, real );
+int Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
 
 int Cuda_Velocity_Verlet_Berendsen_NVT( reax_system*, control_params*,
         simulation_data*, storage*, reax_list**, output_controls*,
         mpi_datatypes* );
 
+int Cuda_Velocity_Verlet_Berendsen_NPT( reax_system*, control_params*,
+        simulation_data*, storage*, reax_list**, output_controls*,
+        mpi_datatypes* );
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/PG-PuReMD/src/cuda/cuda_lin_alg.cu b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
index dc7a2fc344e8c82d2758f89052b4cad34fb578ba..a0db127b6abefd39019acb874147ecb022a85705 100644
--- a/PG-PuReMD/src/cuda/cuda_lin_alg.cu
+++ b/PG-PuReMD/src/cuda/cuda_lin_alg.cu
@@ -766,7 +766,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     }
 #endif
 
-    for ( i = 1; i < 300; ++i )
+    for ( i = 1; i < 1000; ++i )
     {
         //MVAPICH2
 //#ifdef __CUDA_DEBUG__
@@ -948,7 +948,7 @@ int Cuda_dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Cuda_RvecCopy_To( dev_workspace->x, dev_workspace->s, 0, system->n );
     }
 
-    if ( i >= 300 )
+    if ( i >= 1000 )
     {
         fprintf( stderr, "[WARNING] p%d: dual CG convergence failed! (%d steps)\n",
                 system->my_rank, i );
diff --git a/PG-PuReMD/src/cuda/cuda_neighbors.cu b/PG-PuReMD/src/cuda/cuda_neighbors.cu
index b1f2b85d7530efb12647de27e0fa3d89ca3086b9..f9c127d6e919c8a9177b858b1c49f84866aa145b 100644
--- a/PG-PuReMD/src/cuda/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda/cuda_neighbors.cu
@@ -68,8 +68,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
     atom1 = &(my_atoms[l]);
     num_far = Dev_Start_Index( l, &far_nbrs );
 
-    //get the coordinates of the atom and 
-    //compute the grid cell
+    /* get the coordinates of the atom and compute the grid cell */
     if ( l < n )
     {
         for ( i = 0; i < 3; i++ )
@@ -115,7 +114,7 @@ CUDA_GLOBAL void k_generate_neighbor_lists( reax_atom *my_atoms,
 
         /* if neighboring grid cell is further in the "positive" direction AND within cutoff */
         if ( g.str[index_grid_3d(i, j, k, &g)] <= g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-                Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)], atom1->x) <= cutoff )
+                Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs(i, j, k, itr, &g)], atom1->x) <= cutoff )
         {
             /* pick up another atom from the neighbor cell */
             for ( m = g.str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.cu b/PG-PuReMD/src/cuda/cuda_system_props.cu
index 54957d00fc8d555a26ac699511ed003865482a19..a7ece36c47cce9b0e07178d8b92fe68844472fc8 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.cu
+++ b/PG-PuReMD/src/cuda/cuda_system_props.cu
@@ -1,11 +1,12 @@
 
 #include "cuda_system_props.h"
 
+#include "cuda_copy.h"
 #include "cuda_utils.h"
 #include "cuda_reduction.h"
-#include "cuda_copy.h"
 #include "cuda_shuffle.h"
 
+#include "../tool_box.h"
 #include "../vector.h"
 
 
@@ -622,18 +623,18 @@ extern "C" void dev_compute_total_mass( reax_system *system, real *local_val )
     real *block_mass = (real *) scratch;
     cuda_memset( block_mass, 0, sizeof(real) * (1 + BLOCKS_POW_2), "total_mass:tmp" );
 
-    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
+    k_compute_total_mass <<< BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
-        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
+    k_reduction <<< 1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        ( block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2 );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof(real), 
-            cudaMemcpyDeviceToHost, "total_mass:tmp");
+    copy_host_device( local_val, block_mass + BLOCKS_POW_2, sizeof(real), 
+            cudaMemcpyDeviceToHost, "total_mass:tmp" );
 }
 
 
@@ -655,9 +656,9 @@ CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_ato
 
     __syncthreads( );
 
-    for(int z = 16; z >=1; z/=2)
+    for(int z = 16; z >= 1; z /= 2)
     {
-        sdata += shfl( sdata, z);
+        sdata += shfl( sdata, z );
     }
 
     if (threadIdx.x % 32 == 0)
@@ -715,27 +716,28 @@ CUDA_GLOBAL void k_compute_kinetic_energy( single_body_parameters *sbp, reax_ato
 #endif
 }
 
+
 extern "C" void dev_compute_kinetic_energy( reax_system *system,
         simulation_data *data, real *local_val )
 {
     real *block_energy = (real *) scratch;
     cuda_memset( block_energy, 0, sizeof(real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp" );
 
-    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
-        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
+    k_compute_kinetic_energy <<< BLOCKS, BLOCK_SIZE, sizeof(real) * BLOCK_SIZE >>>
+        ( system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
-    k_reduction <<<1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
-        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
+    k_reduction <<< 1, BLOCKS_POW_2, sizeof(real) * BLOCKS_POW_2 >>>
+        ( block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2 );
     cudaThreadSynchronize( );
     cudaCheckError( );
 
+    //copy_host_device( local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
     copy_host_device( local_val, block_energy + BLOCKS_POW_2,
-            //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
             sizeof(real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp" );
-            //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
-            //        sizeof (real), "kinetic_energy");
+    //copy_device( block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
+    //        sizeof(real), "kinetic_energy" );
 }
 
 
@@ -884,20 +886,16 @@ extern "C" void dev_sync_simulation_data( simulation_data *data )
 void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
         MPI_Comm comm )
 {
-    int i;
-    rvec p;
-    real m;
-
     data->my_en.e_kin = 0.0;
 
     dev_compute_kinetic_energy( system, data, &data->my_en.e_kin );
 
-    MPI_Allreduce( &data->my_en.e_kin,  &data->sys_en.e_kin,
+    MPI_Allreduce( &data->my_en.e_kin, &data->sys_en.e_kin,
             1, MPI_DOUBLE, MPI_SUM, comm );
 
-    data->therm.T = (2. * data->sys_en.e_kin) / (data->N_f * K_B);
+    data->therm.T = (2.0 * data->sys_en.e_kin) / (data->N_f * K_B);
 
-    // avoid T being an absolute zero, might cause F.P.E!
+    /* avoid T being an absolute zero, might cause F.P.E! */
     if ( FABS(data->therm.T) < ALMOST_ZERO )
     {
         data->therm.T = ALMOST_ZERO;
@@ -908,10 +906,9 @@ void Cuda_Compute_Kinetic_Energy( reax_system* system, simulation_data* data,
 void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data,
         MPI_Comm comm  )
 {
-    int  i;
     real tmp;
 
-    //compute local total mass of the system
+    /* compute local total mass of the system */
     dev_compute_total_mass( system, &tmp );
 
     MPI_Allreduce( &tmp, &data->M, 1, MPI_DOUBLE, MPI_SUM, comm );
@@ -924,10 +921,10 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         mpi_datatypes *mpi_data, MPI_Comm comm )
 {
     int i;
-    real m, det; //xx, xy, xz, yy, yz, zz;
+    real det; //xx, xy, xz, yy, yz, zz;
     real tmp_mat[6], tot_mat[6];
     rvec my_xcm, my_vcm, my_amcm, my_avcm;
-    rvec tvec, diff;
+    rvec tvec;
     rtensor mat, inv;
 
     rvec_MakeZero( my_xcm );  // position of CoM
@@ -1024,3 +1021,116 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 }
 
 
+CUDA_GLOBAL void k_compute_pressure( reax_atom *my_atoms, simulation_box *big_box,
+        rvec *int_press, int n )
+{
+    reax_atom *p_atom;
+    rvec tx;
+    int i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if ( i >= n )
+    {
+        return;
+    }
+
+    p_atom = &( my_atoms[i] );
+    rvec_MakeZero( int_press[i] );
+
+    /* transform x into unit box coordinates, store in tx */
+    Transform_to_UnitBox( p_atom->x, big_box, 1, tx );
+
+    /* this atom's contribution to internal pressure */
+    rvec_Multiply( int_press[i], p_atom->f, tx );
+}
+
+
+/* IMPORTANT: This function assumes that the current kinetic energy
+ * of the system has already been computed
+ *
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+ *  to be added when there are long-range interactions or long-range
+ *  corrections to short-range interactions present.
+ *  We may want to add that for more accuracy.
+ */
+void Cuda_Compute_Pressure( reax_system* system, control_params *control,
+        simulation_data* data, mpi_datatypes *mpi_data )
+{
+    int blocks, block_size, blocks_n, blocks_pow_2_n;
+    rvec *rvec_spad;
+    rvec int_press;
+    simulation_box *big_box;
+    
+    rvec_spad = (rvec *) scratch;
+    big_box = &(system->big_box);
+    rvec_MakeZero( int_press );
+
+    /* 0: both int and ext, 1: ext only, 2: int only */
+    if ( control->press_mode == 0 || control->press_mode == 2 )
+    {
+        blocks = system->n / DEF_BLOCK_SIZE + 
+            ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+        compute_blocks( &blocks_n, &block_size, system->n );
+        compute_nearest_pow_2( blocks_n, &blocks_pow_2_n );
+
+        k_compute_pressure <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->d_big_box, rvec_spad,
+              system->n );
+
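+        /* two-stage reduction of the per-atom terms into a single internal pressure vector */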
+        k_reduction_rvec <<< blocks_n, block_size, sizeof(rvec) * block_size >>>
+            ( rvec_spad, rvec_spad + system->n,  system->n );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        k_reduction_rvec <<< 1, blocks_pow_2_n, sizeof(rvec) * blocks_pow_2_n >>>
+            ( rvec_spad + system->n, rvec_spad + system->n + blocks_n, blocks_n );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+
+        copy_host_device( &int_press, rvec_spad + system->n + blocks_n, sizeof(rvec), 
+                cudaMemcpyDeviceToHost, "Cuda_Compute_Pressure::d_int_press" );
+    }
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d:p_int(%10.5f %10.5f %10.5f)p_ext(%10.5f %10.5f %10.5f)\n",
+            system->my_rank, int_press[0], int_press[1], int_press[2],
+            data->my_ext_press[0], data->my_ext_press[1], data->my_ext_press[2] );
+#endif
+
+    /* sum up internal and external pressure */
+    MPI_Allreduce( int_press, data->int_press,
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+    MPI_Allreduce( data->my_ext_press, data->ext_press,
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+
+#if defined(DEBUG)
+    fprintf( stderr, "p%d: %10.5f %10.5f %10.5f\n",
+             system->my_rank,
+             data->int_press[0], data->int_press[1], data->int_press[2] );
+    fprintf( stderr, "p%d: %10.5f %10.5f %10.5f\n",
+             system->my_rank,
+             data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+#endif
+
+    /* kinetic contribution */
+    data->kin_press = 2.0 * (E_CONV * data->sys_en.e_kin)
+        / (3.0 * big_box->V * P_CONV);
+
+    /* Calculate total pressure in each direction */
+    data->tot_press[0] = data->kin_press -
+        (( data->int_press[0] + data->ext_press[0] ) /
+         ( big_box->box_norms[1] * big_box->box_norms[2] * P_CONV ));
+
+    data->tot_press[1] = data->kin_press -
+        (( data->int_press[1] + data->ext_press[1] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[2] * P_CONV ));
+
+    data->tot_press[2] = data->kin_press -
+        (( data->int_press[2] + data->ext_press[2] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[1] * P_CONV ));
+
+    /* Average pressure for the whole box */
+    data->iso_bar.P =
+        ( data->tot_press[0] + data->tot_press[1] + data->tot_press[2] ) / 3.0;
+}
diff --git a/PG-PuReMD/src/cuda/cuda_system_props.h b/PG-PuReMD/src/cuda/cuda_system_props.h
index 66f620b301deaf3d5bf0d7acac48e28fa45f6901..ddf8ed4d5c5b99e5cddda960c5b6274cd5622386 100644
--- a/PG-PuReMD/src/cuda/cuda_system_props.h
+++ b/PG-PuReMD/src/cuda/cuda_system_props.h
@@ -28,6 +28,9 @@ void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data*, MPI_Comm );
 void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*,
         mpi_datatypes*, MPI_Comm );
 
+void Cuda_Compute_Pressure( reax_system *, control_params *,
+        simulation_data *, mpi_datatypes * );
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/PG-PuReMD/src/grid.c b/PG-PuReMD/src/grid.c
index d893f6c60e166f8e11092e203817a3f610439c64..7f3ef23144dc9b7dd64929b1270dbd348bef79a5 100644
--- a/PG-PuReMD/src/grid.c
+++ b/PG-PuReMD/src/grid.c
@@ -215,7 +215,7 @@ void Find_Neighbor_GridCells( grid *g, control_params *control )
                             //gc->nbrs[top] = &(g->cells[cj[0]][cj[1]][cj[2]]);
                             //gc->nbrs[top] = &(g->cells[ index_grid_3d(cj[0],cj[1],cj[2],g) ]);
                             ivec_Copy( g->nbrs_x[index_grid_nbrs(ci[0], ci[1], ci[2], top, g)], cj );
-                            //fprintf (stderr, " index: %d - %d \n", index_grid_nbrs (ci[0], ci[1], ci[2], top, g), g->total * g->max_nbrs);
+                            //fprintf( stderr, " index: %d - %d \n", index_grid_nbrs (ci[0], ci[1], ci[2], top, g), g->total * g->max_nbrs );
                             Find_Closest_Point( /*ext_box,*/ g, ci, cj, g->nbrs_cp[index_grid_nbrs (ci[0], ci[1], ci[2], top, g)] );
                             //fprintf( stderr, "cp: %f %f %f\n",
                             //       gc->nbrs_cp[top][0], gc->nbrs_cp[top][1],
@@ -787,13 +787,14 @@ void Bin_Boundary_Atoms( reax_system *system )
     Get_Boundary_GCell( g, base, atoms[start].x, &gc, &cur_min, &cur_max, gcell_cood );
     g->str[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = start;
     gc->top = 1;
+
     /* error check */
     if ( is_Within_GCell( atoms[start].x, ext_box->min, ext_box->max ) == FALSE )
     {
         fprintf( stderr, "p%d: (start):ghost atom%d [%f %f %f] is out of my (grid cell) box!\n",
                  system->my_rank, start,
                  atoms[start].x[0], atoms[start].x[1], atoms[start].x[2] );
-//        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+        MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
     }
 
     for ( i = start + 1; i < end; i++ )
@@ -804,7 +805,7 @@ void Bin_Boundary_Atoms( reax_system *system )
             fprintf( stderr, "p%d: (middle) ghost atom%d [%f %f %f] is out of my (grid cell) box!\n",
                     system->my_rank, i,
                     atoms[i].x[0], atoms[i].x[1], atoms[i].x[2] );
-//            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+            MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
         }
 
         if ( is_Within_GCell( atoms[i].x, cur_min, cur_max ) == TRUE )
@@ -815,6 +816,7 @@ void Bin_Boundary_Atoms( reax_system *system )
         {
             g->end[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = i;
             Get_Boundary_GCell( g, base, atoms[i].x, &gc, &cur_min, &cur_max, gcell_cood );
+
             /* sanity check! */
             if ( gc->top != 0 )
             {
@@ -824,8 +826,9 @@ void Bin_Boundary_Atoms( reax_system *system )
                          atoms[i].x[0], atoms[i].x[1], atoms[i].x[2],
                          gc->min[0], gc->min[1], gc->min[2],
                          gc->max[0], gc->max[1], gc->max[2] );
-//                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
             }
+
             g->str[index_grid_3d( gcell_cood[0], gcell_cood[1], gcell_cood[2], g )] = i;
             gc->top = 1;
         }
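
The re-enabled MPI_Abort calls above turn the "ghost atom out of my box" diagnostics in Bin_Boundary_Atoms into fatal errors. The underlying test is an axis-aligned containment check; a minimal sketch of that style of check, with a stand-in rvec typedef rather than the PuReMD is_Within_GCell helper:

    #include <stdio.h>

    /* Sketch of an axis-aligned containment test in the style of the
     * is_Within_GCell checks above; rvec here is a stand-in typedef. */
    typedef double rvec[3];

    static int within_box( const rvec x, const rvec min, const rvec max )
    {
        int d;

        for ( d = 0; d < 3; ++d )
        {
            /* treat each axis as the half-open interval [min, max) */
            if ( x[d] < min[d] || x[d] >= max[d] )
            {
                return 0;
            }
        }

        return 1;
    }

    int main( void )
    {
        rvec min = { 0.0, 0.0, 0.0 }, max = { 10.0, 10.0, 10.0 };
        rvec p = { 3.0, 11.0, 5.0 };

        if ( !within_box( p, min, max ) )
        {
            /* in the patched code, this is where MPI_Abort is now invoked */
            fprintf( stderr, "atom [%f %f %f] is outside the (grid cell) box\n",
                     p[0], p[1], p[2] );
        }

        return 0;
    }
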
diff --git a/PG-PuReMD/src/integrate.c b/PG-PuReMD/src/integrate.c
index b02008972bf42c235df90fd16ba5b8f84e6097ba..d3e9239f38e5b826b871ade2a6ade78f6b180c66 100644
--- a/PG-PuReMD/src/integrate.c
+++ b/PG-PuReMD/src/integrate.c
@@ -64,8 +64,10 @@ int Velocity_Verlet_NVE( reax_system* system, control_params* control,
         {
             atom = &(system->my_atoms[i]);
             inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
+            /* Compute x(t + dt) */
             rvec_ScaledSum( dx, dt, atom->v, 0.5 * dt_sqr * -F_CONV * inv_m, atom->f );
             rvec_Add( system->my_atoms[i].x, dx );
+            /* Compute v(t + dt/2) */
             rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
         }
         verlet_part1_done = TRUE;
@@ -232,7 +234,7 @@ int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
 {
     int i, steps, renbr, ret;
     static int verlet_part1_done = FALSE;
-    real inv_m, dt, lambda;
+    real inv_m, dt, dt_sqr, lambda;
     rvec dx;
     reax_atom *atom;
 
@@ -243,6 +245,7 @@ int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = steps % control->reneighbor == 0 ? TRUE : FALSE;
+    dt_sqr = SQR(dt);
 
     ReAllocate( system, control, data, workspace, lists, mpi_data );
 
@@ -253,10 +256,8 @@ int Velocity_Verlet_Berendsen_NVT( reax_system* system, control_params* control,
         {
             atom = &(system->my_atoms[i]);
             inv_m = 1.0 / system->reax_param.sbp[atom->type].mass;
-            /* Compute x(t + dt) */
-            rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+            rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * dt_sqr, atom->f );
             rvec_Add( atom->x, dx );
-            /* Compute v(t + dt/2) */
             rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
         }
 
diff --git a/PG-PuReMD/src/neighbors.c b/PG-PuReMD/src/neighbors.c
index e938329a6728f57d178d530ee8bb996447fa8af7..732a6e7da4bdbcb422308246db9916d66870385c 100644
--- a/PG-PuReMD/src/neighbors.c
+++ b/PG-PuReMD/src/neighbors.c
@@ -112,22 +112,23 @@ void Generate_Neighbor_Lists( reax_system *system, simulation_data *data,
                 cutoff = SQR(g->cutoff[index_grid_3d(i, j, k, g)]);
 
                 /* pick up an atom from the current cell */
-                for (l = g->str[index_grid_3d(i, j, k, g)]; l < g->end[index_grid_3d(i, j, k, g)]; ++l )
+                for ( l = g->str[index_grid_3d(i, j, k, g)]; l < g->end[index_grid_3d(i, j, k, g)]; ++l )
                 {
                     atom1 = &(system->my_atoms[l]);
                     Set_Start_Index( l, num_far, far_nbrs );
 
                     itr = 0;
+                    /* search through the neighbor grid cell candidates of the current cell */
                     while ( (g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)][0]) >= 0 )
                     {
 
-                        ivec_Copy (nbrs_x, g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)]);
-                        gcj = &( g->cells [ index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g) ] );
+                        ivec_Copy( nbrs_x, g->nbrs_x[index_grid_nbrs(i, j, k, itr, g)] );
+                        gcj = &( g->cells[ index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g) ] );
 
                         if ( g->str[index_grid_3d(i, j, k, g)] <= g->str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)] &&
                                 (DistSqr_to_Special_Point(g->nbrs_cp[index_grid_nbrs(i, j, k, itr, g)], atom1->x) <= cutoff) )
                         {
-                            /* pick up another atom from the neighbor cell */
+                            /* pick up another atom from the neighbor grid cell */
                             for ( m = g->str[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)];
                                     m < g->end[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], g)]; ++m )
                             {
@@ -217,10 +218,7 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
         {
             for ( k = 0; k < g->ncells[2]; k++ )
             {
-                //SUDHIR
-                //gci = &(g->cells[i][j][k]);
                 gci = &(g->cells[ index_grid_3d(i, j, k, g) ]);
-                //cutoff = SQR(gci->cutoff);
                 cutoff = SQR(g->cutoff[ index_grid_3d(i, j, k, g) ]);
 
                 //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
@@ -279,15 +277,12 @@ int Estimate_NumNeighbors( reax_system *system, reax_list **lists )
         }
     }
 
-#if defined(DEBUG)
-    fprintf (stderr, "Total number of host neighbors: %d \n", num_far);
-#endif
-
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "p%d: estimate nbrs done - num_far=%d\n",
              system->my_rank, num_far );
     MPI_Barrier( MPI_COMM_WORLD );
 #endif
+
     return MAX( num_far * SAFE_ZONE, MIN_CAP * MIN_NBRS );
 }
 
diff --git a/PG-PuReMD/src/system_props.c b/PG-PuReMD/src/system_props.c
index ea4465c58f54635963b0da5c383833fc2aa094f7..c983afc1cad8fbc324fb7977a921c96dbfbe23f2 100644
--- a/PG-PuReMD/src/system_props.c
+++ b/PG-PuReMD/src/system_props.c
@@ -40,7 +40,8 @@ void Temperature_Control( control_params *control, simulation_data *data )
 {
     real tmp;
 
-    if ( control->T_mode == 1 ) // step-wise temperature control
+    /* step-wise temperature control */
+    if ( control->T_mode == 1 )
     {
         if ((data->step - data->prev_steps) % ((int)(control->T_freq / control->dt)) == 0)
         {
@@ -54,7 +55,8 @@ void Temperature_Control( control_params *control, simulation_data *data )
             }
         }
     }
-    else if ( control->T_mode == 2 )  // constant slope control
+    /* constant-slope temperature control */
+    else if ( control->T_mode == 2 )
     {
         tmp = control->T_rate * control->dt / control->T_freq;
 
@@ -105,7 +107,6 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
     my_en[13] = data->my_en.e_kin;
 
 #ifdef HAVE_CUDA
-    //Cuda Wrapper here
     dev_sync_simulation_data( data );
 #endif
 
@@ -166,7 +167,7 @@ void Compute_System_Energy( reax_system *system, simulation_data *data,
 void Compute_Total_Mass( reax_system *system, simulation_data *data,
         MPI_Comm comm  )
 {
-    int  i;
+    int i;
     real tmp;
 
     tmp = 0;
@@ -268,8 +269,13 @@ void Compute_Center_of_Mass( reax_system *system, simulation_data *data,
         inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
 
         if ( det > ALMOST_ZERO )
+        {
             rtensor_Scale( inv, 1. / det, inv );
-        else rtensor_MakeZero( inv );
+        }
+        else
+        {
+            rtensor_MakeZero( inv );
+        }
 
         /* Compute the angular velocity about the centre of mass */
         rtensor_MatVec( data->avcm, inv, data->amcm );
@@ -345,16 +351,18 @@ void Compute_Pressure( reax_system* system, control_params *control,
         }
     }
 
-    /* sum up internal and external pressure */
 #if defined(DEBUG)
     fprintf(stderr, "p%d:p_int(%10.5f %10.5f %10.5f)p_ext(%10.5f %10.5f %10.5f)\n",
             system->my_rank, int_press[0], int_press[1], int_press[2],
             data->my_ext_press[0], data->my_ext_press[1], data->my_ext_press[2] );
 #endif
+
+    /* sum up internal and external pressure */
     MPI_Allreduce( int_press, data->int_press,
-                   3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
     MPI_Allreduce( data->my_ext_press, data->ext_press,
-                   3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+            3, MPI_DOUBLE, MPI_SUM, mpi_data->comm_mesh3D );
+
 #if defined(DEBUG)
     fprintf( stderr, "p%d: %10.5f %10.5f %10.5f\n",
              system->my_rank,
@@ -365,20 +373,21 @@ void Compute_Pressure( reax_system* system, control_params *control,
 #endif
 
     /* kinetic contribution */
-    data->kin_press = 2.*(E_CONV * data->sys_en.e_kin) / (3.*big_box->V * P_CONV);
+    data->kin_press = 2. * (E_CONV * data->sys_en.e_kin)
+        / (3. * big_box->V * P_CONV);
 
     /* Calculate total pressure in each direction */
     data->tot_press[0] = data->kin_press -
-                         (( data->int_press[0] + data->ext_press[0] ) /
-                          ( big_box->box_norms[1] * big_box->box_norms[2] * P_CONV ));
+        (( data->int_press[0] + data->ext_press[0] ) /
+         ( big_box->box_norms[1] * big_box->box_norms[2] * P_CONV ));
 
     data->tot_press[1] = data->kin_press -
-                         (( data->int_press[1] + data->ext_press[1] ) /
-                          ( big_box->box_norms[0] * big_box->box_norms[2] * P_CONV ));
+        (( data->int_press[1] + data->ext_press[1] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[2] * P_CONV ));
 
     data->tot_press[2] = data->kin_press -
-                         (( data->int_press[2] + data->ext_press[2] ) /
-                          ( big_box->box_norms[0] * big_box->box_norms[1] * P_CONV ));
+        (( data->int_press[2] + data->ext_press[2] ) /
+         ( big_box->box_norms[0] * big_box->box_norms[1] * P_CONV ));
 
     /* Average pressure for the whole box */
     data->iso_bar.P =
diff --git a/PG-PuReMD/src/tool_box.c b/PG-PuReMD/src/tool_box.c
index 2c2c0a5040f01547fb7e9bddf91ab4c90615eb94..a6f570c8c2b3fdb2a7d1afd789490f8a1f5fcd57 100644
--- a/PG-PuReMD/src/tool_box.c
+++ b/PG-PuReMD/src/tool_box.c
@@ -78,57 +78,6 @@ void SumScanB( int n, int me, int wsize, int root, MPI_Comm comm, int *nbuf )
 }
 
 
-/************** taken from box.c **************/
-void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    int i, j;
-    real tmp;
-
-#if defined(DEBUG)
-    fprintf( stderr, ">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2] );
-#endif
-
-    if ( flag > 0 )
-    {
-        for (i = 0; i < 3; i++)
-        {
-            tmp = 0.0;
-            for (j = 0; j < 3; j++)
-            {
-                tmp += box->trans[i][j] * x1[j];
-            }
-            x2[i] = tmp;
-        }
-    }
-    else
-    {
-        for (i = 0; i < 3; i++)
-        {
-            tmp = 0.0;
-            for (j = 0; j < 3; j++)
-            {
-                tmp += box->trans_inv[i][j] * x1[j];
-            }
-            x2[i] = tmp;
-        }
-    }
-
-#if defined(DEBUG)
-    fprintf( stderr, ">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2] );
-#endif
-}
-
-
-void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
-{
-    Transform( x1, box, flag, x2 );
-
-    x2[0] /= box->box_norms[0];
-    x2[1] /= box->box_norms[1];
-    x2[2] /= box->box_norms[2];
-}
-
-
 /* determine whether point p is inside the box */
 void Fit_to_Periodic_Box( simulation_box *box, rvec *p )
 {
diff --git a/PG-PuReMD/src/tool_box.h b/PG-PuReMD/src/tool_box.h
index a1f55910ff34fe08aceb5a84aa7e270a855d9291..717a8a80e490171e8b6aa4249dd432d5ad7e3da7 100644
--- a/PG-PuReMD/src/tool_box.h
+++ b/PG-PuReMD/src/tool_box.h
@@ -34,9 +34,6 @@ int SumScan( int, int, int, MPI_Comm );
 
 void SumScanB( int, int, int, int, MPI_Comm, int* );
 
-/* from box.h */
-void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
-
 void Fit_to_Periodic_Box( simulation_box*, rvec* );
 
 /* from geo_tools.h */
@@ -223,6 +220,51 @@ static inline real DistSqr_to_Special_Point( rvec cp, rvec x )
 
     return d_sqr;
 }
+
+
+/************** taken from box.c **************/
+CUDA_HOST_DEVICE static inline void Transform( rvec x1,
+        simulation_box *box, char flag, rvec x2 )
+{
+    int i, j;
+    real tmp;
+
+    if ( flag > 0 )
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            tmp = 0.0;
+            for ( j = 0; j < 3; j++ )
+            {
+                tmp += box->trans[i][j] * x1[j];
+            }
+            x2[i] = tmp;
+        }
+    }
+    else
+    {
+        for ( i = 0; i < 3; i++ )
+        {
+            tmp = 0.0;
+            for ( j = 0; j < 3; j++ )
+            {
+                tmp += box->trans_inv[i][j] * x1[j];
+            }
+            x2[i] = tmp;
+        }
+    }
+}
+
+
+CUDA_HOST_DEVICE static inline void Transform_to_UnitBox( rvec x1,
+        simulation_box *box, char flag, rvec x2 )
+{
+    Transform( x1, box, flag, x2 );
+
+    x2[0] /= box->box_norms[0];
+    x2[1] /= box->box_norms[1];
+    x2[2] /= box->box_norms[2];
+}
 #endif
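
Moving Transform and Transform_to_UnitBox into tool_box.h as CUDA_HOST_DEVICE static inline functions makes them usable from both host translation units and device code (assuming CUDA_HOST_DEVICE expands to __host__ __device__ under nvcc, as its name suggests). Functionally, Transform applies the box's trans matrix (flag > 0) or trans_inv matrix (flag <= 0) to a point, and Transform_to_UnitBox then divides by the box edge lengths to yield fractional coordinates. A host-only usage sketch with stand-in types, reduced to the fields the two functions touch:

    #include <stdio.h>

    /* Host-only sketch of Transform / Transform_to_UnitBox with stand-in
     * types; only the simulation_box fields used here are reproduced. */
    typedef double real;
    typedef real rvec[3];
    typedef real rtensor[3][3];

    typedef struct
    {
        rtensor trans;      /* forward transformation          */
        rtensor trans_inv;  /* inverse transformation          */
        rvec box_norms;     /* box edge lengths                */
    } simulation_box;

    static void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
    {
        int i, j;
        real tmp;

        for ( i = 0; i < 3; i++ )
        {
            tmp = 0.0;
            for ( j = 0; j < 3; j++ )
            {
                tmp += (flag > 0 ? box->trans[i][j] : box->trans_inv[i][j]) * x1[j];
            }
            x2[i] = tmp;
        }
    }

    static void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
    {
        Transform( x1, box, flag, x2 );

        x2[0] /= box->box_norms[0];
        x2[1] /= box->box_norms[1];
        x2[2] /= box->box_norms[2];
    }

    int main( void )
    {
        /* orthogonal 10 x 20 x 30 box: identity transforms, edge lengths in box_norms */
        simulation_box box = {
            { {1, 0, 0}, {0, 1, 0}, {0, 0, 1} },
            { {1, 0, 0}, {0, 1, 0}, {0, 0, 1} },
            { 10.0, 20.0, 30.0 }
        };
        rvec x = { 5.0, 5.0, 15.0 }, u;

        Transform_to_UnitBox( x, &box, 1, u );
        printf( "fractional coords: %f %f %f\n", u[0], u[1], u[2] );  /* 0.5 0.25 0.5 */

        return 0;
    }
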