diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda_environment.cu
index 24898ec57ea754e0658d63193b264ea78c406c14..be6101a21af4af5d6f1688ad023a5aaa330263a9 100644
--- a/PG-PuReMD/src/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda_environment.cu
@@ -1,13 +1,15 @@
 #include "cuda_environment.h"
 #include "cuda_utils.h"
-extern "C" void Setup_Cuda_Environment (int rank, int nprocs, int gpus_per_node)
+extern "C" void Setup_Cuda_Environment(int rank, int nprocs, int gpus_per_node)
     int deviceCount;
     cudaError_t flag;
-    flag = cudaGetDeviceCount (&deviceCount);
+    flag = cudaGetDeviceCount(&deviceCount);
     if ( flag != cudaSuccess )
@@ -18,32 +20,21 @@ extern "C" void Setup_Cuda_Environment (int rank, int nprocs, int gpus_per_node)
     //Calculate the # of GPUs per processor
     //and assign the GPU for each process
     //TODO: handle condition where # CPU procs > # GPUs
-    cudaSetDevice ( (rank % (deviceCount)) );
+    cudaSetDevice( (rank % (deviceCount)) );
 #if defined(__CUDA_DEBUG__)
     fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    // CHANGE ORIGINAL/////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    //cudaDeviceSetLimit ( cudaLimitStackSize, 8192 );
-    //cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 );
-    //cudaCheckError ();
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
-    ///////////////////////////////////////////////
+    //cudaDeviceSetLimit( cudaLimitStackSize, 8192 );
+    //cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
+    //cudaCheckError();
-extern "C" void Cleanup_Cuda_Environment ()
+extern "C" void Cleanup_Cuda_Environment()
-    cudaDeviceReset ();
-    cudaDeviceSynchronize ();
+    cudaDeviceSynchronize();  /* wait for outstanding device work before teardown */
+    cudaDeviceReset();        /* destroy the context last; a later runtime call would re-create it */
diff --git a/PG-PuReMD/src/reax_types.h b/PG-PuReMD/src/reax_types.h
index debe20aac39b241c5064a55b80603a3a3886952f..8c353a7ca600b369ad69b2bdabd8c8e883915d21 100644
--- a/PG-PuReMD/src/reax_types.h
+++ b/PG-PuReMD/src/reax_types.h
@@ -22,30 +22,26 @@
 #if !(defined(__REAX_TYPES_H_) || defined(__CUDA_REAX_TYPES_H_))
 #ifdef __CUDACC__
-#ifndef __CUDA_REAX_TYPES_H_
-#define __CUDA_REAX_TYPES_H_
-#define CUDA_HOST __host__
-#define CUDA_DEVICE __device__
-#define CUDA_GLOBAL __global__
-#define CUDA_HOST_DEVICE __host__ __device__
+  #ifndef __CUDA_REAX_TYPES_H_
+    #define __CUDA_REAX_TYPES_H_
+    #define CUDA_HOST __host__
+    #define CUDA_DEVICE __device__
+    #define CUDA_GLOBAL __global__
+    #define CUDA_HOST_DEVICE __host__ __device__
+  #endif
-#ifndef __REAX_TYPES_H_
-#define __REAX_TYPES_H_
-#define CUDA_HOST
-#define CUDA_DEVICE
-#define CUDA_GLOBAL
+  #ifndef __REAX_TYPES_H_
+    #define __REAX_TYPES_H_
+    #define CUDA_HOST
+    #define CUDA_DEVICE
+    #define CUDA_GLOBAL
+    #define CUDA_HOST_DEVICE
+  #endif
 #if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_))
-#define __CONFIG_H_
-#include "config.h"
+  #define __CONFIG_H_
+  #include "config.h"
 #include <ctype.h>
@@ -57,13 +53,13 @@
 #include <sys/time.h>
 #include <time.h>
 #include <zlib.h>
-#define         HOST_SCRATCH_SIZE               (1024 * 1024 * 20)
+#define HOST_SCRATCH_SIZE (1024 * 1024 * 20)
 #ifdef HAVE_CUDA
-#include <cuda.h>
+  #include <cuda.h>
 #if defined(__IBMC__)
-#define inline __inline__
+  #define inline __inline__
 #endif /*IBMC*/
 #define PURE_REAX
diff --git a/PuReMD-GPU/Makefile.am b/PuReMD-GPU/Makefile.am
index d433237070eb800b0142d0f81be5f10b42553035..016114db2dfaf3b9af623069b2f5f3ffc677907a 100644
--- a/PuReMD-GPU/Makefile.am
+++ b/PuReMD-GPU/Makefile.am
@@ -15,32 +15,51 @@ AM_LDFLAGS =
 NVCCFLAGS += -use_fast_math 
 NVCCFLAGS += -gencode arch=compute_35,code=sm_35
 NVCCFLAGS += --compiler-options "$(DEFS) -D__SM_35__ -O3 -funroll-loops -fstrict-aliasing"
+#NVCCFLAGS += -Xcompiler -fPIC -dc
 #NVCCFLAGS += --ptxas-options -v
 bin_PROGRAMS = bin/puremd-gpu
-bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c src/reset_utils.c src/param.c src/pdb_tools.c \
-	src/GMRES.cu src/QEq.cu src/allocate.cu src/bond_orders.cu \
-	src/box.cu src/forces.cu src/four_body_interactions.cu \
-	src/grid.cu src/init_md.cu src/integrate.cu src/list.cu \
-	src/lookup.cu src/neighbors.cu \
-	src/restart.cu src/single_body_interactions.cu \
-	src/system_props.cu src/three_body_interactions.cu \
-	src/traj.cu src/two_body_interactions.cu src/vector.cu \
-	src/testmd.cu \
-	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/reduction.cu \
-	src/center_mass.cu src/helpers.cu src/validation.cu src/matvec.cu
+bin_puremd_gpu_SOURCES = src/analyze.c src/print_utils.c \
+	src/restart.c src/param.c src/pdb_tools.c src/box.c \
+	src/lin_alg.c src/QEq.c src/allocate.c src/bond_orders.c \
+	src/forces.c src/four_body_interactions.c \
+	src/grid.c src/init_md.c src/integrate.c src/list.c \
+	src/lookup.c src/neighbors.c \
+	src/reset_utils.c src/single_body_interactions.c \
+	src/system_props.c src/three_body_interactions.c \
+	src/traj.c src/two_body_interactions.c src/vector.c \
+	src/testmd.c \
+	src/cuda_utils.cu src/cuda_copy.cu src/cuda_init.cu src/cuda_reduction.cu \
+	src/cuda_center_mass.cu src/cuda_box.cu src/validation.cu \
+        src/cuda_allocate.cu src/cuda_bond_orders.cu \
+	src/cuda_lin_alg.cu src/cuda_QEq.cu \
+        src/cuda_forces.cu src/cuda_four_body_interactions.cu \
+	src/cuda_grid.cu src/cuda_init_md.cu src/cuda_integrate.cu src/cuda_list.cu \
+	src/cuda_lookup.cu src/cuda_neighbors.cu \
+	src/cuda_reset_utils.cu src/cuda_single_body_interactions.cu \
+        src/cuda_system_props.cu src/cuda_three_body_interactions.cu \
+	src/cuda_two_body_interactions.cu src/cuda_environment.cu \
+	src/cuda_post_evolve.cu
 include_HEADERS = src/mytypes.h src/analyze.h src/print_utils.h \
-        src/reset_utils.h src/param.h src/pdb_tools.h \
-	src/GMRES.h src/QEq.h src/allocate.h src/bond_orders.h \
-	src/box.h src/forces.h src/four_body_interactions.h \
+        src/restart.h src/param.h src/pdb_tools.h src/box.h \
+	src/lin_alg.h src/QEq.h src/allocate.h src/bond_orders.h \
+	src/forces.h src/four_body_interactions.h \
 	src/grid.h src/init_md.h src/integrate.h src/list.h \
 	src/lookup.h src/neighbors.h \
-	src/restart.h src/single_body_interactions.h \
+	src/reset_utils.h src/single_body_interactions.h \
 	src/system_props.h src/three_body_interactions.h \
 	src/traj.h src/two_body_interactions.h src/vector.h \
-	src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/reduction.h \
-	src/center_mass.h src/helpers.h src/validation.h src/matvec.h
+	src/cuda_utils.h src/cuda_copy.h src/cuda_init.h src/cuda_reduction.h \
+	src/cuda_center_mass.h src/cuda_box.h src/validation.h \
+        src/cuda_allocate.h src/cuda_bond_orders.h \
+	src/cuda_lin_alg.h src/cuda_QEq.h \
+        src/cuda_forces.h src/cuda_four_body_interactions.h \
+	src/cuda_grid.h src/cuda_init_md.h src/cuda_integrate.h src/cuda_list.h \
+	src/cuda_lookup.h src/cuda_neighbors.h \
+	src/cuda_reset_utils.h src/cuda_single_body_interactions.h \
+        src/cuda_system_props.h src/cuda_three_body_interactions.h \
+	src/cuda_two_body_interactions.h src/cuda_environment.h \
+	src/cuda_post_evolve.h
 # dummy source to cause C linking
 nodist_EXTRA_bin_puremd_gpu_SOURCES = src/dummy.c
diff --git a/PuReMD-GPU/configure.ac b/PuReMD-GPU/configure.ac
index c947ed021a0544653895f01c59dce33e4a07a902..30e7e0bff6bc3115193c75d3b2d0dcbe29a5cf4b 100644
--- a/PuReMD-GPU/configure.ac
+++ b/PuReMD-GPU/configure.ac
@@ -42,19 +42,6 @@ AC_SEARCH_LIBS([gzeof], [z])
 AC_SEARCH_LIBS([gzgets], [z])
 AC_SEARCH_LIBS([gzseek], [z])
 AC_SEARCH_LIBS([gzclose, [z]])
-AC_SEARCH_LIBS([cublasCheckError], [cublas])
-AC_SEARCH_LIBS([cublasDnrm2], [cublas])
-AC_SEARCH_LIBS([cublasDaxpy], [cublas])
-AC_SEARCH_LIBS([cublasDscal], [cublas])
-AC_SEARCH_LIBS([cublasDdot], [cublas])
-AC_SEARCH_LIBS([cudaThreadSynchronize], [cuda])
-AC_SEARCH_LIBS([cudaCheckError], [cuda])
-# FIXME: Replace `main' with a function in `-lcudart':
-#AC_CHECK_LIB([cudart], [main])
-AC_SEARCH_LIBS([cusparseCheckError], [cusparse])
-AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
-AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
 # Checks for typedefs, structures, and compiler characteristics.
@@ -78,10 +65,26 @@ then
 AC_DEFINE([HAVE_CUDA], [1], [Define to 1 if you have CUDA support enabled.])
-if test "BUILD_PROF" = "true"
-	NVCCFLAGS+=" --compiler-options ${gprof_flags}"
+AC_SEARCH_LIBS([cublasDnrm2], [cublas])
+AC_SEARCH_LIBS([cublasDaxpy], [cublas])
+AC_SEARCH_LIBS([cublasDscal], [cublas])
+AC_SEARCH_LIBS([cublasDdot], [cublas])
+AC_SEARCH_LIBS([cudaThreadSynchronize], [cudart])
+AC_SEARCH_LIBS([cudaGetLastError], [cudart])
+AC_CHECK_LIB([cudart], [cudaMalloc])
+AC_SEARCH_LIBS([cusparseCreateMatDescr], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse])
+AC_SEARCH_LIBS([cusparseSetMatIndexBase], [cusparse])
+AC_SEARCH_LIBS([cublasDnrm2], [cublas],
+	[CUBLAS_FOUND_LIBS="yes"], [CUBLAS_FOUND_LIBS="no"], [-lcublas])
+AS_IF([test "x${CUBLAS_FOUND_LIBS}" != "xyes"],
+	[AC_MSG_ERROR([Unable to find CUBLAS library.])])
+AC_SEARCH_LIBS([cusparseSetMatType], [cusparse],
+	[CUSPARSE_FOUND_LIBS="yes"], [CUSPARSE_FOUND_LIBS="no"], [-lcusparse])
+AS_IF([test "x${CUSPARSE_FOUND_LIBS}" != "xyes"],
+	[AC_MSG_ERROR([Unable to find CUSPARSE library.])])
 AC_CHECK_TYPES([cublasHandle_t], [], 
 	       [AC_MSG_FAILURE([cublasHandle_t type not found in cublas.h], [1])], [#include<cublas_v2.h>])
@@ -89,10 +92,12 @@ AC_CHECK_TYPES([cusparseHandle_t], [],
 	       [AC_MSG_FAILURE([cusparseHandle_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
 AC_CHECK_TYPES([cusparseMatDescr_t], [], 
 	       [AC_MSG_FAILURE([cusparseMatDescr_t type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#	       [AC_MSG_FAILURE([CUSPARSE_MATRIX_TYPE_GENERAL type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
-#	       [AC_MSG_FAILURE([CUSPARSE_INDEX_BASE_ZERO type not found in cusparse.h], [1])], [#include<cusparse_v2.h>])
+if test "x${BUILD_PROF}" = "xtrue"
+	NVCCFLAGS="${NVCCFLAGS} --compiler-options ${gprof_flags}"
diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu
deleted file mode 100644
index d00100e9ced86d2b1b7a8d1f37b67648576cdca1..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/GMRES.cu
+++ /dev/null
@@ -1,1138 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "GMRES.h"
-#include "list.h"
-#include "vector.h"
-#include "index_utils.h"
-#include "cuda_copy.h"
-#include "cuda_utils.h"
-#include "reduction.h"
-#include "matvec.h"
-#include "system_props.h"
-#include "cublas_v2.h"
-#include "cusparse_v2.h"
-void Sparse_MatVec( sparse_matrix *A, real *x, real *b )
-    int i, j, k, n, si, ei;
-    real H;
-    n = A->n;
-    for( i = 0; i < n; ++i )
-        b[i] = 0;
-    for( i = 0; i < n; ++i ) {
-        si = A->start[i];
-        ei = A->start[i+1]-1;
-        for( k = si; k < ei; ++k ) {
-            j = A->entries[k].j;
-            H = A->entries[k].val;
-            b[j] += H * x[i]; 
-            b[i] += H * x[j];
-        }
-        // the diagonal entry is the last one in
-        b[i] += A->entries[k].val * x[i]; 
-    }
-void Forward_Subs( sparse_matrix *L, real *b, real *y )
-    int i, pj, j, si, ei;
-    real val;
-    for( i = 0; i < L->n; ++i ) {
-        y[i] = b[i];
-        si = L->start[i];
-        ei = L->start[i+1];
-        for( pj = si; pj < ei-1; ++pj ){
-            j = L->entries[pj].j;
-            val = L->entries[pj].val;
-            y[i] -= val * y[j];
-        }
-        y[i] /= L->entries[pj].val;
-    }
-void Backward_Subs( sparse_matrix *U, real *y, real *x )
-    int i, pj, j, si, ei;
-    real val;
-    for( i = U->n-1; i >= 0; --i ) {
-        x[i] = y[i];
-        si = U->start[i];
-        ei = U->start[i+1];
-        for( pj = si+1; pj < ei; ++pj ){
-            j = U->entries[pj].j;
-            val = U->entries[pj].val;
-            x[i] -= val * x[j];
-        }
-        x[i] /= U->entries[si].val;
-    }
-int GMRES( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system* system)
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    N = H->n;
-    bnorm = Norm( b, N );
-    /* apply the diagonal pre-conditioner to rhs */
-    for( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        for( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */    
-        Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
-        Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system)], N );
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
-            /* matvec */
-            Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-            for( k = 0; k < N; ++k )  
-                workspace->v[ index_wkspace_sys (j+1,k,system)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ 
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i <= j; i++ ) {
-                workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)], 
-                        -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N );
-            }
-            workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], 
-                    1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = 0; i <= j; i++ )    {
-                if( i == j ) {
-                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
-                }
-                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ];
-                tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ];
-                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-            } 
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j+1] = tmp2;
-            // fprintf( stderr, "h: " );
-            // for( i = 0; i <= j+1; ++i )
-            //  fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            // fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = workspace->g[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
-        }
-        /* update x = x_0 + Vy */
-        for( i = 0; i < j; i++ )
-            Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N );
-        /* stopping condition */
-        if( fabs(workspace->g[j]) / bnorm <= tol )
-            break;
-    }
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-    //          itr, j, fabs( workspace->g[j] ) / bnorm );
-    // data->timing.matvec += itr * RESTART + j;
-    if( itr >= MAX_ITR ) {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART+1) + j + 1;
-    }
-    return itr * (RESTART+1) + j + 1;
-//Cuda Functions for GMRES implementation
-GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries)
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= entries) return;
-    b_proc [i] = b[i] * Hdia_inv[i];
-GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output)
-    real tmp1, tmp2, cc;
-    for( int i = 0; i <= j; i++ )    {
-        if( i == j ) {
-            cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) );
-            hc[j] = h[ index_wkspace_res (j,j) ] / cc;
-            hs[j] = h[ index_wkspace_res (j+1,j) ] / cc;
-        }
-        tmp1 =  hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ];
-        tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ];
-        h[ index_wkspace_res (i,j) ] = tmp1;
-        h[ index_wkspace_res (i+1,j) ] = tmp2;
-    } 
-    /* apply Givens rotations to the rhs as well */
-    tmp1 =  hc[j] * g_j;
-    tmp2 = -hs[j] * g_j;
-    output[0] = tmp1;
-    output[1] = tmp2;
-GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y)
-    real temp;
-    for( int i = j-1; i >= 0; i-- ) {
-        temp = g[i];      
-        for( int k = j-1; k > i; k-- )
-            temp -= h[ index_wkspace_res (i,k) ] * y[k];
-        y[i] = temp / h[ index_wkspace_res (i,i) ];
-    }
-int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x )
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    real v_add_tmp;
-    sparse_matrix *H = &workspace->H;
-    real t_start, t_elapsed;
-    real *spad = (real *)scratch;
-    real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
-    N = H->n;
-    cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
-    Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Norm of the array is %e \n", bnorm );
-    /* apply the diagonal pre-conditioner to rhs */
-    GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-        (workspace->b_prc, b, workspace->Hdia_inv, N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* calculate r0 */
-        //Sparse_MatVec( H, x, workspace->b_prm );      
-        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
-            (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
-            (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
-        {
-            cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
-            Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-                (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
-        }
-        Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-            ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        /* GMRES inner-loop */
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
-        for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
-            /* matvec */
-            //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-            Cuda_Matvec_csr 
-                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-                (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i <= j; i++ ) {
-                Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-                    (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
-                cudaThreadSynchronize ();
-                cudaCheckError ();
-                Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
-                cudaThreadSynchronize ();
-                cudaCheckError ();
-                copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-                Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
-                    ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-                      -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-                cudaThreadSynchronize ();
-                cudaCheckError ();
-            }
-            //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-            cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
-            Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-                ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-                  1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            GMRES_Givens_Rotation <<<1, 1>>>
-                (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-        }
-        copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
-        copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-        GMRES_BackSubstitution <<<1, 1>>>
-            (j, spad, workspace->h, workspace->y);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        /* update x = x_0 + Vy */
-        for( i = 0; i < j; i++ )
-        {
-            copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
-                ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-        }
-        /* stopping condition */
-        if( fabs(g[j]) / bnorm <= tol )
-            break;
-    }
-    if( itr >= MAX_ITR ) {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        return itr * (RESTART+1) + j + 1;
-    }
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
-    return itr * (RESTART+1) + j + 1;
-int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real tol, real *x )
-    real CSR_ALPHA = 1, CSR_BETA = 0;
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    real v_add_tmp;
-    sparse_matrix *H = &workspace->H;
-    real t_start, t_elapsed;
-    real *spad = (real *)scratch;
-    real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
-    N = H->n;
-    cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
-    /*
-       Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
-       cudaThreadSynchronize ();
-       cudaCheckError ();
-       Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
-       cudaThreadSynchronize ();
-       cudaCheckError ();
-       copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-     */
-    cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm ));
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Norm of the array is %e \n", bnorm );
-    /* apply the diagonal pre-conditioner to rhs */
-    GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-        (workspace->b_prc, b, workspace->Hdia_inv, N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* calculate r0 */
-        //Sparse_MatVec( H, x, workspace->b_prm );      
-        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
-            (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        /*
-           Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
-           (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-           cudaThreadSynchronize ();
-           cudaCheckError ();
-         */
-        cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V);
-        double D_ONE = 1.;
-        double D_MINUS_ONE = -1.;
-        cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
-        cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
-        //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
-        {
-            /*
-               cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
-               Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-               (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-               Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-               copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
-             */
-            cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g ));
-            copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-        }
-        /*
-           Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-           ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
-           cudaThreadSynchronize ();
-           cudaCheckError ();
-         */
-        double D_SCALE = 1.0 / g[0];
-        cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
-        /* GMRES inner-loop */
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
-        for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
-            /* matvec */
-            Cuda_Matvec_csr 
-                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-                (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i <= j; i++ ) {
-                /*
-                   Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-                   (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
-                   cudaThreadSynchronize ();
-                   cudaCheckError ();
-                   Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
-                   cudaThreadSynchronize ();
-                   cudaCheckError ();
-                   copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-                 */
-                cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
-                            &workspace->v[index_wkspace_sys(j+1,0,N)], 1, 
-                            &v_add_tmp));
-                copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-                /*
-                   Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
-                   ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-                   -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-                   cudaThreadSynchronize ();
-                   cudaCheckError ();
-                 */
-                double NEG_V_ADD_TMP = -v_add_tmp;
-                cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
-                            &workspace->v[index_wkspace_sys(j+1,0,N)], 1 ));
-            }
-            //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-            /*
-               cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
-               Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-               Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-               copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-             */
-            cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp ));
-            copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-            /*
-               Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-               ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-               1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-             */
-            double REC_V_ADD_TMP = 1. / v_add_tmp;
-            cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP,  &workspace->v[index_wkspace_sys(j+1,0,N)], 1));
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            GMRES_Givens_Rotation <<<1, 1>>>
-                (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-        }
-        copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-        /* solve Hy = g.
-           H is now upper-triangular, do back-substitution */
-        copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-        GMRES_BackSubstitution <<<1, 1>>>
-            (j, spad, workspace->h, workspace->y);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        /* update x = x_0 + Vy */
-        for( i = 0; i < j; i++ )
-        {
-            /*
-               copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-               Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
-               ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-             */
-            copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
-                        x, 1));
-        }
-        /* stopping condition */
-        if( fabs(g[j]) / bnorm <= tol )
-            break;
-    }
-    if( itr >= MAX_ITR ) {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        return itr * (RESTART+1) + j + 1;
-    }
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
-    return itr * (RESTART+1) + j + 1;
-int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, 
-        real *b, real tol, real *x, FILE *fout, reax_system *system)
-    int  i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    real v[10000], z[RESTART+2][10000], w[RESTART+2];
-    real u[RESTART+2][10000];
-    N = H->n;
-    bnorm = Norm( b, N );
-    /* apply the diagonal pre-conditioner to rhs */
-    for( i = 0; i < N; ++i )
-        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
-    // memset( x, 0, sizeof(real) * N );
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* compute z = r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        for( i = 0; i < N; ++i )
-            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
-        Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
-        Vector_MakeZero( w, RESTART+1 );
-        w[0] = Norm( z[0], N );
-        Vector_Copy( u[0], z[0], N );
-        u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
-        Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
-        w[0]    *= ( u[0][0] < 0.0 ?  1 :-1 );
-        // fprintf( stderr, "\n\n%12.6f\n", w[0] );
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) {
-            /* compute v_j */
-            Vector_Scale( z[j], -2 * u[j][j], u[j], N );
-            z[j][j] += 1.; /* due to e_j */
-            for( i = j-1; i >= 0; --i )
-                Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i );
-            /* matvec */
-            Sparse_MatVec( H, z[j], v );
-            for( k = 0; k < N; ++k )
-                v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
-            for( i = 0; i <= j; ++i )
-                Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i );
-            if( !Vector_isZero( v + (j+1), N - (j+1) ) ) {
-                /* compute the HouseHolder unit vector u_j+1 */
-                for( i = 0; i <= j; ++i )  
-                    u[j+1][i] = 0;
-                Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) );
-                u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) );
-                Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N );
-                /* overwrite v with P_m+1 * v */
-                v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1];
-                Vector_MakeZero( v + (j+2), N - (j+2) );
-                // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
-            }
-            /* prev Givens rots on the upper-Hessenberg matrix to make it U */
-            for( i = 0; i < j; i++ ) {
-                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1];
-                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1];
-                v[i]   = tmp1;
-                v[i+1] = tmp2;
-            }
-            /* apply the new Givens rotation to H and right-hand side */
-            if( fabs(v[j+1]) >= ALMOST_ZERO )    {
-                cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) );
-                workspace->hc[j] = v[j] / cc;
-                workspace->hs[j] = v[j+1] / cc;
-                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1];
-                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1];
-                v[j]   = tmp1;
-                v[j+1] = tmp2;
-                /* Givens rotations to rhs */
-                tmp1 =  workspace->hc[j] * w[j];
-                tmp2 = -workspace->hs[j] * w[j];
-                w[j]   = tmp1;
-                w[j+1] = tmp2;
-            }
-            /* extend R */
-            for( i = 0; i <= j; ++i )
-                workspace->h[ index_wkspace_res (i,j) ] = v[i];
-            // fprintf( stderr, "h:" );
-            // for( i = 0; i <= j+1 ; ++i )
-            // fprintf( stderr, "%.6f ", h[i][j] );
-            // fprintf( stderr, "\n" );
-            // fprintf( stderr, "%12.6f\n", w[j+1] );
-        }
-        /* solve Hy = w.
-           H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = w[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
-        }
-        // fprintf( stderr, "y: " );
-        // for( i = 0; i < RESTART+1; ++i )
-        //   fprintf( stderr, "%8.3f ", workspace->y[i] );
-        /* update x = x_0 + Vy */
-        // memset( z, 0, sizeof(real) * N );
-        // for( i = j-1; i >= 0; i-- )
-        //   {
-        //     Vector_Copy( v, z, N );
-        //     v[i] += workspace->y[i];
-        //    
-        //     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
-        //   }      
-        //
-        // fprintf( stderr, "\nz: " );
-        // for( k = 0; k < N; ++k )
-        // fprintf( stderr, "%6.2f ", z[k] );
-        // fprintf( stderr, "\nx_bef: " );
-        // for( i = 0; i < N; ++i )
-        //   fprintf( stderr, "%6.2f ", x[i] );
-        // Vector_Add( x, 1, z, N );
-        for( i = j-1; i >= 0; i-- )
-            Vector_Add( x, workspace->y[i], z[i], N );
-        // fprintf( stderr, "\nx_aft: " );
-        // for( i = 0; i < N; ++i )
-        //   fprintf( stderr, "%6.2f ", x[i] );
-        /* stopping condition */
-        if( fabs( w[j] ) / bnorm <= tol )
-            break;
-    }
-    // Sparse_MatVec( H, x, workspace->b_prm );
-    // for( i = 0; i < N; ++i )
-    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
-    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-    // for( i = 0; i < N; ++i )
-    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-    // workspace->b_prc[i], workspace->b_prm[i], x[i] );
-    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", 
-    //         itr, j, fabs( workspace->g[j] ) / bnorm );
-    if( itr >= MAX_ITR ) {
-        fprintf( stderr, "GMRES convergence failed\n" );
-        // return -1;
-        return itr * (RESTART+1) + j + 1;
-    }
-    return itr * (RESTART+1) + j + 1;
-int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, 
-        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system )
-    int i, j, k, itr, N;
-    real cc, tmp1, tmp2, temp, bnorm;
-    N = H->n;
-    bnorm = Norm( b, N );
-    /* GMRES outer-loop */
-    for( itr = 0; itr < MAX_ITR; ++itr ) {
-        /* calculate r0 */
-        Sparse_MatVec( H, x, workspace->b_prm );      
-        Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N );
-        Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] );
-        Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] );
-        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system)], N );
-        Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system)], N );
-        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
-        /* GMRES inner-loop */
-        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
-            /* matvec */
-            Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] );
-            Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-            Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-            /* apply modified Gram-Schmidt to orthogonalize the new residual */
-            for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0;
-            //for( i = 0; i <= j; i++ ) {
-            for( i = MAX(j-1,0); i <= j; i++ ) {
-                workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N );
-            }
-            workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system)], N );
-            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], 
-                    1. / workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-            /* Givens rotations on the upper-Hessenberg matrix to make it U */
-            for( i = MAX(j-1,0); i <= j; i++ )    {
-                if( i == j ) {
-                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
-                }
-                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-                    workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ];
-                tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + 
-                    workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ];
-                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-            } 
-            /* apply Givens rotations to the rhs as well */
-            tmp1 =  workspace->hc[j] * workspace->g[j];
-            tmp2 = -workspace->hs[j] * workspace->g[j];
-            workspace->g[j] = tmp1;
-            workspace->g[j+1] = tmp2;
-            //fprintf( stderr, "h: " );
-            //for( i = 0; i <= j+1; ++i )
-            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
-            //fprintf( stderr, "\n" );
-            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-        }
-        /* solve Hy = g: H is now upper-triangular, do back-substitution */
-        for( i = j-1; i >= 0; i-- ) {
-            temp = workspace->g[i];      
-            for( k = j-1; k > i; k-- )
-                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-            workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
-        }
-        /* update x = x_0 + Vy */
-        Vector_MakeZero( workspace->p, N );
-        for( i = 0; i < j; i++ )
-            Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N );
-        //Backward_Subs( U, workspace->p, workspace->p );
-        //Forward_Subs( L, workspace->p, workspace->p );
-        Vector_Add( x, 1., workspace->p, N );
-        /* stopping condition */
-        if( fabs(workspace->g[j]) / bnorm <= tol )
-            break;
-        }
-        // Sparse_MatVec( H, x, workspace->b_prm );
-        // for( i = 0; i < N; ++i )
-        // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-        // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-        // for( i = 0; i < N; ++i )
-        // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-        // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-        // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-        //          itr, j, fabs( workspace->g[j] ) / bnorm );
-        // data->timing.matvec += itr * RESTART + j;
-        if( itr >= MAX_ITR ) {
-            fprintf( stderr, "GMRES convergence failed\n" );
-            // return -1;
-            return itr * (RESTART+1) + j + 1;
-        }
-        return itr * (RESTART+1) + j + 1;
-    }
-    int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
-            sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
-    {
-        int  i, N;
-        real tmp, alpha, beta, b_norm, r_norm;
-        real sig0, sig_old, sig_new;
-        N = A->n;
-        b_norm = Norm( b, N );
-        //fprintf( stderr, "b_norm: %.15e\n", b_norm );
-        Sparse_MatVec( A, x, workspace->q );
-        Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-        r_norm = Norm(workspace->r, N);
-        //Print_Soln( workspace, x, q, b, N );
-        //fprintf( stderr, "res: %.15e\n", r_norm );
-        Forward_Subs( L, workspace->r, workspace->d );
-        Backward_Subs( U, workspace->d, workspace->p );
-        sig_new = Dot( workspace->r, workspace->p, N );
-        sig0 = sig_new;
-        for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) {
-            //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
-            Sparse_MatVec( A, workspace->p, workspace->q );
-            tmp = Dot( workspace->q, workspace->p, N );
-            alpha = sig_new / tmp;
-            Vector_Add( x, alpha, workspace->p, N );
-            //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
-            //     i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp );
-            Vector_Add( workspace->r, -alpha, workspace->q, N );
-            r_norm = Norm(workspace->r, N);
-            //fprintf( stderr, "res: %.15e\n", r_norm );
-            Forward_Subs( L, workspace->r, workspace->d );
-            Backward_Subs( U, workspace->d, workspace->d );
-            sig_old = sig_new;
-            sig_new = Dot( workspace->r, workspace->d, N );
-            beta = sig_new / sig_old;
-            Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N );
-        }
-        //fprintf( fout, "CG took %d iterations\n", i );
-        if( i >= 200 ) {
-            fprintf( stderr, "CG convergence failed!\n" );
-            return i;
-        }
-        return i;
-        }
-        int CG( static_storage *workspace, sparse_matrix *H, 
-                real *b, real tol, real *x, FILE *fout, reax_system *system)
-        {
-            int  i, j, N;
-            real tmp, alpha, beta, b_norm;
-            real sig_old, sig_new, sig0;
-            N = H->n;
-            b_norm = Norm( b, N );
-            //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
-            Sparse_MatVec( H, x, workspace->q );
-            Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-            for( j = 0; j < N; ++j )
-                workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-            sig_new = Dot( workspace->r, workspace->d, N );
-            sig0 = sig_new;
-            //Print_Soln( workspace, x, q, b, N );
-            //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
-            // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
-            //fprintf( stderr, "sig_new: %f\n", sig_new );
-            for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
-                //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
-                Sparse_MatVec( H, workspace->d, workspace->q );
-                tmp = Dot( workspace->d, workspace->q, N );
-                //fprintf( stderr, "tmp: %f\n", tmp );
-                alpha = sig_new / tmp;    
-                Vector_Add( x, alpha, workspace->d, N );
-                //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-                //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-                Vector_Add( workspace->r, -alpha, workspace->q, N );    
-                for( j = 0; j < N; ++j )
-                    workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-                sig_old = sig_new;
-                sig_new = Dot( workspace->r, workspace->p, N );
-                beta = sig_new / sig_old;
-                Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N );
-                //fprintf( stderr, "sig_new: %f\n", sig_new );
-            }
-            fprintf( stderr, "CG took %d iterations\n", i );
-            if( i >= 300 ) {
-                fprintf( stderr, "CG convergence failed!\n" );
-                return i;
-            }
-            return i;
-            }
-            /* Steepest Descent */
-            int SDM( static_storage *workspace, sparse_matrix *H, 
-                    real *b, real tol, real *x, FILE *fout )
-            {
-                int  i, j, N;
-                real tmp, alpha, beta, b_norm;
-                real sig0, sig;
-                N = H->n;
-                b_norm = Norm( b, N );
-                //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
-                Sparse_MatVec( H, x, workspace->q );
-                Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-                for( j = 0; j < N; ++j )
-                    workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-                sig = Dot( workspace->r, workspace->d, N );
-                sig0 = sig;
-                for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
-                    Sparse_MatVec( H, workspace->d, workspace->q );
-                    sig = Dot( workspace->r, workspace->d, N );
-                    tmp = Dot( workspace->d, workspace->q, N );
-                    alpha = sig / tmp;    
-                    Vector_Add( x, alpha, workspace->d, N );
-                    Vector_Add( workspace->r, -alpha, workspace->q, N );
-                    for( j = 0; j < N; ++j )
-                        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-                    //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-                    //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-                }
-                fprintf( stderr, "SDM took %d iterations\n", i );
-                if( i >= 300 ) {
-                    fprintf( stderr, "SDM convergence failed!\n" );
-                    return i;
-                }
-                return i;
-            }
diff --git a/PuReMD-GPU/src/QEq.c b/PuReMD-GPU/src/QEq.c
new file mode 100644
index 0000000000000000000000000000000000000000..8cc638ea90dcc25f86d33f275b162c8e531d82bb
--- /dev/null
+++ b/PuReMD-GPU/src/QEq.c
@@ -0,0 +1,396 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "QEq.h"
+#include "allocate.h"
+#include "lin_alg.h"
+#include "list.h"
+#include "print_utils.h"
+#include "index_utils.h"
+#include "system_props.h"
+#include "sort.h"
+/* qsort comparator for sparse_matrix_entry elements: orders two entries by
+ * their j field.  Returns a negative value when v1's j is smaller, zero when
+ * both are equal and a positive value otherwise. */
+int compare_matrix_entry(const void *v1, const void *v2)
+    /* plain difference of the two j fields; assumes it cannot overflow int
+     * (e.g. non-negative indices) -- TODO confirm */
+    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
+/* Sort the entries of every row of A in place with qsort, using
+ * compare_matrix_entry (ascending j).  Row i occupies
+ * A->entries[ A->start[i] .. A->start[i+1]-1 ]. */
+void Sort_Matrix_Rows( sparse_matrix *A )
+    int i, si, ei;
+    for( i = 0; i < A->n; ++i ) {
+        /* si: first entry of row i, ei: one past its last entry */
+        si = A->start[i];
+        ei = A->start[i+1];
+        qsort( &(A->entries[si]), ei - si, 
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
+    }
+/* Compute a per-row drop tolerance for A.
+ * droptol[i] ends up as dtol times the square root of the sum of squares
+ * accumulated for index i.  Every entry of row i except its last one adds
+ * val*val to both droptol[i] and droptol[j] (j being that entry's j field);
+ * the last entry of the row, treated as the diagonal, adds val*val to
+ * droptol[i] only.
+ * NOTE(review): this presumably relies on A holding one triangle of a
+ * symmetric matrix with the diagonal stored last in each row -- confirm
+ * against the code that builds A.
+ *   A       - sparse matrix, read through A->n, A->start and A->entries
+ *   droptol - output array, written at indices 0 .. A->n-1
+ *   dtol    - scale factor applied to every row norm */
+void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
+    int i, j, k;
+    real val;
+    /* init droptol to 0 */
+    for( i = 0; i < A->n; ++i )
+        droptol[i] = 0;
+    /* calculate square of the norm of each row */
+    for( i = 0; i < A->n; ++i ) {
+        /* all entries of row i except the last one */
+        for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
+            j = A->entries[k].j;
+            val = A->entries[k].val;
+            /* the same value is credited to index i and to index j */
+            droptol[i] += val*val;
+            droptol[j] += val*val;
+        }
+        /* after the loop k indexes the last entry of row i */
+        val = A->entries[k].val; // diagonal entry
+        droptol[i] += val*val;
+    }
+    /* calculate local droptol for each row */
+    //fprintf( stderr, "droptol: " );
+    for( i = 0; i < A->n; ++i ) {
+        //fprintf( stderr, "%f-->", droptol[i] );
+        droptol[i] = SQRT( droptol[i] ) * dtol;
+        //fprintf( stderr, "%f  ", droptol[i] );
+    }
+    //fprintf( stderr, "\n" );
+/* Count the entries of A that pass the drop tolerance test.
+ * For every row i, each entry except the last one of the row is counted
+ * when fabs(val) > droptol[i].  A->n is then added to that count (one slot
+ * per row -- presumably for the diagonal entries, TODO confirm).
+ * Returns the resulting total; Init_MatVec passes it to Allocate_Matrix as
+ * the size of the L and U matrices. */
+int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
+    int i, j, pj;
+    int fillin;
+    real val;
+    fillin = 0;
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i )
+        /* the last entry of the row is skipped by the loop bound */
+        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
+            j = A->entries[pj].j; /* only referenced by the commented-out trace below */
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
+            if( fabs(val) > droptol[i] )
+                ++fillin;
+        }
+    return fillin + A->n;
+/* Incomplete Cholesky factorization of A with thresholding.
+ *
+ * L is built row by row.  For row i every entry of A except the last one in
+ * the row is considered; an entry with fabs(val) > droptol[i] is reduced by
+ * the sparse dot product of the partial row collected so far (tmp) with row
+ * j of L and then divided by the diagonal of row j of L.  The last entry of
+ * the row must be the diagonal (j == i), otherwise the routine aborts; the
+ * diagonal of L is SQRT( a_ii - sum of squares of the partial row ).  The
+ * dropping rule is applied a second time (fabs(val) > droptol[i] / l_ii)
+ * when the partial row is copied into L, while the diagonal is always kept.
+ * Finally U is filled with the transpose of L.
+ *
+ *   A       - input matrix; rows are expected to be sorted by j with the
+ *             diagonal entry last
+ *   droptol - per-row drop tolerances (see Calculate_Droptol)
+ *   L, U    - output factors; their start/entries arrays must already be
+ *             allocated by the caller
+ *
+ * Terminates the program when the work array cannot be allocated, when a
+ * row does not fit into the fixed-size scratch buffer, or when a row of A
+ * does not end with its diagonal entry. */
+void ICHOLT( sparse_matrix *A, real *droptol, 
+        sparse_matrix *L, sparse_matrix *U )
+    sparse_matrix_entry tmp[1000];
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val;
+    int *Utop;
+    /* number of entries the scratch row can hold */
+    const int tmp_cap = (int) (sizeof(tmp) / sizeof(tmp[0]));
+    Utop = (int*) malloc((A->n+1) * sizeof(int));
+    if( Utop == NULL ) {
+        fprintf( stderr, "not enough memory for ICHOLT work array. terminating.\n" );
+        exit(INSUFFICIENT_SPACE);
+    }
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    for( i = 0; i <= A->n; ++i )
+        L->start[i] = U->start[i] = 0;
+    for( i = 0; i < A->n; ++i )
+        Utop[i] = 0;
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i ){
+        L->start[i] = Ltop;
+        tmptop = 0;
+        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
+            j = A->entries[pj].j;
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
+            if( fabs(val) > droptol[i] ){
+                // one slot of tmp has to stay free for the diagonal that is
+                // appended after this loop
+                if( tmptop >= tmp_cap - 1 ) {
+                    fprintf( stderr, "i=%d, row does not fit into ICHOLT scratch space (%d entries)!\n",
+                            i, tmp_cap );
+                    exit(INSUFFICIENT_SPACE);
+                }
+                // sparse dot product of the partial row i (tmp) and row j of L;
+                // both are ordered by column index
+                k1 = 0;
+                k2 = L->start[j];
+                while( k1 < tmptop && k2 < L->start[j+1] ){
+                    if( tmp[k1].j < L->entries[k2].j )
+                        ++k1;
+                    else if( tmp[k1].j > L->entries[k2].j )
+                        ++k2;
+                    else
+                        val -= (tmp[k1++].val * L->entries[k2++].val);
+                }
+                // L matrix is lower triangular, 
+                // so right before the start of next row comes jth diagonal
+                val /= L->entries[L->start[j+1]-1].val;
+                tmp[tmptop].j = j;
+                tmp[tmptop].val = val;
+                ++tmptop;
+            }
+            //fprintf( stderr, " -- done\n" );
+        }
+        // compute the ith diagonal in L
+        // sanity check
+        if( A->entries[pj].j != i ) {
+            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
+            exit(999);
+        }
+        val = A->entries[pj].val;
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            val -= (tmp[k1].val * tmp[k1].val);
+        tmp[tmptop].j = i;
+        tmp[tmptop].val = SQRT(val);
+        // apply the dropping rule once again
+        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
+        //for( k1 = 0; k1<= tmptop; ++k1 )
+        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
+        //fprintf( stderr, "\n" );
+        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
+                L->entries[Ltop].j = tmp[k1].j;
+                L->entries[Ltop].val = tmp[k1].val;
+                // count the entry for column tmp[k1].j of L, i.e. row tmp[k1].j of U
+                U->start[tmp[k1].j+1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        // keep the diagonal in any case
+        // (k1 == tmptop here, so tmp[k1] is the diagonal stored above)
+        L->entries[Ltop].j = tmp[k1].j;
+        L->entries[Ltop].val = tmp[k1].val;
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+    L->start[i] = Ltop;
+    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+    // turn the per-row counts into row offsets of U; the extra +1 per row
+    // accounts for the diagonal, which was not counted above
+    for( i = 1; i <= U->n; ++i )
+        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
+    // scatter L into U: entry (i,j) of L becomes entry (j,i) of U
+    for( i = 0; i < L->n; ++i )
+        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
+            j = L->entries[pj].j;
+            U->entries[Utop[j]].j = i;
+            U->entries[Utop[j]].val = L->entries[pj].val;
+            Utop[j]++;
+        }
+    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+    // the work array is only needed while U is being filled
+    free( Utop );
+/* Prepare the preconditioner and the initial guesses before the linear
+ * solves.
+ *
+ * 1. When control->refactor > 0 and either the number of steps since
+ *    data->prev_steps is a multiple of control->refactor or L has not been
+ *    allocated yet, the rows of workspace->H are sorted, the drop
+ *    tolerances are recomputed and ICHOLT rebuilds workspace->L and
+ *    workspace->U.  L and U are allocated on first use, sized by
+ *    Estimate_LU_Fill; the program exits with INSUFFICIENT_SPACE when
+ *    Allocate_Matrix returns 0.
+ * 2. For every i in 0 .. system->N-1 new values for s and t are
+ *    extrapolated from the stored ones (cubic formula for s, quadratic for
+ *    t), the stored values are shifted by one slot (3->4, 2->3, 1->2, 0->1)
+ *    and the extrapolated value is written to slot 0.
+ *    NOTE(review): slot k is presumably the solution from k steps ago --
+ *    confirm against index_wkspace_sys and the solver callers. */
+void Init_MatVec( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list *far_nbrs )
+    int i, fillin;
+    real s_tmp, t_tmp;
+    //char fname[100];
+    if(control->refactor > 0 && 
+            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL))
+    {
+        //Print_Linear_System( system, control, workspace, data->step );
+        Sort_Matrix_Rows( &workspace->H );
+        //fprintf( stderr, "H matrix sorted\n" );
+        Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
+        //fprintf( stderr, "drop tolerances calculated\n" );
+        /* first factorization: L and U still have to be allocated */
+        if( workspace->L.entries == NULL )
+        {
+            fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
+#ifdef __DEBUG_CUDA__
+            fprintf( stderr, "fillin = %d\n", fillin );
+            /* both factors get far_nbrs->n rows and room for fillin entries */
+            if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
+                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 )
+            {
+                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
+                exit(INSUFFICIENT_SPACE);
+            }
+#if defined(DEBUG_FOCUS)
+            fprintf( stderr, "fillin = %d\n", fillin );
+            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
+                    fillin * sizeof(sparse_matrix_entry) / (1024*1024) );
+        }
+        ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U );
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "icholt-" );
+        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        //Print_Sparse_Matrix2( workspace->L, fname );
+        //Print_Sparse_Matrix( U );
+    }
+    /* extrapolation for s & t */
+    for( i = 0; i < system->N; ++i ) {
+        // no extrapolation
+        //s_tmp = workspace->s[0][i];
+        //t_tmp = workspace->t[0][i];
+        // linear
+        //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
+        //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
+        // quadratic
+        //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
+        /* active choice for t: quadratic, from slots 0, 1 and 2 */
+        t_tmp = workspace->t[index_wkspace_sys(2,i,system->N)] + 3*(workspace->t[index_wkspace_sys(0,i,system->N)]-workspace->t[index_wkspace_sys(1,i,system->N)]);
+        // cubic
+        /* active choice for s: cubic, from slots 0, 1, 2 and 3 */
+        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system->N)] + workspace->s[index_wkspace_sys(2,i,system->N)]) - 
+            (6 * workspace->s[index_wkspace_sys(1,i,system->N)] + workspace->s[index_wkspace_sys(3,i,system->N)] );
+        //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
+        //  (6 * workspace->t[1][i] + workspace->t[3][i] );
+        // 4th order
+        //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
+        //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
+        //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
+        //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
+        /* shift the stored s values by one slot, oldest first so nothing is
+         * overwritten before it has been copied, then store the new guess */
+        workspace->s[index_wkspace_sys(4,i,system->N)] = workspace->s[index_wkspace_sys(3,i,system->N)];
+        workspace->s[index_wkspace_sys(3,i,system->N)] = workspace->s[index_wkspace_sys(2,i,system->N)]; 
+        workspace->s[index_wkspace_sys(2,i,system->N)] = workspace->s[index_wkspace_sys(1,i,system->N)];
+        workspace->s[index_wkspace_sys(1,i,system->N)] = workspace->s[index_wkspace_sys(0,i,system->N)];
+        workspace->s[index_wkspace_sys(0,i,system->N)] = s_tmp;
+        /* same shift for t */
+        workspace->t[index_wkspace_sys(4,i,system->N)] = workspace->t[index_wkspace_sys(3,i,system->N)];
+        workspace->t[index_wkspace_sys(3,i,system->N)] = workspace->t[index_wkspace_sys(2,i,system->N)]; 
+        workspace->t[index_wkspace_sys(2,i,system->N)] = workspace->t[index_wkspace_sys(1,i,system->N)];
+        workspace->t[index_wkspace_sys(1,i,system->N)] = workspace->t[index_wkspace_sys(0,i,system->N)];
+        workspace->t[index_wkspace_sys(0,i,system->N)] = t_tmp;
+    }
+void Calculate_Charges( reax_system *system, static_storage *workspace ) /* Turns the workspace->s and workspace->t entries at index_wkspace_sys(0, i, N) into per-atom charges: q_i = s_i - u * t_i with u = sum(s) / sum(t); writes system->atoms[i].q for i in [0, system->N). */
+    int i;
+    real u, s_sum, t_sum;
+    s_sum = t_sum = 0.;
+    for( i = 0; i < system->N; ++i ) { // accumulate the totals of both vectors over all N entries
+        s_sum += workspace->s[index_wkspace_sys(0,i,system->N)];
+        t_sum += workspace->t[index_wkspace_sys(0,i,system->N)];
+    }
+    u = s_sum / t_sum; // NOTE(review): no guard for t_sum == 0; assuming real is a floating type, u would then be inf/NaN -- confirm callers rule this out
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u ); // NOTE(review): "%13.f" prints t with zero decimals, unlike s and u -- possibly meant "%13.2f"; the matching #endif is not visible in this excerpt
+    for( i = 0; i < system->N; ++i )
+    {
+        system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system->N)] - u * workspace->t[index_wkspace_sys(0,i,system->N)]; // summed over i this gives s_sum - u * t_sum, i.e. zero up to rounding
+    }
+void QEq( reax_system *system, control_params *control, simulation_data *data, 
+        static_storage *workspace, list *far_nbrs, 
+        output_controls *out_control ) /* Driver for one charge solve: Init_MatVec, two GMRES calls on workspace->H (for s and for t), add the returned counts to data->timing.matvecs, then Calculate_Charges. far_nbrs is only forwarded to Init_MatVec. */
+    int matvecs;
+    //real t_start, t_elapsed;
+    //t_start = Get_Time ();
+    Init_MatVec( system, control, data, workspace, far_nbrs ); // runs before either solve; see Init_MatVec for what it prepares
+    //t_elapsed = Get_Timing_Info ( t_start );
+    //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed );
+    //if( data->step % 10 == 0 )
+    //  Print_Linear_System( system, control, workspace, far_nbrs, data->step );
+    //t_start = Get_Time ( );
+    matvecs = GMRES( workspace, &workspace->H, 
+            workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system ); // first solve: b_s with the start of workspace->s; presumably rhs, tolerance and solution buffer -- GMRES signature not visible here, TODO confirm
+    matvecs += GMRES( workspace, &workspace->H, 
+            workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system ); // second solve: same matrix and q_err, with b_t and workspace->t; the disabled alternatives below pass workspace->H and workspace->s[0] without '&' -- TODO confirm they match current signatures before re-enabling
+    //t_elapsed = Get_Timing_Info ( t_start );
+    //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed );
+    //matvecs = GMRES_HouseHolder( workspace, workspace->H, 
+    //    workspace->b_s, control->q_err, workspace->s[0], out_control->log );
+    //matvecs += GMRES_HouseHolder( workspace, workspace->H,  
+    //    workspace->b_t, control->q_err, workspace->t[0], out_control->log );
+    //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err,
+    //  &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system->N)], out_control->log, system );
+    //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err,
+    //  &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system->N)], out_control->log, system );
+    //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, 
+    //      workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
+    //matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, 
+    //     workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
+    //matvecs = CG( workspace, workspace->H, 
+    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
+    //matvecs += CG( workspace, workspace->H, 
+    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
+    //matvecs = SDM( workspace, workspace->H, 
+    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
+    //matvecs += SDM( workspace, workspace->H, 
+    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
+    //fprintf (stderr, " GMRES done with iterations %d \n", matvecs );
+    data->timing.matvecs += matvecs; // adds the sum of both GMRES return values to the running total
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "linsolve-" ); // progress marker on stderr; the matching #endif is not visible in this excerpt
+    Calculate_Charges( system, workspace ); // writes system->atoms[i].q from the s and t vectors just solved
+    //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", 
+    //   data->step, 
+    //   workspace->s[0][0], workspace->t[0][0], 
+    //   workspace->s[0][1], workspace->t[0][1], 
+    //   workspace->s[0][2], workspace->t[0][2] );
+    // if( data->step == control->nsteps )
+    //Print_Charges( system, control, workspace, data->step );
diff --git a/PuReMD-GPU/src/QEq.cu b/PuReMD-GPU/src/QEq.cu
deleted file mode 100644
index 5d849b261b2e8396ec2243f6985f12e335c80430..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/QEq.cu
+++ /dev/null
@@ -1,1073 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "QEq.h"
-#include "allocate.h"
-#include "GMRES.h"
-#include "list.h"
-#include "print_utils.h"
-#include "index_utils.h"
-#include "cuda_utils.h"
-#include "cuda_init.h"
-#include "cuda_copy.h"
-#include "sort.h"
-#include "validation.h"
-#include "reduction.h"
-#include "system_props.h"
-HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) 
-    sparse_matrix_entry temp = array[index1];
-    array[index1] = array[index2];
-    array[index2] = temp;
-HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
-    int i = start;
-    int k = end; 
-    if (end - start >= 1)  
-    {  
-        int pivot = array[start].j;
-        while (k > i) 
-        {  
-            while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++;
-            while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--;
-            if (k > i) swap(array, i, k);
-        }  
-        swap(array, start, k);
-        quick_sort(array, start, k - 1);
-        quick_sort(array, k + 1, end);
-    }  
-int compare_matrix_entry(const void *v1, const void *v2)
-    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
-void Sort_Matrix_Rows( sparse_matrix *A )
-    int i, si, ei;
-    for( i = 0; i < A->n; ++i ) {
-        si = A->start[i];
-        ei = A->start[i+1];
-        qsort( &(A->entries[si]), ei - si, 
-                sizeof(sparse_matrix_entry), compare_matrix_entry );
-    }
-GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
-    int i;
-    int si, ei;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= A.n ) return;
-    si = A.start[i];
-    ei = A.end [i];
-    quick_sort( A.entries + si, 0, ei-si-1 );
-void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
-    int i, j, k;
-    real val;
-    /* init droptol to 0 */
-    for( i = 0; i < A->n; ++i )
-        droptol[i] = 0;
-    /* calculate sqaure of the norm of each row */
-    for( i = 0; i < A->n; ++i ) {
-        for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
-            j = A->entries[k].j;
-            val = A->entries[k].val;
-            droptol[i] += val*val;
-            droptol[j] += val*val;
-        }
-        val = A->entries[k].val; // diagonal entry
-        droptol[i] += val*val;
-    }
-    /* calculate local droptol for each row */
-    //fprintf( stderr, "droptol: " );
-    for( i = 0; i < A->n; ++i ) {
-        //fprintf( stderr, "%f-->", droptol[i] );
-        droptol[i] = SQRT( droptol[i] ) * dtol;
-        //fprintf( stderr, "%f  ", droptol[i] );
-    }
-    //fprintf( stderr, "\n" );
-GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol )
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int k, j, offset, x, diagnol;
-    real val;
-    sparse_matrix *A = &p_A;
-    if ( i < A->n ) {
-        droptol [i] = 0;
-        for (k = A->start[i]; k < A->end[i]; ++k ) {
-            val = A->entries[k].val;
-            droptol [i] += val*val;
-        }
-    }
-    __syncthreads ();
-    if ( i < A->n ) {
-        droptol [i] = SQRT (droptol[i]) * dtol;
-    }
-GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol )
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int k, j, offset, x, diagnol;
-    real val;
-    sparse_matrix *A = &p_A;
-    for (x = 0; x < A->n; x ++)
-    {
-        if (i < (A->end[i]-1 - A->start[i])) {
-            offset = A->start [i] + i;
-            j = A->entries[offset].j;
-            val = A->entries[offset].val;
-            droptol [j] += val * val;
-        }
-        __syncthreads ();
-    }
-GLOBAL void Cuda_Calculate_Droptol_diagnol ( sparse_matrix p_A, real *droptol, real dtol )
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int k, j, offset, x, diagnol;
-    real val;
-    sparse_matrix *A = &p_A;
-    if ( i < A->n ) {
-        //diagnol element
-        diagnol = A->end[i]-1;
-        val = A->entries [diagnol].val;
-        droptol [i] += val*val;
-    }
-    /*calculate local droptol for each row*/
-    if ( i < A->n )
-        droptol [i] = SQRT (droptol[i]) * dtol;
-int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
-    int i, j, pj;
-    int fillin;
-    real val;
-    fillin = 0;
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i )
-        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-            if( fabs(val) > droptol[i] )
-                ++fillin;
-        }
-    return fillin + A->n;
-GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin)
-    int i, j, pj;
-    real val;
-    sparse_matrix *A = &p_A;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= A->n) return;
-    fillin [i] = 0;
-    for (pj = A->start[i]; pj < A->end[i]-1; ++pj)
-    {
-        j = A->entries [pj].j;
-        val = A->entries[pj].val;
-        if (fabs (val) > droptol [i]) ++fillin [i];
-    }
-void ICHOLT( sparse_matrix *A, real *droptol, 
-        sparse_matrix *L, sparse_matrix *U )
-    sparse_matrix_entry tmp[1000];
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val;
-    int *Utop;
-    Utop = (int*) malloc((A->n+1) * sizeof(int));
-    // clear variables
-    Ltop = 0;
-    tmptop = 0;
-    for( i = 0; i <= A->n; ++i )
-        L->start[i] = U->start[i] = 0;
-    for( i = 0; i < A->n; ++i )
-        Utop[i] = 0;
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i ){
-        L->start[i] = Ltop;
-        tmptop = 0;
-        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-            if( fabs(val) > droptol[i] ){
-                k1 = 0;
-                k2 = L->start[j];
-                while( k1 < tmptop && k2 < L->start[j+1] ){
-                    if( tmp[k1].j < L->entries[k2].j )
-                        ++k1;
-                    else if( tmp[k1].j > L->entries[k2].j )
-                        ++k2;
-                    else
-                        val -= (tmp[k1++].val * L->entries[k2++].val);
-                }
-                // L matrix is lower triangular, 
-                // so right before the start of next row comes jth diagonal
-                val /= L->entries[L->start[j+1]-1].val;
-                tmp[tmptop].j = j;
-                tmp[tmptop].val = val;
-                ++tmptop;
-            }
-            //fprintf( stderr, " -- done\n" );
-        }
-        // compute the ith diagonal in L
-        // sanity check
-        if( A->entries[pj].j != i ) {
-            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
-            exit(999);
-        }
-        val = A->entries[pj].val;
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            val -= (tmp[k1].val * tmp[k1].val);
-        tmp[tmptop].j = i;
-        tmp[tmptop].val = SQRT(val);
-        // apply the dropping rule once again
-        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-        //for( k1 = 0; k1<= tmptop; ++k1 )
-        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-        //fprintf( stderr, "\n" );
-        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
-                L->entries[Ltop].j = tmp[k1].j;
-                L->entries[Ltop].val = tmp[k1].val;
-                U->start[tmp[k1].j+1]++;
-                ++Ltop;
-                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-            }
-        // keep the diagonal in any case
-        L->entries[Ltop].j = tmp[k1].j;
-        L->entries[Ltop].val = tmp[k1].val;
-        ++Ltop;
-        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-    }
-    L->start[i] = Ltop;
-    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-    for( i = 1; i <= U->n; ++i )
-        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-    for( i = 0; i < L->n; ++i )
-        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-            j = L->entries[pj].j;
-            U->entries[Utop[j]].j = i;
-            U->entries[Utop[j]].val = L->entries[pj].val;
-            Utop[j]++;
-        }
-    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-void Cuda_ICHOLT( sparse_matrix *A, real *droptol, 
-        sparse_matrix *L, sparse_matrix *U )
-    sparse_matrix_entry tmp[1000];
-    int i, j, pj, k1, k2, tmptop, Ltop;
-    real val;
-    int *Utop;
-    Utop = (int*) malloc((A->n+1) * sizeof(int));
-    // clear variables
-    Ltop = 0;
-    tmptop = 0;
-    for( i = 0; i <= A->n; ++i )
-        L->start[i] = U->start[i] = 0;
-    for( i = 0; i < A->n; ++i )
-        Utop[i] = 0;
-    //fprintf( stderr, "n: %d\n", A->n );
-    for( i = 0; i < A->n; ++i ){
-        L->start[i] = Ltop;
-        tmptop = 0;
-        for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){
-            j = A->entries[pj].j;
-            val = A->entries[pj].val;
-            //fprintf( stderr, "i: %d, j: %d", i, j );
-            //CHANGE ORIGINAL
-            if (j >= i) break;
-            //CHANGE ORIGINAL
-            if( fabs(val) > droptol[i] ){
-                k1 = 0;
-                k2 = L->start[j];
-                while( k1 < tmptop && k2 < L->start[j+1] ){
-                    if( tmp[k1].j < L->entries[k2].j )
-                        ++k1;
-                    else if( tmp[k1].j > L->entries[k2].j )
-                        ++k2;
-                    else
-                        val -= (tmp[k1++].val * L->entries[k2++].val);
-                }
-                // L matrix is lower triangular, 
-                // so right before the start of next row comes jth diagonal
-                val /= L->entries[L->start[j+1]-1].val;
-                tmp[tmptop].j = j;
-                tmp[tmptop].val = val;
-                ++tmptop;
-            }
-            //fprintf( stderr, " -- done\n" );
-        }
-        // compute the ith diagonal in L
-        // sanity check
-        if( A->entries[pj].j != i ) {
-            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
-            exit(999);
-        }
-        val = A->entries[pj].val;
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            val -= (tmp[k1].val * tmp[k1].val);
-        tmp[tmptop].j = i;
-        tmp[tmptop].val = SQRT(val);
-        // apply the dropping rule once again
-        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-        //for( k1 = 0; k1<= tmptop; ++k1 )
-        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-        //fprintf( stderr, "\n" );
-        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-        for( k1 = 0; k1 < tmptop; ++k1 )
-            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
-                L->entries[Ltop].j = tmp[k1].j;
-                L->entries[Ltop].val = tmp[k1].val;
-                U->start[tmp[k1].j+1]++;
-                ++Ltop;
-                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-            }
-        // keep the diagonal in any case
-        L->entries[Ltop].j = tmp[k1].j;
-        L->entries[Ltop].val = tmp[k1].val;
-        ++Ltop;
-        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-    }
-    L->start[i] = Ltop;
-    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-    for( i = 1; i <= U->n; ++i )
-        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-    for( i = 0; i < L->n; ++i )
-        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-            j = L->entries[pj].j;
-            U->entries[Utop[j]].j = i;
-            U->entries[Utop[j]].val = L->entries[pj].val;
-            Utop[j]++;
-        }
-    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
-//Parallel for each row
-//Each kernel will run for 6540 number of times.
-GLOBAL void Cuda_ICHOLT( reax_system *system, sparse_matrix p_A, real *droptol, 
-sparse_matrix p_L, sparse_matrix p_U )
-int start, end, count;
-real tempvalue, val;
-int i,pj,tmptop, offset;
-int j, k1, k2;
-sparse_matrix *A, *L, *U;
-sparse_matrix_entry *tmp;
-A = &p_A;
-L = &p_L;
-U = &p_U;
-real *null_val;
-null_val = 0;
-extern __shared__ real tmp_val[];
-extern __shared__ sparse_matrix_entry sh_tmp[];
-int kid = blockIdx.x * blockDim.x + threadIdx.x;
-tmp = (sparse_matrix_entry *) (tmp_val + blockDim.x);
-offset = 0;
-for( i = 0; i < 10; ++i )
-//if (kid == 0) L->start[i] = i * system->max_sparse_matrix_entries;
-if (kid == 0) L->start[i] = offset;
-tmptop = 0;
-start = A->start[i];
-end = A->end[i]-1; //inclusive
-count = end - start; //inclusive
-tmp_val [kid] = 0;
-if (kid < count) //diagnol not included
-pj = start + kid;
-j = A->entries[pj].j;
-val = A->entries[pj].val;
-if( fabs(val) > droptol[i] )
-k1 = 0;
-k2 = L->start[j];
-while( k1 < tmptop && k2 < L->end[j] ){
-if( tmp[k1].j < L->entries[k2].j )
-else if( tmp[k1].j > L->entries[k2].j )
-tmp_val[kid] = (tmp[k1++].val * L->entries[k2++].val);
-//here read the shared memory of all the kernels 
-if (kid == 0)
-for (i = 0; i < count; i++)
-tempvalue += tmp_val [i];
-val -= tempvalue;
-// L matrix is lower triangular, 
-// so right before the start of next row comes jth diagonal
-val /= L->entries[L->end[j]-1].val;
-tmp[tmptop].j = j;
-tmp[tmptop].val = val;
-__syncthreads ();
-// compute the ith diagonal in L
-// sanity check
-if (kid == 0) 
-    if( A->entries[end].j != i ) {
-        //intentional core dump here for sanity sake
-        *null_val = 1;
-    }
-//diagnol element
-//val = A->entries[pj].val;
-//for( k1 = 0; k1 < tmptop; ++k1 )
-if (kid < count) 
-    tmp_val[kid] = (tmp[kid].val * tmp[kid].val);
-    __syncthreads ();
-if (kid == 0)
-    val = A->entries [end].val;
-    for (i = 0; i < count; i++)
-        tempvalue += tmp_val [i];
-    val -= tempvalue;
-    tmp[tmptop].j = i;
-    tmp[tmptop].val = SQRT(val);
-__syncthreads ();
-//Fill in the LU entries
-//for( k1 = 0; k1 < count; ++k1 )
-if (kid < count )
-    if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){
-        L->entries[offset + kid].j = tmp[kid].j;
-        L->entries[offset + kid].val = tmp[kid].val;
-        U->start[tmp[kid].j+1]++;
-    }
-__syncthreads ();
-if (kid == 0) {
-    // keep the diagonal in any case
-    offset += count;
-    L->entries[offset].j = tmp[count].j;
-    L->entries[offset].val = tmp[count].val;
-    ++offset;
-    L->end [i] = offset;
-__syncthreads ();
-} // end of main for loop
-void Cuda_Fill_U    ( sparse_matrix *A, real *droptol, 
-        sparse_matrix *L, sparse_matrix *U )
-    int i, pj, j;
-    for( i = 1; i <= U->n; ++i )
-        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-    for( i = 0; i < L->n; ++i )
-        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-            j = L->entries[pj].j;
-            U->entries[Utop[j]].j = i;
-            U->entries[Utop[j]].val = L->entries[pj].val;
-            Utop[j]++;
-        }
-void Init_MatVec( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list *far_nbrs )
-    int i, fillin;
-    real s_tmp, t_tmp;
-    //char fname[100];
-    if(control->refactor > 0 && 
-            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){
-        //Print_Linear_System( system, control, workspace, data->step );
-        Sort_Matrix_Rows( &workspace->H );
-        //fprintf( stderr, "H matrix sorted\n" );
-        Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
-        //fprintf( stderr, "drop tolerances calculated\n" );
-        if( workspace->L.entries == NULL ) {
-            fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
-#ifdef __DEBUG_CUDA__
-            fprintf( stderr, "fillin = %d\n", fillin );
-            if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
-                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){
-                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-                exit(INSUFFICIENT_SPACE);
-            }
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "fillin = %d\n", fillin );
-            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-                    fillin * sizeof(sparse_matrix_entry) / (1024*1024) );
-        }
-        ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U );
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "icholt-" );
-        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-        //Print_Sparse_Matrix2( workspace->L, fname );
-        //Print_Sparse_Matrix( U );
-    }
-    /* extrapolation for s & t */
-    for( i = 0; i < system->N; ++i ) {
-        // no extrapolation
-        //s_tmp = workspace->s[0][i];
-        //t_tmp = workspace->t[0][i];
-        // linear
-        //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-        //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
-        // quadratic
-        //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-        t_tmp = workspace->t[index_wkspace_sys(2,i,system)] + 3*(workspace->t[index_wkspace_sys(0,i,system)]-workspace->t[index_wkspace_sys(1,i,system)]);
-        // cubic
-        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system)] + workspace->s[index_wkspace_sys(2,i,system)]) - 
-            (6 * workspace->s[index_wkspace_sys(1,i,system)] + workspace->s[index_wkspace_sys(3,i,system)] );
-        //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
-        //  (6 * workspace->t[1][i] + workspace->t[3][i] );
-        // 4th order
-        //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
-        //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-        //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
-        //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-        workspace->s[index_wkspace_sys(4,i,system)] = workspace->s[index_wkspace_sys(3,i,system)];
-        workspace->s[index_wkspace_sys(3,i,system)] = workspace->s[index_wkspace_sys(2,i,system)]; 
-        workspace->s[index_wkspace_sys(2,i,system)] = workspace->s[index_wkspace_sys(1,i,system)];
-        workspace->s[index_wkspace_sys(1,i,system)] = workspace->s[index_wkspace_sys(0,i,system)];
-        workspace->s[index_wkspace_sys(0,i,system)] = s_tmp;
-        workspace->t[index_wkspace_sys(4,i,system)] = workspace->t[index_wkspace_sys(3,i,system)];
-        workspace->t[index_wkspace_sys(3,i,system)] = workspace->t[index_wkspace_sys(2,i,system)]; 
-        workspace->t[index_wkspace_sys(2,i,system)] = workspace->t[index_wkspace_sys(1,i,system)];
-        workspace->t[index_wkspace_sys(1,i,system)] = workspace->t[index_wkspace_sys(0,i,system)];
-        workspace->t[index_wkspace_sys(0,i,system)] = t_tmp;
-    }
-void Cuda_Init_MatVec(     reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list *far_nbrs )
-    int i, fillin;
-    real s_tmp, t_tmp;
-    int *spad = (int *)scratch;
-    real start = 0, end = 0;
-    if(control->refactor > 0 && 
-            ((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){
-        Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
-            ( dev_workspace->H );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Sorting done... \n");
-        Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>>
-            ( dev_workspace->H, dev_workspace->droptol, control->droptol );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Droptol done... \n");
-        if( dev_workspace->L.entries == NULL ) {
-            cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
-            Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
-                ( dev_workspace->H, dev_workspace->droptol, spad );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            //Reduction for fill in 
-            Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
-                (spad, spad + system->N,  system->N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
-                (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); 
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
-            fillin += dev_workspace->H.n;
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin );
-            dev_workspace->L.n = far_nbrs->n;
-            dev_workspace->L.m = fillin;
-            Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n );
-            dev_workspace->U.n = far_nbrs->n;
-            dev_workspace->U.m = fillin;
-            Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n );
-        }
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "LU matrix done...\n");
-        //TODO -- This is the ILU Factorization of the H Matrix. 
-        //This is present in the CUDA 5.0 compilation which is not working currently. 
-        //Fix this when CUDA 5.0 is correctly setup. 
-        //TODO
-        //shared memory is per block
-        // here we have only one block - 
-        /*
-           fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries );
-           Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, 
-           system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE)   >>>
-           ( system, dev_workspace->H, 
-           dev_workspace->droptol, 
-           dev_workspace->L, 
-           dev_workspace->U );
-           cudaThreadSynchronize ();
-           fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ());
-         */
-        //1. copy the H matrix from device to host
-        //2. Allocate the L/U matrices on the host and device. 
-        //3. Compute the L/U on the host
-        //4. copy the results to the device
-        //5. Continue the computation.
-        sparse_matrix t_H, t_L, t_U;
-        real *t_droptol;
-        t_droptol = (real *) malloc (REAL_SIZE * system->N);
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
-        start = Get_Time ();
-        if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);}
-        if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); }
-        if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); }
-        copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-        copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
-        copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
-        //fprintf (stderr, " Done copying LUH .. \n");
-        Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U);
-        Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice);
-        end += Get_Timing_Info (start);
-        /*
-           fprintf (stderr, "Done syncing .... \n");
-           free (t_droptol);
-           fprintf (stderr, "Freed droptol ... \n");
-           Deallocate_Matrix (&t_H);
-           fprintf (stderr, "Freed H ... \n");
-           Deallocate_Matrix (&t_L);
-           fprintf (stderr, "Freed l ... \n");
-           Deallocate_Matrix (&t_U);
-           fprintf (stderr, "Freed u ... \n");
-         */
-        //#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end);
-        //#endif
-        //#ifdef __BUILD_DEBUG__
-        //        validate_lu (workspace);
-        //#endif
-    }
-GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
-    static_storage *workspace = &p_workspace;
-    real s_tmp, t_tmp;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    // no extrapolation
-    //s_tmp = workspace->s[0][i];
-    //t_tmp = workspace->t[0][i];
-    // linear
-    //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-    //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
-    // quadratic
-    //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-    t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]);
-    // cubic
-    s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - 
-        (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] );
-    //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
-    //  (6 * workspace->t[1][i] + workspace->t[3][i] );
-    // 4th order
-    //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
-    //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-    //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
-    //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-    workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)];
-    workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; 
-    workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)];
-    workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)];
-    workspace->s[index_wkspace_sys(0,i,N)] = s_tmp;
-    workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)];
-    workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; 
-    workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)];
-    workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)];
-    workspace->t[index_wkspace_sys(0,i,N)] = t_tmp;
-void Calculate_Charges( reax_system *system, static_storage *workspace )
-    int i;
-    real u, s_sum, t_sum;
-    s_sum = t_sum = 0.;
-    for( i = 0; i < system->N; ++i ) {
-        s_sum += workspace->s[index_wkspace_sys(0,i,system)];
-        t_sum += workspace->t[index_wkspace_sys(0,i,system)];
-    }
-    u = s_sum / t_sum;
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
-    for( i = 0; i < system->N; ++i )
-        system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)];
-GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N)
-    int i = blockIdx.x*blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)];
-void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
-    real *spad = (real *) scratch;
-    real u, s_sum, t_sum;
-    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
-    //s_sum 
-    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
-        (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-    //t_sum
-    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
-    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
-        (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-    //fraction here
-    u = s_sum / t_sum;
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
-    Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>>
-        ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-void QEq( reax_system *system, control_params *control, simulation_data *data, 
-        static_storage *workspace, list *far_nbrs, 
-        output_controls *out_control )
-    int matvecs;
-    //real t_start, t_elapsed;
-    //t_start = Get_Time ();
-    Init_MatVec( system, control, data, workspace, far_nbrs );
-    //t_elapsed = Get_Timing_Info ( t_start );
-    //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed );
-    //if( data->step % 10 == 0 )
-    //  Print_Linear_System( system, control, workspace, far_nbrs, data->step );
-    //t_start = Get_Time ( );
-    matvecs = GMRES( workspace, &workspace->H, 
-            workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system );
-    matvecs += GMRES( workspace, &workspace->H, 
-            workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system );
-    //t_elapsed = Get_Timing_Info ( t_start );
-    //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed );
-    //matvecs = GMRES_HouseHolder( workspace, workspace->H, 
-    //    workspace->b_s, control->q_err, workspace->s[0], out_control->log );
-    //matvecs += GMRES_HouseHolder( workspace, workspace->H,  
-    //    workspace->b_t, control->q_err, workspace->t[0], out_control->log );
-    //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err,
-    //  &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system)], out_control->log, system );
-    //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err,
-    //  &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system)], out_control->log, system );
-    //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, 
-    //      workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
-    ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, 
-    //     workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
-    //matvecs = CG( workspace, workspace->H, 
-    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-    //matvecs += CG( workspace, workspace->H, 
-    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
-    //matvecs = SDM( workspace, workspace->H, 
-    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-    //matvecs += SDM( workspace, workspace->H, 
-    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
-    //fprintf (stderr, " GMRES done with iterations %d \n", matvecs );
-    data->timing.matvecs += matvecs;
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "linsolve-" );
-    Calculate_Charges( system, workspace );
-    //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", 
-    //   data->step, 
-    //   workspace->s[0][0], workspace->t[0][0], 
-    //   workspace->s[0][1], workspace->t[0][1], 
-    //   workspace->s[0][2], workspace->t[0][2] );
-    // if( data->step == control->nsteps )
-    //Print_Charges( system, control, workspace, data->step );
-void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, 
-        static_storage *workspace, list *far_nbrs, 
-        output_controls *out_control )
-    int matvecs = 0;
-    real t_start, t_elapsed;
-#ifdef __DEBUG_CUDA__
-    t_start = Get_Time ();
-    /*
-    //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs );
-    Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
-    ( dev_workspace->H );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    t_elapsed = Get_Timing_Info (t_start);
-    fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed);
-     */
-    Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>>
-        (*dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-    t_elapsed = Get_Timing_Info (t_start);
-    fprintf (stderr, "Done with post processing of init_matvec --> %d  with time ---> %f \n", cudaGetLastError (), t_elapsed);
-    //Here goes the GMRES part of the program ()
-    //#ifdef __DEBUG_CUDA__
-    t_start = Get_Time ();
-    //#endif
-    //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
-    //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
-    matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
-    matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
-    d_timing.matvecs += matvecs;
-#ifdef __DEBUG_CUDA__
-    t_elapsed = Get_Timing_Info ( t_start );
-    fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
-    //Here cuda calculate charges
-    Cuda_Calculate_Charges (system, workspace);
diff --git a/PuReMD-GPU/src/QEq.h b/PuReMD-GPU/src/QEq.h
index 27eceb9764b3a412728a13f37f6735f086afa546..31dfbf61ba05ec79d32313c3ab648eb259f183f2 100644
--- a/PuReMD-GPU/src/QEq.h
+++ b/PuReMD-GPU/src/QEq.h
@@ -23,10 +23,39 @@
 #include "mytypes.h"
 void QEq( reax_system*, control_params*, simulation_data*, static_storage*,
-          list*, output_controls* );
+        list*, output_controls* );
+/* Exchanges, in place, the two entries of array found at positions index1
+ * and index2.  Passing the same position twice leaves the array unchanged. */
+static inline HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) 
+    sparse_matrix_entry *first = &array[index1];
+    sparse_matrix_entry *second = &array[index2];
+    sparse_matrix_entry saved = *first;
+    *first = *second;
+    *second = saved;
+/* Recursive in-place quicksort of array[start..end] (both bounds inclusive),
+ * ordering the entries by non-decreasing value of their j field.  The j value
+ * of the first entry of the range is used as the pivot.  Ranges holding fewer
+ * than two entries are left untouched. */
+static inline HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
+    int lo, hi, pivot_j;
+
+    /* zero or one entry: already sorted */
+    if (end - start < 1)
+        return;
+
+    lo = start;
+    hi = end;
+    pivot_j = array[start].j;
+
+    while (hi > lo)
+    {
+        /* walk lo upward over entries whose j does not exceed the pivot */
+        while ((array[lo].j <= pivot_j) && (lo <= end) && (hi > lo))
+            ++lo;
+        /* walk hi downward over entries whose j exceeds the pivot */
+        while ((array[hi].j > pivot_j) && (hi >= start) && (hi >= lo))
+            --hi;
+        /* the two cursors have not crossed: the pair is out of order */
+        if (hi > lo)
+            swap(array, lo, hi);
+    }
+
+    /* drop the pivot entry into its final slot, then sort both sides */
+    swap(array, start, hi);
+    quick_sort(array, start, hi - 1);
+    quick_sort(array, hi + 1, end);
-void Cuda_QEq( reax_system*, control_params*, simulation_data*, static_storage*,
-               list*, output_controls* );
diff --git a/PuReMD-GPU/src/allocate.c b/PuReMD-GPU/src/allocate.c
new file mode 100644
index 0000000000000000000000000000000000000000..65f0eb2a872673259d508f17fc0da43530a7426f
--- /dev/null
+++ b/PuReMD-GPU/src/allocate.c
@@ -0,0 +1,281 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "allocate.h"
+#include "list.h"
+/* Throws away the current far-neighbor list and creates a new one by passing
+ * n and num_intrs to Make_List with type TYP_FAR_NEIGHBOR.
+ * Terminates the program with INIT_ERR when Make_List reports failure. */
+void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
+    Delete_List( far_nbrs );
+    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
+    {
+        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+        exit( INIT_ERR );
+    }
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
+            num_intrs, far_nbrs->num_intrs );  
+    /* the byte count is a size_t expression; cast it so that the argument
+     * really has the type %ld expects */
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
+            (long) (num_intrs * sizeof(far_neighbor_data) / (1024*1024)) );
+/* Allocates the host arrays of sparse matrix H and records its dimensions.
+ *   H->start, H->end : n + 1 ints each
+ *   H->entries       : m sparse_matrix_entry elements
+ * Returns 1 on success.  Returns 0 when any allocation fails; arrays that
+ * were already obtained are released and their pointers reset to NULL. */
+HOST int Allocate_Matrix( sparse_matrix *H, int n, int m )
+    H->n = n;
+    H->m = m;
+    /* (n + 1) must be parenthesized: `sizeof(int) * n+1` requests room for
+     * n ints plus a single byte, one int short of what is indexed */
+    if( (H->start = (int*) malloc(sizeof(int) * (n + 1))) == NULL )
+        return 0;
+    if( (H->end = (int*) malloc(sizeof(int) * (n + 1))) == NULL )
+    {
+        free( H->start );
+        H->start = NULL;
+        return 0;
+    }
+    if( (H->entries = 
+                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
+    {
+        free( H->start );
+        free( H->end );
+        H->start = NULL;
+        H->end = NULL;
+        return 0;
+    }
+    return 1;
+/* Releases the start, entries and end arrays of H.
+ * Each pointer is reset to NULL afterwards so that calling this function
+ * again on the same matrix (before it is re-allocated) is harmless:
+ * free(NULL) is a no-op, whereas freeing a stale pointer is a double free. */
+void Deallocate_Matrix( sparse_matrix *H )
+    free(H->start);
+    H->start = NULL;
+    free(H->entries);
+    H->entries = NULL;
+    free(H->end);
+    H->end = NULL;
+/* Releases the arrays of H and allocates new ones through
+ * Allocate_Matrix( H, n, m ).  name only appears in the messages.
+ * Terminates the program (exit code 1) when the allocation fails;
+ * otherwise returns 1. */
+int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
+    Deallocate_Matrix( H );
+    if( !Allocate_Matrix( H, n, m ) ) {
+        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
+        exit( 1 );
+    }
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
+            name, n, m );
+    /* the byte count is a size_t expression; cast it so that the argument
+     * really has the type %ld expects */
+    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
+            name, (long) (m * sizeof(sparse_matrix_entry) / (1024*1024)) );
+    return 1;
+/* Creates the hbonds list and initializes its per-slot start/end indices.
+ *
+ * hb_top is modified in place: it is turned into a running (inclusive)
+ * prefix sum, so that hb_top[n-1] becomes the total number of hbond entries
+ * passed to Make_List together with num_h.
+ * For every i in [0, n):
+ *   h_index[i] == 0 : slot 0 starts (and ends) at offset 0
+ *   h_index[i] >  0 : slot h_index[i] starts (and ends) at hb_top[i-1]
+ *   h_index[i] <  0 : nothing is set for this i
+ * NOTE(review): the h_index[i] > 0 branch reads hb_top[i-1]; this assumes a
+ * positive h_index never occurs at i == 0 -- confirm against the caller.
+ *
+ * Terminates the program with INIT_ERR if Make_List fails; returns 1
+ * otherwise. */
+int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
+        list *hbonds )
+    int i, num_hbonds;
+    num_hbonds = 0;
+    /* find starting indexes for each H and the total number of hbonds */
+    for( i = 1; i < n; ++i )
+        hb_top[i] += hb_top[i-1];
+    /* with n <= 0 there is no last element to read; the total stays 0 */
+    if( n > 0 )
+        num_hbonds = hb_top[n-1];
+    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) )
+    {
+        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+    for( i = 0; i < n; ++i )
+        if( h_index[i] == 0 ){
+            Set_Start_Index( 0, 0, hbonds ); 
+            Set_End_Index( 0, 0, hbonds ); 
+        }
+        else if( h_index[i] > 0 ){
+            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
+            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
+        }
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
+    /* the byte count is a size_t expression; cast it so that the argument
+     * really has the type %ld expects */
+    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
+            (long) (num_hbonds * sizeof(hbond_data) / (1024*1024)) );
+    return 1;
+/* Rebuilds the hbonds list with more room per slot.
+ *
+ * A zero-initialized scratch array of n ints is filled, for every i with
+ * h_index[i] >= 0, with
+ *     MAX( Num_Entries(h_index[i], hbonds) * SAFE_HBONDS, MIN_HBONDS )
+ * (all other positions stay 0).  The old list is then deleted, a new one is
+ * built by Allocate_HBond_List from those counts, and the scratch array is
+ * freed.  Terminates the program with INIT_ERR if the scratch array cannot
+ * be allocated.  Returns 1. */
+int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
+    int i;
+    int *hb_top;
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating hbonds\n" );
+    hb_top = (int *)calloc( n, sizeof(int) );
+    /* a NULL result for n > 0 is an allocation failure; writing through it
+     * below would be a NULL dereference */
+    if( hb_top == NULL && n > 0 )
+    {
+        fprintf( stderr, "not enough space for hb_top. terminating!\n" );
+        exit( INIT_ERR );
+    }
+    for( i = 0; i < n; ++i )
+        if( h_index[i] >= 0 )
+            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
+    Delete_List( hbonds );
+    Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
+    free( hb_top );
+    return 1;
+/* Creates the bonds list and initializes its per-atom start/end indices.
+ *
+ * bond_top is modified in place: it is turned into a running (inclusive)
+ * prefix sum, so that bond_top[n-1] becomes the total number of bond entries
+ * passed to Make_List together with n.  Slot 0 starts (and ends) at offset 0
+ * and every slot i >= 1 starts (and ends) at bond_top[i-1].
+ *
+ * Terminates the program with INIT_ERR if Make_List fails; returns 1
+ * otherwise. */
+int Allocate_Bond_List( int n, int *bond_top, list *bonds )
+    int i, num_bonds;
+    num_bonds = 0;
+    /* find starting indexes for each atom and the total number of bonds */
+    for( i = 1; i < n; ++i )
+        bond_top[i] += bond_top[i-1];
+    /* with n <= 0 there is no last element to read; the total stays 0 */
+    if( n > 0 )
+        num_bonds = bond_top[n-1];
+    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) )
+    {
+        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+    Set_Start_Index( 0, 0, bonds ); 
+    Set_End_Index( 0, 0, bonds ); 
+    for( i = 1; i < n; ++i ) {
+        Set_Start_Index( i, bond_top[i-1], bonds ); 
+        Set_End_Index( i, bond_top[i-1], bonds ); 
+    }
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
+    /* the byte count is a size_t expression; cast it so that the argument
+     * really has the type %ld expects */
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
+            (long) (num_bonds * sizeof(bond_data) / (1024*1024)) );
+    return 1;
+/* Rebuilds the bonds list with more room per atom.
+ *
+ * For every atom i the new capacity is
+ *     MAX( Num_Entries(i, bonds) * 2, MIN_BONDS )
+ * and *est_3body receives the sum over all atoms of SQR(Num_Entries(i, bonds)).
+ * The old list is deleted and a new one is built by Allocate_Bond_List, which
+ * prefix-sums the capacities in place; *num_bonds is then read from the last
+ * element, i.e. the total.  Terminates the program with INIT_ERR if the
+ * scratch array cannot be allocated.  Returns 1. */
+int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
+    int i;
+    int *bond_top;
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "reallocating bonds\n" );
+    bond_top = (int *)calloc( n, sizeof(int) );
+    /* a NULL result for n > 0 is an allocation failure; writing through it
+     * below would be a NULL dereference */
+    if( bond_top == NULL && n > 0 )
+    {
+        fprintf( stderr, "not enough space for bond_top. terminating!\n" );
+        exit( INIT_ERR );
+    }
+    *est_3body = 0;
+    for( i = 0; i < n; ++i ){
+        *est_3body += SQR( Num_Entries( i, bonds ) );
+        bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
+    }
+    Delete_List( bonds );
+    Allocate_Bond_List( n, bond_top, bonds );
+    *num_bonds = bond_top[n-1];
+    free( bond_top );
+    return 1;
+/* Re-allocates every structure that has been flagged in workspace->realloc
+ * and clears each handled flag by setting it to -1.
+ *
+ *   num_far  > 0 and nbr_flag != 0 : far-neighbor list, sized num_far * SAFE_ZONE
+ *   Htop     > 0                   : matrix H, sized Htop * SAFE_ZONE; the
+ *                                    arrays of L and U are released as well
+ *   hbonds   > 0                   : hydrogen-bond list
+ *   bonds    > 0                   : bond list; num_3body is raised to the
+ *                                    estimate returned by Reallocate_Bonds_List
+ *   num_3body > 0                  : three-body list, sized num_3body * SAFE_ZONE
+ *   gcell_atoms > -1               : per-grid-cell atom index array
+ *
+ * lists points to the array of lists indexed by FAR_NBRS, HBONDS, BONDS and
+ * THREE_BODIES.  Allocation failures inside the called helpers terminate the
+ * program. */
+void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
+        int nbr_flag )
+    int num_bonds, est_3body;
+    reallocate_data *realloc;
+    grid *g;
+    realloc = &(workspace->realloc);
+    g = &(system->g);
+    if( realloc->num_far > 0 && nbr_flag ) {
+        fprintf (stderr, " Reallocating neighbors \n");
+        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
+                system->N, realloc->num_far * SAFE_ZONE );
+        realloc->num_far = -1;
+    }
+    if( realloc->Htop > 0 ){
+        fprintf (stderr, " Reallocating Matrix \n");
+        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
+        realloc->Htop = -1;
+        /* NOTE(review): L and U are released here but not re-allocated in
+         * this function -- presumably they are rebuilt elsewhere before the
+         * next use; verify against the caller */
+        Deallocate_Matrix( &workspace->L );
+        Deallocate_Matrix( &workspace->U );
+    }
+    if( realloc->hbonds > 0 ){
+        fprintf (stderr, " Reallocating hbonds \n");
+        Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
+                (*lists)+HBONDS );
+        realloc->hbonds = -1;
+    }
+    /* -1 marks "bond list not rebuilt in this call" for the 3-body step */
+    num_bonds = est_3body = -1;
+    if( realloc->bonds > 0 ){
+        fprintf (stderr, " Reallocating bonds \n");
+        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
+        realloc->bonds = -1;
+        realloc->num_3body = MAX( realloc->num_3body, est_3body );
+    }
+    if( realloc->num_3body > 0 ) {
+        fprintf (stderr, " Reallocating 3Body \n");
+        Delete_List( (*lists)+THREE_BODIES );
+        /* bond list untouched above: take its current interaction count */
+        if( num_bonds == -1 )
+            num_bonds = ((*lists)+BONDS)->num_intrs;
+        realloc->num_3body *= SAFE_ZONE;
+        if( !Make_List( num_bonds, realloc->num_3body,
+                    TYP_THREE_BODY, (*lists)+THREE_BODIES ) )
+        {
+            fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+            exit( INIT_ERR );
+        }
+        realloc->num_3body = -1;
+#if defined(DEBUG_FOCUS)
+        /* NOTE(review): num_3body has already been reset to -1 above, so the
+         * two num_3body figures printed below do not show the allocated size */
+        fprintf( stderr, "reallocating 3 bodies\n" );
+        fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
+        fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
+        /* the byte count is a size_t expression; cast it so that the
+         * argument really has the type %ld expects */
+        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
+                (long) (realloc->num_3body*sizeof(three_body_interaction_data)/
+                (1024*1024)) );
+    }
+    if( realloc->gcell_atoms > -1 ){
+#if defined(DEBUG_FOCUS)
+        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
+        free (g->atoms);
+        /* one block of gcell_atoms ints for each of the
+         * ncell[0]*ncell[1]*ncell[2] grid cells, zero-filled
+         * NOTE(review): the calloc result is not checked for NULL */
+        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
+                sizeof (int) * workspace->realloc.gcell_atoms);
+        realloc->gcell_atoms = -1;
+    }
diff --git a/PuReMD-GPU/src/allocate.h b/PuReMD-GPU/src/allocate.h
index 7ab146bbb763073377b2ffb16905b6694a85765d..b03ed80b34f153b9929ccaa80bc5c27fbf6ce540 100644
--- a/PuReMD-GPU/src/allocate.h
+++ b/PuReMD-GPU/src/allocate.h
@@ -23,6 +23,11 @@
 #include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
 void Reallocate( reax_system*, static_storage*, list**, int );
 int Allocate_Matrix( sparse_matrix*, int, int );
@@ -32,13 +37,9 @@ int Allocate_HBond_List( int, int, int*, int*, list* );
 int Allocate_Bond_List( int, int*, list* );
-//Cuda Functions
-int Cuda_Allocate_Matrix( sparse_matrix*, int, int );
-int Cuda_Allocate_HBond_List( int, int, int*, int*, list* );
-int Cuda_Allocate_Bond_List( int, int*, list* );
-void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int );
+#ifdef __cplusplus
-GLOBAL void Init_HBond_Indexes ( int *, int *, list , int  );
-GLOBAL void Init_Bond_Indexes ( int *, list , int  );
diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.c
similarity index 57%
rename from PuReMD-GPU/src/bond_orders.cu
rename to PuReMD-GPU/src/bond_orders.c
index 57f5baacb761a34cd8012f28b6ecb3e7c63d6081..49eaed6532449ef34ee3dac26a71462b2edbdd36 100644
--- a/PuReMD-GPU/src/bond_orders.cu
+++ b/PuReMD-GPU/src/bond_orders.c
@@ -19,23 +19,21 @@
 #include "bond_orders.h"
+#include "index_utils.h"
 #include "list.h"
 #include "lookup.h"
 #include "print_utils.h"
 #include "vector.h"
-#include "index_utils.h"
-#include "cuda_utils.h"
-#include "cuda_helpers.h"
 inline real Cf45( real p1, real p2 )
     return  -EXP(-p2 / 2) / 
         ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) );
 void Get_dBO( reax_system *system, list **lists, 
         int i, int pj, real C, rvec *v )
@@ -66,7 +64,8 @@ void Get_dBOpinpi2( reax_system *system, list **lists,
     start_pj = Start_Index(pj, dBOs);
     end_pj = End_Index(pj, dBOs);
-    for( k = start_pj; k < end_pj; ++k ) {
+    for( k = start_pj; k < end_pj; ++k )
+    {
         dbo_k = &(dBOs->select.dbo_list[k]);
         rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi );
         rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 );
@@ -179,7 +178,6 @@ void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C )
 HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, 
         list p_bonds, list p_dBOs, int *top )
@@ -367,7 +365,6 @@ HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace,
 void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, 
         simulation_data *data, static_storage *workspace, 
         list **lists )
@@ -521,134 +518,7 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
        temp[0], temp[1], temp[2] ); */
-//Cuda Functions
-HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, 
-        simulation_data *data, static_storage *workspace, 
-        list *bonds )
-    bond_data *nbr_j, *nbr_k;
-    bond_order_data *bo_ij, *bo_ji; 
-    dbond_coefficients coef;
-    rvec temp, ext_press;
-    ivec rel_box;
-    int pk, k, j;
-    /* Initializations */
-    nbr_j = &(bonds->select.bond_list[pj]);
-    j = nbr_j->nbr;
-    bo_ij = &(nbr_j->bo_data);
-    bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-    /************************************
-     * forces related to atom i          *
-     * first neighbors of atom i         *
-     ************************************/
-    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-        nbr_k = &(bonds->select.bond_list[pk]);
-        k = nbr_k->nbr;
-        rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
-        rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
-        rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/
-        rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
-        /* force */
-        rvec_Add( atoms[k].f, temp );
-        /* pressure */
-        rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
-        rvec_Add( data->ext_press, ext_press );
-    }
-    /* then atom i itself  */
-    rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st, dBO*/
-    rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd, dBO*/
-    rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st, dBO*/
-    rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/
-    rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );         /*1st,dBOpi*/
-    rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
-    rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
-    rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
-    rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
-    rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
-    /* force */
-    rvec_Add( atoms[i].f, temp );
-    /* ext pressure due to i dropped, counting force on j only will be enough */
-    /****************************************************************************
-     * forces and pressure related to atom j                                    *
-     * first neighbors of atom j                                                *
-     ***************************************************************************/
-    for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
-        nbr_k = &(bonds->select.bond_list[pk]);
-        k = nbr_k->nbr;
-        rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
-        rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ 
-        rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/
-        rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
-        /* force */
-        rvec_Add( atoms[k].f, temp );
-        /* pressure */
-        if( k != i ) {
-            ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
-            rvec_iMultiply( ext_press, rel_box, temp );
-            rvec_Add( data->ext_press, ext_press );
-        }
-    }
-    /* then atom j itself */
-    rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                     /*1st, dBO*/
-    rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );   /*2nd, dBO*/
-    rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );              /*1st, dBO*/
-    rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/
-    rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
-    rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
-    rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/
-    rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
-    rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
-    rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
-    /* force */
-    rvec_Add( atoms[j].f, temp );
-    /* pressure */
-    rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
-    rvec_Add( data->ext_press, ext_press );
-//Cuda Functions
 void Add_dBond_to_Forces( int i, int pj, reax_system *system, 
         simulation_data *data, static_storage *workspace, 
         list **lists )
@@ -761,154 +631,6 @@ void Add_dBond_to_Forces( int i, int pj, reax_system *system,
     /*3rd, dBOpi2*/
-HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, 
-        static_storage *workspace, list *bonds )
-    bond_data *nbr_j, *nbr_k;
-    bond_order_data *bo_ij, *bo_ji; 
-    dbond_coefficients coef;
-    int pk, k, j;
-    rvec t_f;
-    /* Initializations */ 
-    nbr_j = &(bonds->select.bond_list[pj]);
-    j = nbr_j->nbr;
-    if (i < j)
-    {
-        bo_ij = &(nbr_j->bo_data);
-        bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-    } else {
-        bo_ji = &(nbr_j->bo_data);
-        bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-    }
-    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-    if ( i < j) {
-        for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-            nbr_k = &(bonds->select.bond_list[pk]);
-            k = nbr_k->nbr;
-            rvec_MakeZero (t_f);
-            rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); 
-            /*2nd, dBO*/
-            rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
-            /*dDelta*/
-            rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
-            /*3rd, dBOpi*/
-            rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
-            /*3rd, dBOpi2*/
-            //Store in the temp place
-            rvec_Add (nbr_k->t_f, t_f);
-        }
-        rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp );
-        /*1st, dBO*/
-        rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
-        /*2nd, dBO*/
-        rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
-        /*1st, dBO*/
-        rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
-        /*2nd, dBO*/
-        rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
-        /*1st, dBOpi*/
-        rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
-        /*2nd, dBOpi*/
-        rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
-        /*3rd, dBOpi*/
-        rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-        /*1st, dBO_pi2*/
-        rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
-        /*2nd, dBO_pi2*/
-        rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
-        /*3rd, dBO_pi2*/
-    }
-    else 
-    {
-        for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-            nbr_k = &(bonds->select.bond_list[pk]);
-            k = nbr_k->nbr;
-            rvec_MakeZero (t_f);
-            rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
-            /*3rd, dBO*/
-            rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
-            /*dDelta*/ 
-            rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
-            /*4th, dBOpi*/
-            rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
-            /*4th, dBOpi2*/
-            //Store in the temp place
-            rvec_Add (nbr_k->t_f, t_f);
-        }
-        rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp );
-        /*1st, dBO*/
-        rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] );
-        /*2nd, dBO*/
-        rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp );
-        /*1st, dBO*/
-        rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]);
-        /*2nd, dBO*/
-        rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
-        /*1st, dBOpi*/
-        rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp );
-        /*2nd, dBOpi*/
-        rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]);
-        /*3rd, dBOpi*/
-        rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-        /*1st, dBOpi2*/
-        rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp );
-        /*2nd, dBOpi2*/
-        rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]);
-        /*3rd, dBOpi2*/
-    }
-HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds)
-    int pk;
-    bond_data *nbr_k, *nbr_k_sym;
-    /*
-       for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-       nbr_k = &(bonds->select.bond_list[pk]);
-       rvec_Add (atoms[i].f, nbr_k->t_f);
-       }
-     */
-    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-        nbr_k = &(bonds->select.bond_list[pk]);
-        nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] );
-        rvec_Add (atoms[i].f, nbr_k_sym->t_f);
-    }
 /* Locate j on i's list.
    This function assumes that j is there for sure!
@@ -1031,7 +753,7 @@ void Calculate_Bond_Orders( reax_system *system, control_params *control,
             //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
             if( i < j ) {
-                twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] );          
+                twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ] );          
                 Set_Start_Index( pj, top_dbo, dBOs );
                 /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
@@ -1348,557 +1070,3 @@ bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
     Print_Bond_Orders( system, control, data, workspace, lists, out_control );
-//Cuda Functions
-GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
-        static_storage workspace, int num_atom_types, int N )
-    int i, type_i;
-    real p_boc1, p_boc2;
-    single_body_parameters *sbp_i;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    p_boc1 = g_params.l[0];
-    p_boc2 = g_params.l[1];
-    /* Calculate Deltaprime, Deltaprime_boc values */
-    type_i = atoms[i].type;
-    sbp_i = &(sbp[type_i]);
-    workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
-    workspace.Deltap_boc[i] = 
-        workspace.total_bond_order[i] - sbp_i->valency_val;
-    workspace.total_bond_order[i] = 0;
-/* A very important and crucial assumption here is that each segment
-   belonging to a different atom in nbrhoods->nbr_list is sorted in its own.
-   This can either be done in the general coordinator function or here */
-GLOBAL void Cuda_Calculate_Bond_Orders (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
-        two_body_parameters *tbp, static_storage workspace, list bonds,
-        list dDeltas, list dBOs, int num_atom_types, int N )
-    int i, j, pj, type_i, type_j;
-    int start_i, end_i;
-    int num_bonds, sym_index;
-    real p_boc1, p_boc2;
-    real val_i, Deltap_i, Deltap_boc_i;
-    real val_j, Deltap_j, Deltap_boc_j;
-    real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
-    real exp_p1i,    exp_p2i, exp_p1j, exp_p2j;
-    real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-    real Cf45_ij, Cf45_ji, p_lp1;
-    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-    real explp1;
-    two_body_parameters *twbp;
-    bond_order_data *bo_ij, *bo_ji;
-    single_body_parameters *sbp_i, *sbp_j;
-#if defined(TEST_FORCES)
-    int  k, pk, start_j, end_j;
-    int  top_dbo=0, top_dDelta=0;
-    dbond_data *pdbo;
-    dDelta_data *ptop_dDelta;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    num_bonds = 0;
-    p_boc1 = g_params.l[0];
-    p_boc2 = g_params.l[1];
-    /* Calculate Deltaprime, Deltaprime_boc values */
-    //for( i = 0; i < system->N; ++i ) {
-    /*
-       if (i < N) {
-       type_i = atoms[i].type;
-       sbp_i = &(sbp[type_i]);
-       workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
-       workspace.Deltap_boc[i] = 
-       workspace.total_bond_order[i] - sbp_i->valency_val;
-       workspace.total_bond_order[i] = 0;
-       }
-       __syncthreads ();
-     */
-    // fprintf( stderr, "done with uncorrected bond orders\n" );
-    /* Corrected Bond Order calculations */
-    //for( i = 0; i < system->N; ++i ) {
-    type_i = atoms[i].type;
-    sbp_i = &(sbp[type_i]);
-    val_i = sbp_i->valency;
-    Deltap_i = workspace.Deltap[i];
-    Deltap_boc_i = workspace.Deltap_boc[i];
-    start_i = Start_Index(i, &bonds);
-    end_i = End_Index(i, &bonds);
-    //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
-    //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
-    for( pj = start_i; pj < end_i; ++pj ) {
-        j = bonds.select.bond_list[pj].nbr;
-        type_j = atoms[j].type;
-        bo_ij = &( bonds.select.bond_list[pj].bo_data );
-        //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
-        if( i < j ) {
-            twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );          
-            Set_Start_Index( pj, top_dbo, &dBOs );
-            /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
-               workspace->reverse_map[i], workspace->reverse_map[j], 
-               twbp->ovc, twbp->v13cor, bo_ij->BO ); */
-            if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
-                /* There is no correction to bond orders nor to derivatives of 
-                   bond order prime! So we leave bond orders unchanged and 
-                   set derivative of bond order coefficients s.t. 
-                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
-                bo_ij->C1dbo = 1.000000;
-                bo_ij->C2dbo = 0.000000;
-                bo_ij->C3dbo = 0.000000; 
-                bo_ij->C1dbopi = bo_ij->BO_pi;
-                bo_ij->C2dbopi = 0.000000;
-                bo_ij->C3dbopi = 0.000000;
-                bo_ij->C4dbopi = 0.000000;
-                bo_ij->C1dbopi2 = bo_ij->BO_pi2; 
-                bo_ij->C2dbopi2 = 0.000000;
-                bo_ij->C3dbopi2 = 0.000000;
-                bo_ij->C4dbopi2 = 0.000000;
-                pdbo = &(dBOs.select.dbo_list[ top_dbo ]);
-                // compute dBO_ij/dr_i
-                pdbo->wrt = i;
-                rvec_Copy( pdbo->dBO, bo_ij->dBOp );
-                rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-                rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
-                // compute dBO_ij/dr_j
-                pdbo++;
-                pdbo->wrt = j;
-                rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
-                rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
-                rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
-                top_dbo += 2;
-            }
-            else {
-                val_j = sbp[type_j].valency;
-                Deltap_j = workspace.Deltap[j];
-                Deltap_boc_j = workspace.Deltap_boc[j];
-                /* on page 1 */
-                if( twbp->ovc >= 0.001 ) {
-                    /* Correction for overcoordination */        
-                    exp_p1i = EXP( -p_boc1 * Deltap_i );
-                    exp_p2i = EXP( -p_boc2 * Deltap_i );
-                    exp_p1j = EXP( -p_boc1 * Deltap_j );
-                    exp_p2j = EXP( -p_boc2 * Deltap_j );
-                    f2 = exp_p1i + exp_p1j;            
-                    f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
-                    f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + 
-                            ( val_j + f2 )/( val_j + f2 + f3 ) );
-                    /*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
-                      i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
-                      fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n", 
-                      f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
-                    /* Now come the derivates */        
-                    /* Bond Order pages 5-7, derivative of f1 */
-                    temp = f2 + f3;
-                    u1_ij = val_i + temp;
-                    u1_ji = val_j + temp;
-                    Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
-                    Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + 
-                            ( u1_ji - f3 ) / SQR( u1_ji ));
-                    //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
-                    //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
-                    Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - 
-                            ((val_i+f2) / SQR(u1_ij)) * 
-                            ( -p_boc1 * exp_p1i + 
-                              exp_p2i / ( exp_p2i + exp_p2j ) ) + 
-                            -p_boc1 * exp_p1i / u1_ji - 
-                            ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +  
-                            exp_p2i / ( exp_p2i + exp_p2j ) ));
-                    Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + 
-                        Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); 
-                    //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
-                }
-                else {
-                    /* No overcoordination correction! */
-                    f1 = 1.0;
-                    Cf1_ij = Cf1_ji = 0.0;          
-                }
-                if( twbp->v13cor >= 0.001 ) {
-                    /* Correction for 1-3 bond orders */
-                    exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
-                                Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
-                    exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
-                                Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
-                    f4 = 1. / (1. + exp_f4);
-                    f5 = 1. / (1. + exp_f5);
-                    f4f5 = f4 * f5;
-                    /* Bond Order pages 8-9, derivative of f4 and f5 */
-                    /*temp = twbp->p_boc5 - 
-                      twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-                      u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-                      u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-                      Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-                      Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
-                    Cf45_ij = -f4 * exp_f4;
-                    Cf45_ji = -f5 * exp_f5;
-                }
-                else {
-                    f4 = f5 = f4f5 = 1.0;
-                    Cf45_ij = Cf45_ji = 0.0;
-                }
-                /* Bond Order page 10, derivative of total bond order */
-                A0_ij = f1 * f4f5;
-                A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * 
-                    (Cf45_ij + Cf45_ji);
-                A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
-                A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
-                A3_ij = A2_ij + Cf1_ij / f1;
-                A3_ji = A2_ji + Cf1_ji / f1;
-                /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f 
-A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
-bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
-                /* find corrected bond order values and their deriv coefs */
-                bo_ij->BO    = bo_ij->BO    * A0_ij;
-                bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
-                bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
-                bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-                bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
-                bo_ij->C2dbo = bo_ij->BO * A2_ij;
-                bo_ij->C3dbo = bo_ij->BO * A2_ji; 
-                bo_ij->C1dbopi = f1*f1*f4*f5;
-                bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
-                bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
-                bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
-                bo_ij->C1dbopi2 = f1*f1*f4*f5;
-                bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
-                bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
-                /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", 
-                  i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
-                /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
-                //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
-                workspace->orig_id[i], workspace->orig_id[j]
-                A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
-                bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
-                bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, 
-                bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
-                bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
-                ); */
-                Calculate_dBO( i, pj, workspace, lists, &top_dbo );
-            }
-            /* neglect bonds that are < 1e-10 */
-            if( bo_ij->BO < 1e-10 )
-                bo_ij->BO = 0.0;
-            if( bo_ij->BO_s < 1e-10 )
-                bo_ij->BO_s = 0.0;
-            if( bo_ij->BO_pi < 1e-10 )
-                bo_ij->BO_pi = 0.0;
-            if( bo_ij->BO_pi2 < 1e-10 )
-                bo_ij->BO_pi2 = 0.0;
-            workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
-            /* fprintf( stderr, "%d %d\t%g %g %g %g\n
-Cdbo:\t%g %g %g\n
-Cdbopi:\t%g %g %g %g\n
-Cdbopi2:%g %g %g %g\n\n", 
-i+1, j+1, bonds->select.bond_list[ pj ].d, 
-bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, 
-bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
-bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi,
-bo_ij->C1dbopi2, bo_ij->C2dbopi2, 
-bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
-            /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
-               i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
-            Set_End_Index( pj, top_dbo, &dBOs );
-            //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
-        }
-        /*
-           else {
-        // We only need to update bond orders from bo_ji
-        //   everything else is set in uncorrected_bo calculations 
-        sym_index = bonds.select.bond_list[pj].sym_index;
-        bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
-        bo_ij->BO = bo_ji->BO;
-        bo_ij->BO_s = bo_ji->BO_s;
-        bo_ij->BO_pi = bo_ji->BO_pi;
-        bo_ij->BO_pi2 = bo_ji->BO_pi2;
-        workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
-        //Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta );
-         */
-#ifdef TEST_FORCES 
-// fprintf( stderr, "dDelta computations\nj:" );
-Set_Start_Index( i, top_dDelta, &dDeltas );
-ptop_dDelta = &( dDeltas.select.dDelta_list[top_dDelta] );
-for( pj = start_i; pj < end_i; ++pj ) {
-    j = bonds.select.bond_list[pj].nbr;
-    // fprintf( stderr, "%d  ", j );
-    if( !rvec_isZero( workspace.dDelta[j] ) ) {
-        ptop_dDelta->wrt = j;
-        rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] );
-        rvec_MakeZero( workspace.dDelta[j] );
-        ++top_dDelta, ++ptop_dDelta;
-    }
-    start_j = Start_Index(j, &bonds);
-    end_j = End_Index(j, &bonds);     
-    for( pk = start_j; pk < end_j; ++pk ) {
-        k = bonds.select.bond_list[pk].nbr;    
-        if( !rvec_isZero( workspace.dDelta[k] ) ) {
-            ptop_dDelta->wrt = k;
-            rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] );
-            rvec_MakeZero( workspace.dDelta[k] );
-            ++top_dDelta, ++ptop_dDelta;
-        }
-    }
-Set_End_Index( i, top_dDelta, &dDeltas );
-/*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj )
-  fprintf( stdout, "dDel: %d %d [%g %g %g]\n",
-  i+1, dDeltas->select.dDelta_list[pj].wrt+1,
-  dDeltas->select.dDelta_list[pj].dVal[0], 
-  dDeltas->select.dDelta_list[pj].dVal[1], 
-  dDeltas->select.dDelta_list[pj].dVal[2] );*/
-/*fprintf(stderr,"\tCalculated actual bond orders ...\n" );
-  fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", 
-  "atom", "Delta", "Delta_e", "Delta_boc", "nlp", 
-  "Delta_lp", "Clp", "dDelta_lp" );*/
-   p_lp1 = g_params.l[15];
-//get the kernel ID for the following computation
-j = i;
-// Calculate some helper variables that are  used at many places 
-//  throughout force calculations 
-//for( j = 0; j < system->N; ++j ) {
-type_j = atoms[j].type;
-sbp_j = &(sbp[ type_j ]);
-workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency;
-workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e;
-workspace.Delta_boc[j] = workspace.total_bond_order[j] - 
-workspace.vlpex[j] =  workspace.Delta_e[j] - 
-2.0 * (int)(workspace.Delta_e[j]/2.0);
-explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j]));
-workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0);
-workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j];
-workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]);
-// Adri uses different dDelta_lp values than the ones in notes... //
-workspace.dDelta_lp[j] = workspace.Clp[j];
-//workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-//((fabs(workspace->Delta_e[j]/2.0 - 
-//       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-if( sbp_j->mass > 21.0 ) {
-workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
-workspace.dDelta_lp_temp[j] = 0.;
-else {
-workspace.nlp_temp[j] = workspace.nlp[j];
-workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
-workspace.dDelta_lp_temp[j] = workspace.Clp[j];
-//fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
-//j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
-//workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
-//workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
- */
-//Print_Bonds( system, bonds, "sbonds.out" );
-#if defined(DEBUG)
-//fprintf( stderr, "Number of bonds: %d\n", num_bonds );
-//Print_Bond_Orders( system, control, data, workspace, lists, out_control );
-GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage workspace, list bonds, int N )
-    int i, j, pj;
-    int start_i, end_i;
-    int sym_index;
-    bond_order_data *bo_ij, *bo_ji;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    start_i = Start_Index(i, &bonds);
-    end_i = End_Index(i, &bonds);
-    for( pj = start_i; pj < end_i; ++pj ) {
-        j = bonds.select.bond_list[pj].nbr;
-        bo_ij = &( bonds.select.bond_list[pj].bo_data );
-        if( i >= j ) {
-            // We only need to update bond orders from bo_ji
-            //   everything else is set in uncorrected_bo calculations 
-            sym_index = bonds.select.bond_list[pj].sym_index;
-            bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
-            bo_ij->BO = bo_ji->BO;
-            bo_ij->BO_s = bo_ji->BO_s;
-            bo_ij->BO_pi = bo_ji->BO_pi;
-            bo_ij->BO_pi2 = bo_ji->BO_pi2;
-            workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
-        }
-    }
-GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
-        static_storage workspace, int N )
-    int j, type_j;
-    real explp1;
-    real p_lp1;
-    single_body_parameters *sbp_i, *sbp_j;
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= N) return;
-    p_lp1 = g_params.l[15];
-    /* Calculate some helper variables that are  used at many places 
-       throughout force calculations */
-    //for( j = 0; j < system->N; ++j ) {
-    type_j = atoms[j].type;
-    sbp_j = &(sbp[ type_j ]);
-    workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency;
-    workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e;
-    workspace.Delta_boc[j] = workspace.total_bond_order[j] - 
-        sbp_j->valency_boc;
-    workspace.vlpex[j] =  workspace.Delta_e[j] - 
-        2.0 * (int)(workspace.Delta_e[j]/2.0);
-    explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j]));
-    workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0);
-    workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j];
-    workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]);
-    /* Adri uses different dDelta_lp values than the ones in notes... */
-    workspace.dDelta_lp[j] = workspace.Clp[j];
-    //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-    //((fabs(workspace->Delta_e[j]/2.0 - 
-    //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-    if( sbp_j->mass > 21.0 ) {
-        workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-        workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
-        workspace.dDelta_lp_temp[j] = 0.;
-    }
-    else {
-        workspace.nlp_temp[j] = workspace.nlp[j];
-        workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
-        workspace.dDelta_lp_temp[j] = workspace.Clp[j];
-    }
-    //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
-    //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
-    //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
-    //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
-    //}
-//Import from the forces file. 
-GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, 
-        static_storage workspace, list p_bonds, int ensemble, int N)
-    int i, pj;
-    list *bonds = &p_bonds;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < N) 
-    {
-        for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj)
-        {
-            //int j = bonds->select.bond_list[pj].nbr;
-            if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
-                Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds );
-            else 
-                Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds );
-        }
-    }
-GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, 
-        static_storage workspace, list p_bonds, int ensemble, int N)
-    int i, pj;
-    list *bonds = &p_bonds;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < N) 
-    {
-        if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
-            Cuda_dbond_to_Forces_postprocess (i, atoms, bonds );
-    }
diff --git a/PuReMD-GPU/src/bond_orders.h b/PuReMD-GPU/src/bond_orders.h
index 19fea911f129715cec305f0a4f1f0ea433d412d4..476bacfb0704c97d4b3b44b15e76883be4cad9b9 100644
--- a/PuReMD-GPU/src/bond_orders.h
+++ b/PuReMD-GPU/src/bond_orders.h
@@ -21,8 +21,10 @@
 #ifndef __BOND_ORDERS_H_
 #define __BOND_ORDERS_H_
 #include "mytypes.h"
 typedef struct
     real C1dbo, C2dbo, C3dbo;
@@ -31,6 +33,7 @@ typedef struct
     real C1dDelta, C2dDelta, C3dDelta;
 } dbond_coefficients;
 void Get_dBO( reax_system*, list**, int, int, real, rvec* );
 void Get_dBOpinpi2( reax_system*, list**, int, int, real, real, rvec*, rvec* );
@@ -52,16 +55,4 @@ void Add_dBond_to_Forces_NPT( int, int, reax_system*, simulation_data*,
 void Calculate_Bond_Orders( reax_system*, control_params*, simulation_data*,
                             static_storage*, list**, output_controls* );
-//CUDA Functions
-GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *, global_parameters , single_body_parameters *,
-        static_storage , int , int );
-GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *, global_parameters , single_body_parameters *,
-        two_body_parameters *, static_storage , list , list , list , int , int );
-GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage , list , int );
-GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *, global_parameters , single_body_parameters *,
-        static_storage , int );
-GLOBAL void Cuda_Compute_Total_Force (reax_atom *, simulation_data *, static_storage , list , int , int );
-GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *, simulation_data *, static_storage , list , int , int );
-//HOST_DEVICE void Cuda_Add_dBond_to_Forces( int, int, reax_atom *, static_storage*, list* );
-//HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int, int, reax_atom *, simulation_data*, static_storage*, list* );
diff --git a/PuReMD-GPU/src/box.cu b/PuReMD-GPU/src/box.c
similarity index 100%
rename from PuReMD-GPU/src/box.cu
rename to PuReMD-GPU/src/box.c
diff --git a/PuReMD-GPU/src/box.h b/PuReMD-GPU/src/box.h
index ed8cc9f4e1900a71f5a6b319f98e7d425400f928..418aa6208a81fb05ee56ff09afc6ff76751f75c9 100644
--- a/PuReMD-GPU/src/box.h
+++ b/PuReMD-GPU/src/box.h
@@ -21,11 +21,13 @@
 #ifndef __BOX_H__
 #define __BOX_H__
 #include "mytypes.h"
 /* Initializes box from CRYST1 line of PDB */
 void Init_Box_From_CRYST(real, real, real, real, real, real,
-                         simulation_box*/*, int*/);
+        simulation_box*/*, int*/);
 /* Initializes box from box rtensor */
 void Update_Box(rtensor, simulation_box* /*, int*/);
@@ -43,13 +45,11 @@ void Transform( rvec, simulation_box*, char, rvec );
 void Transform_to_UnitBox( rvec, simulation_box*, char, rvec );
 void Get_NonPeriodic_Far_Neighbors( rvec, rvec, simulation_box*,
-                                    control_params*, far_neighbor_data*, int* );
+        control_params*, far_neighbor_data*, int* );
 void Get_Periodic_Far_Neighbors_Big_Box( rvec, rvec, simulation_box*,
-        control_params*, far_neighbor_data*,
-        int* );
+        control_params*, far_neighbor_data*, int* );
 void Get_Periodic_Far_Neighbors_Small_Box( rvec, rvec, simulation_box*,
-        control_params*, far_neighbor_data*,
-        int* );
+        control_params*, far_neighbor_data*, int* );
 void Distance_on_T3_Gen( rvec, rvec, simulation_box*, rvec );
 void Inc_on_T3_Gen( rvec, rvec, simulation_box* );
@@ -61,7 +61,9 @@ void Inc_Nbr_Box_Press( simulation_box*, int, int, int, rvec );*/
 /* this function returns cartesian norm but triclinic distance vector */
 real Metric_Product( rvec, rvec, simulation_box* );
-HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
+void Print_Box_Information( simulation_box*, FILE* );
+static inline HOST_DEVICE real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box, rvec r)
     real norm = 0.0;
@@ -94,12 +96,8 @@ HOST_DEVICE inline real Sq_Distance_on_T3( rvec x1, rvec x2, simulation_box* box
-void Print_Box_Information( simulation_box*, FILE* );
-//CUDA Device Functions
-//HOST_DEVICE inline void Inc_on_T3( rvec, rvec, simulation_box* );
-HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
+static inline HOST_DEVICE void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
     int i;
     real tmp;
@@ -115,4 +113,5 @@ HOST_DEVICE inline void Inc_on_T3( rvec x, rvec dx, simulation_box *box )
diff --git a/PuReMD-GPU/src/center_mass.h b/PuReMD-GPU/src/center_mass.h
deleted file mode 100644
index 4048511e2e4b311f9de74cef94c2a661d51b4b39..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/center_mass.h
+++ /dev/null
@@ -1,48 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#ifndef __CENTER_MASS_H__
-#define __CENTER_MASS_H__
-#include "mytypes.h"
-GLOBAL void center_of_mass_blocks (single_body_parameters *, reax_atom *,
-                                   rvec *res_xcm,
-                                   rvec *res_vcm,
-                                   rvec *res_amcm,
-                                   size_t n);
-GLOBAL void center_of_mass (rvec *xcm,
-                            rvec *vcm,
-                            rvec *amcm,
-                            rvec *res_xcm,
-                            rvec *res_vcm,
-                            rvec *res_amcm,
-                            size_t n);
-GLOBAL void compute_center_mass (single_body_parameters *sbp,
-                                 reax_atom *atoms,
-                                 real *results,
-                                 real xcm0, real xcm1, real xcm2,
-                                 size_t n);
-GLOBAL void compute_center_mass (real *input, real *output, size_t n);
diff --git a/PuReMD-GPU/src/cuda_QEq.cu b/PuReMD-GPU/src/cuda_QEq.cu
new file mode 100644
index 0000000000000000000000000000000000000000..033945338aa76aa4c02909f90d2ca3eb1dacad58
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_QEq.cu
@@ -0,0 +1,724 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_QEq.h"
+#include "QEq.h"
+#include "allocate.h"
+#include "lin_alg.h"
+#include "list.h"
+#include "print_utils.h"
+#include "index_utils.h"
+#include "system_props.h"
+#include "cuda_copy.h"
+#include "cuda_init.h"
+#include "cuda_utils.h"
+#include "cuda_lin_alg.h"
+#include "cuda_reduction.h"
+#include "sort.h"
+#include "validation.h"
+/* Kernel: sorts the stored entries of each row of sparse matrix A in place.
+ * One thread handles one row; thread i works on the entry range
+ * [A.start[i], A.end[i]) and hands it to quick_sort().
+ * NOTE(review): the sort key is whatever quick_sort() compares (declared
+ * elsewhere, presumably the column index j) -- confirm. */
+GLOBAL void Cuda_Sort_Matrix_Rows( sparse_matrix A )
+    int i;
+    int si, ei;
+    // global thread id doubles as the row index
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    // threads past the last row have nothing to do
+    if ( i >= A.n ) return;
+    si = A.start[i];
+    ei = A.end [i];
+    // bounds are relative to the row's first entry and inclusive (0 .. ei-si-1)
+    quick_sort( A.entries + si, 0, ei-si-1 );
+/* Kernel: computes a per-row drop tolerance.
+ * For every row i < A->n: droptol[i] = dtol * sqrt( sum of val^2 over all
+ * entries stored in row i ). One thread per row.
+ * NOTE(review): j, offset, x and diagonal are declared but never used here. */
+GLOBAL void Cuda_Calculate_Droptol( sparse_matrix p_A, real *droptol, real dtol )
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, j, offset, x, diagonal;
+    real val;
+    sparse_matrix *A = &p_A;
+    // phase 1: accumulate the squared values of row i
+    if ( i < A->n ) {
+        droptol [i] = 0;
+        for (k = A->start[i]; k < A->end[i]; ++k ) {
+            val = A->entries[k].val;
+            droptol [i] += val*val;
+        }
+    }
+    // NOTE(review): each thread only touches its own droptol[i], so this
+    // block-level barrier does not appear to be required -- confirm.
+    __syncthreads ();
+    // phase 2: turn the sum of squares into a scaled norm
+    if ( i < A->n ) {
+        droptol [i] = SQRT (droptol[i]) * dtol;
+    }
+/* Kernel: adds val^2 of selected entries to droptol[j], where j is the
+ * entry's column index. dtol is not used in the visible body.
+ * NOTE(review): this kernel looks unfinished --
+ *   - the loop variable x is never used inside the loop; i is used both as
+ *     the row index (A->start[i], A->end[i]) and as the offset within that
+ *     row. Presumably x was meant to be the row -- confirm.
+ *   - i is not checked against A->n before indexing start/end.
+ *   - droptol[j] is updated by several threads without an atomic.
+ * No launch of this kernel is visible in this part of the file. */
+GLOBAL void Cuda_Calculate_Droptol_js( sparse_matrix p_A, real *droptol, real dtol )
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, j, offset, x, diagonal;
+    real val;
+    sparse_matrix *A = &p_A;
+    for (x = 0; x < A->n; x ++)
+    {
+        // the "-1" leaves the row's last entry out of the range
+        if (i < (A->end[i]-1 - A->start[i])) {
+            offset = A->start [i] + i;
+            j = A->entries[offset].j;
+            val = A->entries[offset].val;
+            droptol [j] += val * val;
+        }
+        __syncthreads ();
+    }
+/* Kernel: for every row i < A->n, adds the square of the row's last stored
+ * entry (index A->end[i]-1, treated as the diagonal) to droptol[i] and then
+ * replaces droptol[i] by sqrt(droptol[i]) * dtol. One thread per row.
+ * NOTE(review): droptol[i] is read before being written here, so it is
+ * expected to hold a partial sum on entry -- confirm against the caller.
+ * k, j, offset and x are declared but unused. */
+GLOBAL void Cuda_Calculate_Droptol_diagonal( sparse_matrix p_A, real *droptol, real dtol )
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, j, offset, x, diagonal;
+    real val;
+    sparse_matrix *A = &p_A;
+    if ( i < A->n ) {
+        //diagonal element
+        diagonal = A->end[i]-1;
+        val = A->entries [diagonal].val;
+        droptol [i] += val*val;
+    }
+    /*calculate local droptol for each row*/
+    if ( i < A->n )
+        droptol [i] = SQRT (droptol[i]) * dtol;
+/* Kernel: estimates the fill of the factorisation, one thread per row.
+ * fillin[i] receives the number of entries of row i, excluding the row's
+ * last stored entry, whose magnitude exceeds droptol[i].
+ * NOTE(review): j is assigned but never read. */
+GLOBAL void Cuda_Estimate_LU_Fill( sparse_matrix p_A, real *droptol, int *fillin )
+    int i, j, pj;
+    real val;
+    sparse_matrix *A = &p_A;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= A->n) return;
+    fillin [i] = 0;
+    // "end-1": the final entry of the row is not counted in this loop
+    for (pj = A->start[i]; pj < A->end[i]-1; ++pj)
+    {
+        j = A->entries [pj].j;
+        val = A->entries[pj].val;
+        if (fabs (val) > droptol [i]) ++fillin [i];
+    }
+/* Host routine: thresholded incomplete Cholesky factorisation of A.
+ * Builds L row by row and then writes the transpose of L into U.
+ *   A        input matrix; each row is read up to the first entry whose
+ *            column is >= the row index, and that entry must be the diagonal
+ *   droptol  per-row drop tolerance
+ *   L, U     output matrices; start/entries must already be allocated
+ * Terminates the process with exit(999) if a row's diagonal is not found
+ * where expected.
+ * NOTE(review): tmp holds one row and is fixed at 1000 entries with no bounds
+ * check on tmptop. Utop is malloc'ed (unchecked) and no free is visible in
+ * this excerpt -- confirm whether it leaks. */
+void Cuda_ICHOLT( sparse_matrix *A, real *droptol, 
+        sparse_matrix *L, sparse_matrix *U )
+    sparse_matrix_entry tmp[1000];
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val;
+    int *Utop;
+    Utop = (int*) malloc((A->n+1) * sizeof(int));
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    for( i = 0; i <= A->n; ++i )
+        L->start[i] = U->start[i] = 0;
+    for( i = 0; i < A->n; ++i )
+        Utop[i] = 0;
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i ){
+        L->start[i] = Ltop;
+        tmptop = 0;
+        for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){
+            j = A->entries[pj].j;
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
+            //CHANGE ORIGINAL
+            // stop at the first entry that is not strictly below the diagonal
+            if (j >= i) break;
+            //CHANGE ORIGINAL
+            if( fabs(val) > droptol[i] ){
+                // walk tmp (row i so far) and row j of L in step, subtracting
+                // the products of entries that share a column index
+                k1 = 0;
+                k2 = L->start[j];
+                while( k1 < tmptop && k2 < L->start[j+1] ){
+                    if( tmp[k1].j < L->entries[k2].j )
+                        ++k1;
+                    else if( tmp[k1].j > L->entries[k2].j )
+                        ++k2;
+                    else
+                        val -= (tmp[k1++].val * L->entries[k2++].val);
+                }
+                // L matrix is lower triangular, 
+                // so right before the start of next row comes jth diagonal
+                val /= L->entries[L->start[j+1]-1].val;
+                tmp[tmptop].j = j;
+                tmp[tmptop].val = val;
+                ++tmptop;
+            }
+            //fprintf( stderr, " -- done\n" );
+        }
+        // compute the ith diagonal in L
+        // sanity check
+        if( A->entries[pj].j != i ) {
+            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
+            exit(999);
+        }
+        val = A->entries[pj].val;
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            val -= (tmp[k1].val * tmp[k1].val);
+        tmp[tmptop].j = i;
+        // NOTE(review): val is not checked for being non-negative before SQRT
+        tmp[tmptop].val = SQRT(val);
+        // apply the dropping rule once again
+        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
+        //for( k1 = 0; k1<= tmptop; ++k1 )
+        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
+        //fprintf( stderr, "\n" );
+        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
+                L->entries[Ltop].j = tmp[k1].j;
+                L->entries[Ltop].val = tmp[k1].val;
+                // count one more entry for row tmp[k1].j of U (slot shifted by one)
+                U->start[tmp[k1].j+1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        // keep the diagonal in any case
+        // (k1 == tmptop here, i.e. the diagonal slot written above)
+        L->entries[Ltop].j = tmp[k1].j;
+        L->entries[Ltop].val = tmp[k1].val;
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+    // i == A->n here: closes the last row of L
+    L->start[i] = Ltop;
+    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+    // running sum of the per-row counts gives the row starts of U; the "+1"
+    // presumably reserves room for each diagonal. Utop is the write cursor.
+    for( i = 1; i <= U->n; ++i )
+        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
+    // scatter L into U with row and column swapped
+    for( i = 0; i < L->n; ++i )
+        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
+            j = L->entries[pj].j;
+            U->entries[Utop[j]].j = i;
+            U->entries[Utop[j]].val = L->entries[pj].val;
+            Utop[j]++;
+        }
+    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+//Parallel for each row
+//Each kernel will run for 6540 number of times.
+/* Kernel: experimental device-side variant of the incomplete Cholesky
+ * factorisation. The only launch visible in this file is commented out
+ * (inside Cuda_Init_MatVec), which uses the host version instead.
+ * NOTE(review), based on the lines visible in this excerpt:
+ *   - the row loop is hard-coded to 10 iterations;
+ *   - tmp_val and sh_tmp are both declared as extern __shared__ arrays;
+ *     sh_tmp is not used, tmp is carved out at tmp_val + blockDim.x;
+ *   - tmp_val is indexed with the global thread id kid;
+ *   - tempvalue is accumulated with += without a visible initialisation;
+ *   - the inner "for (i = 0; i < count; i++)" loops reuse the row index i;
+ *   - null_val is set to 0 and then written through on purpose to abort
+ *     the kernel when the diagonal is not where it is expected.
+ * Treat this kernel as unfinished until these points are confirmed. */
+GLOBAL void Cuda_ICHOLT( reax_system *system, sparse_matrix p_A, real *droptol, 
+sparse_matrix p_L, sparse_matrix p_U )
+int start, end, count;
+real tempvalue, val;
+int i,pj,tmptop, offset;
+int j, k1, k2;
+sparse_matrix *A, *L, *U;
+sparse_matrix_entry *tmp;
+A = &p_A;
+L = &p_L;
+U = &p_U;
+real *null_val;
+null_val = 0;
+extern __shared__ real tmp_val[];
+extern __shared__ sparse_matrix_entry sh_tmp[];
+int kid = blockIdx.x * blockDim.x + threadIdx.x;
+tmp = (sparse_matrix_entry *) (tmp_val + blockDim.x);
+offset = 0;
+for( i = 0; i < 10; ++i )
+//if (kid == 0) L->start[i] = i * system->max_sparse_matrix_entries;
+if (kid == 0) L->start[i] = offset;
+tmptop = 0;
+start = A->start[i];
+end = A->end[i]-1; //inclusive
+count = end - start; //inclusive
+tmp_val [kid] = 0;
+if (kid < count) //diagonal not included
+pj = start + kid;
+j = A->entries[pj].j;
+val = A->entries[pj].val;
+if( fabs(val) > droptol[i] )
+k1 = 0;
+k2 = L->start[j];
+while( k1 < tmptop && k2 < L->end[j] ){
+if( tmp[k1].j < L->entries[k2].j )
+else if( tmp[k1].j > L->entries[k2].j )
+tmp_val[kid] = (tmp[k1++].val * L->entries[k2++].val);
+//here read the shared memory of all the kernels 
+if (kid == 0)
+for (i = 0; i < count; i++)
+tempvalue += tmp_val [i];
+val -= tempvalue;
+// L matrix is lower triangular, 
+// so right before the start of next row comes jth diagonal
+val /= L->entries[L->end[j]-1].val;
+tmp[tmptop].j = j;
+tmp[tmptop].val = val;
+__syncthreads ();
+// compute the ith diagonal in L
+// sanity check
+if (kid == 0) 
+    if( A->entries[end].j != i ) {
+        //intentional core dump here for sanity sake
+        *null_val = 1;
+    }
+//diagonal element
+//val = A->entries[pj].val;
+//for( k1 = 0; k1 < tmptop; ++k1 )
+if (kid < count) 
+    tmp_val[kid] = (tmp[kid].val * tmp[kid].val);
+    __syncthreads ();
+if (kid == 0)
+    val = A->entries [end].val;
+    for (i = 0; i < count; i++)
+        tempvalue += tmp_val [i];
+    val -= tempvalue;
+    tmp[tmptop].j = i;
+    tmp[tmptop].val = SQRT(val);
+__syncthreads ();
+//Fill in the LU entries
+//for( k1 = 0; k1 < count; ++k1 )
+if (kid < count )
+    if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){
+        L->entries[offset + kid].j = tmp[kid].j;
+        L->entries[offset + kid].val = tmp[kid].val;
+        U->start[tmp[kid].j+1]++;
+    }
+__syncthreads ();
+if (kid == 0) {
+    // keep the diagonal in any case
+    offset += count;
+    L->entries[offset].j = tmp[count].j;
+    L->entries[offset].val = tmp[count].val;
+    ++offset;
+    L->end [i] = offset;
+__syncthreads ();
+} // end of main for loop
+/* Host routine: turns the per-row counts held in U->start into row starts
+ * and then copies every entry of L into U with row and column swapped.
+ * This repeats the last two loops of the host Cuda_ICHOLT.
+ * A and droptol are not used in the visible body.
+ * NOTE(review): Utop is used below but no declaration of it is visible in
+ * this excerpt -- confirm where it comes from before enabling this code. */
+void Cuda_Fill_U    ( sparse_matrix *A, real *droptol, 
+        sparse_matrix *L, sparse_matrix *U )
+    int i, pj, j;
+    // running sum of the counts; "+1" as in the host Cuda_ICHOLT
+    for( i = 1; i <= U->n; ++i )
+        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
+    // entry (i, j) of L becomes entry (j, i) of U
+    for( i = 0; i < L->n; ++i )
+        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
+            j = L->entries[pj].j;
+            U->entries[Utop[j]].j = i;
+            U->entries[Utop[j]].val = L->entries[pj].val;
+            Utop[j]++;
+        }
+/* Host routine: (re)builds the L/U preconditioner factors for the device.
+ * Runs only when control->refactor > 0 and either the step count since
+ * data->prev_steps is a multiple of control->refactor or the device L matrix
+ * has not been allocated yet. Steps:
+ *   1. sort the rows of the device H matrix and compute the drop tolerances;
+ *   2. on first use, estimate the fill with a two-stage integer reduction
+ *      and allocate the device L and U matrices;
+ *   3. copy H and droptol to the host, factorise there with Cuda_ICHOLT and
+ *      push L/U back with Sync_Host_Device_Mat.
+ * The workspace parameter is not used in the visible body; the routine
+ * works on dev_workspace and scratch, which are declared elsewhere.
+ * NOTE(review):
+ *   - the host temporaries (t_droptol, t_H, t_L, t_U) are not released: the
+ *     cleanup below is commented out, so each refactorisation appears to
+ *     leak them -- confirm;
+ *   - the malloc of t_droptol is not checked;
+ *   - allocation failures call exit( 0 ), i.e. report success to the shell;
+ *   - cudaThreadSynchronize() is deprecated in the CUDA runtime in favour of
+ *     cudaDeviceSynchronize();
+ *   - i, s_tmp and t_tmp are declared but unused;
+ *   - the #endif lines matching the #ifdef __DEBUG_CUDA__ directives are not
+ *     visible in this excerpt. */
+void Cuda_Init_MatVec( reax_system *system, control_params *control,
+        simulation_data *data, static_storage *workspace, list *far_nbrs )
+    int i, fillin;
+    real s_tmp, t_tmp;
+    int *spad = (int *)scratch;
+    real start = 0, end = 0;
+    if( control->refactor > 0 && 
+            ((data->step-data->prev_steps)%control->refactor==0 ||
+             dev_workspace->L.entries==NULL) )
+    {
+        Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
+            ( dev_workspace->H );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+#ifdef __DEBUG_CUDA__
+        fprintf (stderr, "Sorting done... \n");
+        Cuda_Calculate_Droptol<<<BLOCKS, BLOCK_SIZE >>>
+            ( dev_workspace->H, dev_workspace->droptol, control->droptol );
+        cudaThreadSynchronize( );
+        cudaCheckError( );
+#ifdef __DEBUG_CUDA__
+        fprintf (stderr, "Droptol done... \n");
+        // first call only: size and allocate the device L/U matrices
+        if( dev_workspace->L.entries == NULL )
+        {
+            cuda_memset( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
+            Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
+                ( dev_workspace->H, dev_workspace->droptol, spad );
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+            //Reduction for fill in 
+            // stage 1: per-block partial sums go to spad + N
+            Cuda_reduction_int<<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
+                (spad, spad + system->N,  system->N);
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+            // stage 2: a single block folds the partial sums into one value
+            Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
+                (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); 
+            cudaThreadSynchronize( );
+            cudaCheckError( );
+            copy_host_device( &fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
+            // Cuda_Estimate_LU_Fill skips one entry per row; add those back
+            fillin += dev_workspace->H.n;
+#ifdef __DEBUG_CUDA__
+            fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin );
+            dev_workspace->L.n = far_nbrs->n;
+            dev_workspace->L.m = fillin;
+            Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n );
+            dev_workspace->U.n = far_nbrs->n;
+            dev_workspace->U.m = fillin;
+            Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n );
+        }
+#ifdef __DEBUG_CUDA__
+        fprintf (stderr, "LU matrix done...\n");
+        //TODO -- This is the ILU Factorization of the H Matrix. 
+        //This is present in the CUDA 5.0 compilation which is not working currently. 
+        //Fix this when CUDA 5.0 is correctly setup. 
+        //TODO
+        //shared memory is per block
+        // here we have only one block - 
+        /*
+           fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries );
+           Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, 
+           system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE)   >>>
+           ( system, dev_workspace->H, 
+           dev_workspace->droptol, 
+           dev_workspace->L, 
+           dev_workspace->U );
+           cudaThreadSynchronize ();
+           fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ());
+         */
+        //1. copy the H matrix from device to host
+        //2. Allocate the L/U matrices on the host and device. 
+        //3. Compute the L/U on the host
+        //4. copy the results to the device
+        //5. Continue the computation.
+        sparse_matrix t_H, t_L, t_U;
+        real *t_droptol;
+        t_droptol = (real *) malloc( REAL_SIZE * system->N );
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
+        start = Get_Time( );
+        if( !Allocate_Matrix(&t_H, dev_workspace->H.n, dev_workspace->H.m) )
+        {
+            fprintf(stderr, "No space for H matrix \n");
+            exit( 0 );
+        }
+        if( !Allocate_Matrix(&t_L, far_nbrs->n, dev_workspace->L.m) )
+        {
+            fprintf( stderr, "No space for L matrix \n" );
+            exit( 0 );
+        }
+        if( !Allocate_Matrix(&t_U, far_nbrs->n, dev_workspace->U.m) )
+        {
+            fprintf( stderr, "No space for U matrix \n" );
+            exit( 0 );
+        }
+        // NOTE(review): n + 1 ints are copied into t_H.start / t_H.end; make
+        // sure Allocate_Matrix really reserves (n + 1) * sizeof(int) bytes.
+        copy_host_device( t_H.start, dev_workspace->H.start, INT_SIZE *
+                (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost,
+                RES_SPARSE_MATRIX_INDEX );
+        copy_host_device( t_H.end, dev_workspace->H.end, INT_SIZE *
+                (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost,
+                RES_SPARSE_MATRIX_INDEX );
+        copy_host_device( t_H.entries, dev_workspace->H.entries,
+                SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m,
+                cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
+        copy_host_device( t_droptol, dev_workspace->droptol, REAL_SIZE *
+                system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
+        //fprintf (stderr, " Done copying LUH .. \n");
+        // factorise on the host, then upload the factors
+        Cuda_ICHOLT( &t_H, t_droptol, &t_L, &t_U );
+        Sync_Host_Device_Mat( &t_L, &t_U, cudaMemcpyHostToDevice );
+        end += Get_Timing_Info( start );
+        /*
+           fprintf (stderr, "Done syncing .... \n");
+           free (t_droptol);
+           fprintf (stderr, "Freed droptol ... \n");
+           Deallocate_Matrix (&t_H);
+           fprintf (stderr, "Freed H ... \n");
+           Deallocate_Matrix (&t_L);
+           fprintf (stderr, "Freed l ... \n");
+           Deallocate_Matrix (&t_U);
+           fprintf (stderr, "Freed u ... \n");
+         */
+        //#ifdef __DEBUG_CUDA__
+        // the guard around this message is commented out, so it always prints
+        fprintf( stderr, "Done copying the L/U matrices to the device ---> %f \n", end );
+        //#endif
+        //#ifdef __BUILD_DEBUG__
+        //        validate_lu (workspace);
+        //#endif
+    }
+/* Kernel: produces the initial guesses for the s and t solves, one thread
+ * per index i < N, from the stored history of earlier solutions.
+ * History slot k of index i is addressed with index_wkspace_sys(k, i, N).
+ *   t: new value = t[2] + 3 * (t[0] - t[1])               ("quadratic")
+ *   s: new value = 4 * (s[0] + s[2]) - (6 * s[1] + s[3])  ("cubic")
+ * Afterwards both histories are shifted by one slot (3->4, 2->3, 1->2,
+ * 0->1) and the new values are stored in slot 0. */
+GLOBAL void Init_MatVec_Postprocess( static_storage p_workspace, int N )
+    static_storage *workspace = &p_workspace;
+    real s_tmp, t_tmp;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+    // no extrapolation
+    //s_tmp = workspace->s[0][i];
+    //t_tmp = workspace->t[0][i];
+    // linear
+    //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
+    //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
+    // quadratic
+    //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
+    t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]);
+    // cubic
+    s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - 
+        (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] );
+    //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
+    //  (6 * workspace->t[1][i] + workspace->t[3][i] );
+    // 4th order
+    //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
+    //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
+    //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
+    //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
+    // shift the s history from the oldest slot down so nothing is overwritten
+    // before it has been copied
+    workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)];
+    workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; 
+    workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)];
+    workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)];
+    workspace->s[index_wkspace_sys(0,i,N)] = s_tmp;
+    // same shift for the t history
+    workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)];
+    workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; 
+    workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)];
+    workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)];
+    workspace->t[index_wkspace_sys(0,i,N)] = t_tmp;
+/* Kernel: writes the charge of every atom, one thread per atom i < N:
+ *   atoms[i].q = s[slot 0, i] - u * t[slot 0, i]
+ * where slot 0 is addressed through index_wkspace_sys(0, i, N). */
+GLOBAL void Cuda_Update_Atoms_q( reax_atom *atoms, real *s, real u, real *t, int N )
+    int i = blockIdx.x*blockDim.x + threadIdx.x;
+    // threads beyond the last atom do nothing
+    if (i >= N)
+    {
+        return;
+    }
+    atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)];
+/* Host routine: derives the atomic charges from the two solution vectors.
+ * Sums slot 0 of dev_workspace->s and of dev_workspace->t over system->N
+ * elements (two-stage device reduction each), forms u = s_sum / t_sum on
+ * the host and launches Cuda_Update_Atoms_q with it.
+ * The workspace parameter is not used in the visible body; dev_workspace
+ * and scratch are declared elsewhere.
+ * NOTE(review): t_sum is not checked for zero before the division. */
+void Cuda_Calculate_Charges( reax_system *system, static_storage *workspace )
+    real *spad = (real *) scratch;
+    real u, s_sum, t_sum;
+    cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    //s_sum 
+    // stage 1: one partial sum per block, written to spad[0 .. BLOCKS_POW_2)
+    Cuda_reduction<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
+        (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    // stage 2: a single block folds the partial sums into spad[BLOCKS_POW_2]
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( &s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    //t_sum
+    // same two-stage reduction, reusing the cleared scratch area
+    cuda_memset( spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
+        (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    copy_host_device( &t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    //fraction here
+    u = s_sum / t_sum;
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
+    Cuda_Update_Atoms_q<<< BLOCKS, BLOCK_SIZE >>>
+        ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+/* Host entry point of the charge-equilibration step on the device.
+ *   1. Init_MatVec_Postprocess builds the initial guesses for s and t;
+ *   2. Cublas_GMRES is run twice: b_s -> s and b_t -> t, both with
+ *      tolerance control->q_err; the returned counts are added to
+ *      d_timing.matvecs;
+ *   3. Cuda_Calculate_Charges converts s and t into atomic charges.
+ * The call to Cuda_Init_MatVec is commented out here. data, far_nbrs and
+ * out_control are not used in the visible body.
+ * NOTE(review): the #endif lines matching the #ifdef __DEBUG_CUDA__
+ * directives are not visible in this excerpt; the second
+ * "t_start = Get_Time( )" has its guard commented out and so always runs. */
+void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, 
+        static_storage *workspace, list *far_nbrs, 
+        output_controls *out_control )
+    int matvecs = 0;
+    real t_start, t_elapsed;
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    /*
+    //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs );
+    Cuda_Sort_Matrix_Rows<<< BLOCKS, BLOCK_SIZE >>>
+    ( dev_workspace->H );
+    cudaThreadSynchronize();
+    cudaCheckError();
+    t_elapsed = Get_Timing_Info (t_start);
+    fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed);
+     */
+    // the workspace struct is passed by value to the kernel
+    Init_MatVec_Postprocess<<< BLOCKS, BLOCK_SIZE >>>
+        (*dev_workspace, system->N);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, "Done with post processing of init_matvec --> %d  with time ---> %f \n", cudaGetLastError (), t_elapsed );
+    //Here goes the GMRES part of the program ()
+    //#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    //#endif
+    //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
+    //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
+    matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
+    matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
+    d_timing.matvecs += matvecs;
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
+    Cuda_Calculate_Charges( system, workspace );
diff --git a/PuReMD-GPU/src/cuda_QEq.h b/PuReMD-GPU/src/cuda_QEq.h
new file mode 100644
index 0000000000000000000000000000000000000000..f62ab1157e3812837cd3a8a65e07856a78b22bf2
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_QEq.h
@@ -0,0 +1,39 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_QEq_H_
+#define __CUDA_QEq_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+/* Device charge-equilibration entry point, implemented in cuda_QEq.cu.
+ * Arguments, in order: system, control parameters, simulation data,
+ * workspace, far-neighbour list, output controls. */
+void Cuda_QEq( reax_system*, control_params*, simulation_data*, static_storage*,
+        list*, output_controls* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/cuda_allocate.cu
similarity index 64%
rename from PuReMD-GPU/src/allocate.cu
rename to PuReMD-GPU/src/cuda_allocate.cu
index 37b80693217700dc20bbffdb515cfe5e1f6eb986..d0e1f22b6f970038820f82c504dc3472d50d1d7a 100644
--- a/PuReMD-GPU/src/allocate.cu
+++ b/PuReMD-GPU/src/cuda_allocate.cu
@@ -18,32 +18,24 @@
-#include "allocate.h"
-#include "list.h"
+#include "cuda_allocate.h"
 #include "cuda_utils.h"
-#include "reduction.h"
+#include "cuda_list.h"
+#include "cuda_reduction.h"
-void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
-    Delete_List( far_nbrs );
-    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){
-        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-        exit( INIT_ERR );
-    }
+#include "list.h"
+GLOBAL void Init_HBond_Indexes ( int *, int *, list , int  );
+GLOBAL void Init_Bond_Indexes ( int *, list , int  );
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
-            num_intrs, far_nbrs->num_intrs );  
-    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
 void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
-    Delete_List( far_nbrs, TYP_DEVICE );
-    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE )){
+    Cuda_Delete_List( far_nbrs );
+    if(!Cuda_Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs ))
+    {
         fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
         exit( INIT_ERR );
@@ -57,23 +49,6 @@ void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
-int Allocate_Matrix( sparse_matrix *H, int n, int m )
-    H->n = n;
-    H->m = m;
-    if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
-    if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
-        return 0;
-    if( (H->entries = 
-                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
-        return 0;
-    return 1;
 int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m )
     H->n = n;
@@ -87,13 +62,6 @@ int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m )
-void Deallocate_Matrix( sparse_matrix *H )
-    free(H->start);
-    free(H->entries);
-    free(H->end);
 void Cuda_Deallocate_Matrix( sparse_matrix *H )
     cuda_free(H->start, RES_SPARSE_MATRIX_INDEX);
@@ -106,23 +74,6 @@ void Cuda_Deallocate_Matrix( sparse_matrix *H )
-int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
-    Deallocate_Matrix( H );
-    if( !Allocate_Matrix( H, n, m ) ) {
-        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
-        exit( 1 );
-    }
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
-            name, n, m );
-    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
-            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
-    return 1;
 int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
     Cuda_Deallocate_Matrix( H );
@@ -142,56 +93,6 @@ int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
-int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
-        list *hbonds )
-    int i, num_hbonds;
-    num_hbonds = 0;
-    /* find starting indexes for each H and the total number of hbonds */
-    for( i = 1; i < n; ++i )
-        hb_top[i] += hb_top[i-1];
-    num_hbonds = hb_top[n-1];
-    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) {
-        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-        exit( INIT_ERR );
-    }
-    for( i = 0; i < n; ++i )
-        if( h_index[i] == 0 ){
-            Set_Start_Index( 0, 0, hbonds ); 
-            Set_End_Index( 0, 0, hbonds ); 
-        }
-        else if( h_index[i] > 0 ){
-            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
-            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
-        }
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
-    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-            num_hbonds * sizeof(hbond_data) / (1024*1024) );
-    return 1;
-GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N )
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index >= N) return;
-    if( h_index[index] == 0 ){
-        Set_Start_Index( 0, 0, &hbonds ); 
-        Set_End_Index( 0, 0, &hbonds ); 
-    }
-    else if( h_index[index] > 0 ){
-        Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); 
-        Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); 
-    }
 int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list *hbonds )
     int i, num_hbonds;
@@ -204,7 +105,7 @@ int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list
         hb_top[i] += hb_top[i-1];
     num_hbonds = hb_top[n-1];
-    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds , TYP_DEVICE) ) {
+    if( !Cuda_Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) {
         fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
         exit( INIT_ERR );
@@ -225,27 +126,6 @@ int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list
     return 1;
-int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
-    int i;
-    int *hb_top;
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "reallocating hbonds\n" );
-    hb_top = (int *)calloc( n, sizeof(int) );
-    for( i = 0; i < n; ++i )
-        if( h_index[i] >= 0 )
-            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
-    Delete_List( hbonds );
-    Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
-    free( hb_top );
-    return 1;
 int Cuda_Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
@@ -270,7 +150,7 @@ int Cuda_Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
         //if( h_index[i] >= 0 )
         hb_top[i] = MAX((hb_end [i] - hb_start[i])*SAFE_HBONDS, MIN_HBONDS);
-    Delete_List( hbonds, TYP_DEVICE );
+    Cuda_Delete_List( hbonds );
     Cuda_Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
@@ -281,21 +161,6 @@ int Cuda_Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
     return 1;
-GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N )
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index >= N) return;
-    if( index == 0 ){
-        Set_Start_Index( 0, 0, &bonds ); 
-        Set_End_Index( 0, 0, &bonds ); 
-    }
-    else if( index > 0 ){
-        Set_Start_Index( index, b_top[index-1], &bonds ); 
-        Set_End_Index( index, b_top[index-1], &bonds ); 
-    }
 int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds )
@@ -308,7 +173,7 @@ int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds )
         b_top[i] += b_top[i-1];
     num_bonds = b_top[num_b-1];
-    if( !Make_List(num_b, num_bonds, TYP_BOND, bonds, TYP_DEVICE) ) {
+    if( !Cuda_Make_List(num_b, num_bonds, TYP_BOND, bonds ) ) {
         fprintf( stderr, "not enough space for bonds list. terminating!\n" );
         exit( INIT_ERR );
@@ -326,93 +191,6 @@ int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds )
-int Allocate_Bond_List( int n, int *bond_top, list *bonds )
-    int i, num_bonds;
-    num_bonds = 0;
-    /* find starting indexes for each atom and the total number of bonds */
-    for( i = 1; i < n; ++i )
-        bond_top[i] += bond_top[i-1];
-    num_bonds = bond_top[n-1];
-    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) {
-        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-        exit( INIT_ERR );
-    }
-    Set_Start_Index( 0, 0, bonds ); 
-    Set_End_Index( 0, 0, bonds ); 
-    for( i = 1; i < n; ++i ) {
-        Set_Start_Index( i, bond_top[i-1], bonds ); 
-        Set_End_Index( i, bond_top[i-1], bonds ); 
-    }
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
-    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-            num_bonds * sizeof(bond_data) / (1024*1024) );
-    return 1;
-int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
-    int i;
-    int *bond_top;
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "reallocating bonds\n" );
-    bond_top = (int *)calloc( n, sizeof(int) );
-    *est_3body = 0;
-    for( i = 0; i < n; ++i ){
-        *est_3body += SQR( Num_Entries( i, bonds ) );
-        bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
-    }
-    Delete_List( bonds );
-    Allocate_Bond_List( n, bond_top, bonds );
-    *num_bonds = bond_top[n-1];
-    free( bond_top );
-    return 1;
-void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n)
-    extern __shared__ int sh_input[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0;
-    if(i < n)
-    {
-        x = SQR (Num_Entries( i, &bonds ) );
-        bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS );
-    }
-    sh_input[threadIdx.x] = x;
-    __syncthreads();
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if(threadIdx.x < offset)
-        {   
-            sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
-        }   
-        __syncthreads();
-    }
-    if(threadIdx.x == 0)
-    {
-        per_block_results[blockIdx.x] = sh_input[0];
-    }
 int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body )
     int i;
@@ -437,7 +215,7 @@ int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body )
         b_top[i] = MAX((b_end [i] - b_start[i])*2, MIN_BONDS);
-    Delete_List( bonds, TYP_DEVICE );
+    Cuda_Delete_List( bonds );
     Cuda_Allocate_Bond_List(n, b_top, bonds );
@@ -450,6 +228,7 @@ int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body )
     return i;
 int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count )
     int i;
@@ -479,10 +258,10 @@ int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count )
     new_total = thb_total;
     new_count = count;
-    Delete_List( thblist, TYP_DEVICE );
+    Cuda_Delete_List( thblist );
     /*Allocate the list */
-    if(!Make_List( new_count, new_total, TYP_THREE_BODY, thblist, TYP_DEVICE )){
+    if(!Cuda_Make_List( new_count, new_total, TYP_THREE_BODY, thblist )){
         fprintf(stderr, "Problem in reallocating three-body list. Terminating!\n");
         exit( INIT_ERR );
@@ -523,14 +302,14 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH );
  cudaThreadSynchronize ();
  cudaCheckError ();
- Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
+ Cuda_reduction_int<<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
  (d_bond_top + n, d_bond_top + n + BLOCKS_POW_2, BLOCKS_POW_2); 
  cudaThreadSynchronize ();
  copy_host_device (bond_top, d_bond_top, n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
  copy_host_device (est_3body, d_bond_top + n + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, __LINE__);
- Delete_List( bonds, TYP_DEVICE );
+ Cuda_Delete_List( bonds );
  Cuda_Allocate_Bond_List( n, bond_top, bonds );
  *num_bonds = bond_top[n-1];
@@ -542,83 +321,6 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH );
-void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
-        int nbr_flag )
-    int num_bonds, est_3body;
-    reallocate_data *realloc;
-    grid *g;
-    realloc = &(workspace->realloc);
-    g = &(system->g);
-    if( realloc->num_far > 0 && nbr_flag ) {
-        fprintf (stderr, " Reallocating neighbors \n");
-        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
-                system->N, realloc->num_far * SAFE_ZONE );
-        realloc->num_far = -1;
-    }
-    if( realloc->Htop > 0 ){
-        fprintf (stderr, " Reallocating Matrix \n");
-        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
-        realloc->Htop = -1;
-        Deallocate_Matrix( &workspace->L );
-        Deallocate_Matrix( &workspace->U );
-    }
-    if( realloc->hbonds > 0 ){
-        fprintf (stderr, " Reallocating hbonds \n");
-        Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-                (*lists)+HBONDS );
-        realloc->hbonds = -1;
-    }
-    num_bonds = est_3body = -1;
-    if( realloc->bonds > 0 ){
-        fprintf (stderr, " Reallocating bonds \n");
-        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
-        realloc->bonds = -1;
-        realloc->num_3body = MAX( realloc->num_3body, est_3body );
-    }
-    if( realloc->num_3body > 0 ) {
-        fprintf (stderr, " Reallocating 3Body \n");
-        Delete_List( (*lists)+THREE_BODIES );
-        if( num_bonds == -1 )
-            num_bonds = ((*lists)+BONDS)->num_intrs;
-        realloc->num_3body *= SAFE_ZONE;
-        if( !Make_List( num_bonds, realloc->num_3body,
-                    TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) {
-            fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-            exit( INIT_ERR );
-        }
-        realloc->num_3body = -1;
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "reallocating 3 bodies\n" );
-        fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
-        fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
-        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
-                realloc->num_3body*sizeof(three_body_interaction_data)/
-                (1024*1024) );
-    }
-    if( realloc->gcell_atoms > -1 ){
-#if defined(DEBUG_FOCUS)
-        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
-        free (g->atoms);
-        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
-                sizeof (int) * workspace->realloc.gcell_atoms);
-        realloc->gcell_atoms = -1;
-    }
 void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *lists, 
         int nbr_flag, int step )
@@ -724,3 +426,68 @@ void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *list
         realloc->gcell_atoms = -1;
+GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N )
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    if( h_index[index] == 0 ){
+        Set_Start_Index( 0, 0, &hbonds ); 
+        Set_End_Index( 0, 0, &hbonds ); 
+    }
+    else if( h_index[index] > 0 ){
+        Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); 
+        Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); 
+    }
+GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N )
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    if( index == 0 ){
+        Set_Start_Index( 0, 0, &bonds ); 
+        Set_End_Index( 0, 0, &bonds ); 
+    }
+    else if( index > 0 ){
+        Set_Start_Index( index, b_top[index-1], &bonds ); 
+        Set_End_Index( index, b_top[index-1], &bonds ); 
+    }
+void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n)
+    extern __shared__ int sh_input[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+    if(i < n)
+    {
+        x = SQR (Num_Entries( i, &bonds ) );
+        bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS );
+    }
+    sh_input[threadIdx.x] = x;
+    __syncthreads();
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {   
+            sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
+        }   
+        __syncthreads();
+    }
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sh_input[0];
+    }
diff --git a/PuReMD-GPU/src/cuda_allocate.h b/PuReMD-GPU/src/cuda_allocate.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc672d3bd76c1afa7a468aa2fdda3dd0ca3d3ec9
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_allocate.h
@@ -0,0 +1,41 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_ALLOCATE_H_
+#define __CUDA_ALLOCATE_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+int Cuda_Allocate_Matrix( sparse_matrix*, int, int );
+int Cuda_Allocate_HBond_List( int, int, int*, int*, list* );
+int Cuda_Allocate_Bond_List( int, int*, list* );
+void Cuda_Reallocate( reax_system*, static_storage*, list*, int, int );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_bond_orders.cu b/PuReMD-GPU/src/cuda_bond_orders.cu
new file mode 100644
index 0000000000000000000000000000000000000000..81a7462033a3da94d8ae601288efb83a93e387f2
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_bond_orders.cu
@@ -0,0 +1,857 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_bond_orders.h"
+#include "bond_orders.h"
+#include "list.h"
+#include "lookup.h"
+#include "print_utils.h"
+#include "vector.h"
+#include "index_utils.h"
+#include "cuda_utils.h"
+#include "cuda_helpers.h"
+HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, 
+        simulation_data *data, static_storage *workspace, 
+        list *bonds )
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji; 
+    dbond_coefficients coef;
+    rvec temp, ext_press;
+    ivec rel_box;
+    int pk, k, j;
+    /* Initializations */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+    bo_ij = &(nbr_j->bo_data);
+    bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    /************************************
+     * forces related to atom i          *
+     * first neighbors of atom i         *
+     ************************************/
+    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+        rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
+        rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
+        rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
+        /* force */
+        rvec_Add( atoms[k].f, temp );
+        /* pressure */
+        rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
+        rvec_Add( data->ext_press, ext_press );
+    }
+    /* then atom i itself  */
+    rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd, dBO*/
+    rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/
+    rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );         /*1st,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
+    rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
+    rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
+    rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
+    /* force */
+    rvec_Add( atoms[i].f, temp );
+    /* ext pressure contribution of i is dropped; counting the force on j only is enough */
+    /****************************************************************************
+     * forces and pressure related to atom j                                    *
+     * first neighbors of atom j                                                *
+     ***************************************************************************/
+    for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+        rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
+        rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ 
+        rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
+        /* force */
+        rvec_Add( atoms[k].f, temp );
+        /* pressure */
+        if( k != i ) {
+            ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
+            rvec_iMultiply( ext_press, rel_box, temp );
+            rvec_Add( data->ext_press, ext_press );
+        }
+    }
+    /* then atom j itself */
+    rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                     /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );   /*2nd, dBO*/
+    rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );              /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/
+    rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
+    rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/
+    rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
+    rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
+    rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
+    /* force */
+    rvec_Add( atoms[j].f, temp );
+    /* pressure */
+    rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
+    rvec_Add( data->ext_press, ext_press );
+HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, 
+        static_storage *workspace, list *bonds )
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji; 
+    dbond_coefficients coef;
+    int pk, k, j;
+    rvec t_f;
+    /* Initializations */ 
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+    if (i < j)
+    {
+        bo_ij = &(nbr_j->bo_data);
+        bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    } else {
+        bo_ji = &(nbr_j->bo_data);
+        bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    }
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    if ( i < j) {
+        for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            k = nbr_k->nbr;
+            rvec_MakeZero (t_f);
+            rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); 
+            /*2nd, dBO*/
+            rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
+            /*dDelta*/
+            rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
+            /*3rd, dBOpi*/
+            rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
+            /*3rd, dBOpi2*/
+            // Accumulate into this bond's temporary force slot (t_f)
+            rvec_Add (nbr_k->t_f, t_f);
+        }
+        rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp );
+        /*1st, dBO*/
+        rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
+        /*2nd, dBO*/
+        rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
+        /*1st, dBO*/
+        rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
+        /*2nd, dBO*/
+        rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
+        /*1st, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
+        /*2nd, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
+        /*3rd, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+        /*1st, dBO_pi2*/
+        rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
+        /*2nd, dBO_pi2*/
+        rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
+        /*3rd, dBO_pi2*/
+    }
+    else 
+    {
+        for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            k = nbr_k->nbr;
+            rvec_MakeZero (t_f);
+            rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
+            /*3rd, dBO*/
+            rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
+            /*dDelta*/ 
+            rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
+            /*4th, dBOpi*/
+            rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
+            /*4th, dBOpi2*/
+            // Accumulate into this bond's temporary force slot (t_f)
+            rvec_Add (nbr_k->t_f, t_f);
+        }
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp );
+        /*1st, dBO*/
+        rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] );
+        /*2nd, dBO*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp );
+        /*1st, dBO*/
+        rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]);
+        /*2nd, dBO*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
+        /*1st, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp );
+        /*2nd, dBOpi*/
+        rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]);
+        /*3rd, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+        /*1st, dBOpi2*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp );
+        /*2nd, dBOpi2*/
+        rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]);
+        /*3rd, dBOpi2*/
+    }
+HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds)
+    int pk;
+    bond_data *nbr_k, *nbr_k_sym;
+    /*
+       for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+       nbr_k = &(bonds->select.bond_list[pk]);
+       rvec_Add (atoms[i].f, nbr_k->t_f);
+       }
+     */
+    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] );
+        rvec_Add (atoms[i].f, nbr_k_sym->t_f);
+    }
+GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
+        static_storage workspace, int num_atom_types, int N )
+    int i, type_i;
+    real p_boc1, p_boc2;
+    single_body_parameters *sbp_i;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+    p_boc1 = g_params.l[0];
+    p_boc2 = g_params.l[1];
+    /* Calculate Deltaprime, Deltaprime_boc values */
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
+    workspace.Deltap_boc[i] = 
+        workspace.total_bond_order[i] - sbp_i->valency_val;
+    workspace.total_bond_order[i] = 0;
+/* A crucial assumption here is that each segment of nbrhoods->nbr_list
+   belonging to a given atom is sorted on its own.
+   This can be done either in the general coordinator function or here. */
+GLOBAL void Cuda_Calculate_Bond_Orders (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
+        two_body_parameters *tbp, static_storage workspace, list bonds,
+        list dDeltas, list dBOs, int num_atom_types, int N )
+    int i, j, pj, type_i, type_j;
+    int start_i, end_i;
+    int num_bonds, sym_index;
+    real p_boc1, p_boc2;
+    real val_i, Deltap_i, Deltap_boc_i;
+    real val_j, Deltap_j, Deltap_boc_j;
+    real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+    real exp_p1i,    exp_p2i, exp_p1j, exp_p2j;
+    real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+    real Cf45_ij, Cf45_ji, p_lp1;
+    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+    real explp1;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij, *bo_ji;
+    single_body_parameters *sbp_i, *sbp_j;
+#if defined(TEST_FORCES)
+    int  k, pk, start_j, end_j;
+    int  top_dbo=0, top_dDelta=0;
+    dbond_data *pdbo;
+    dDelta_data *ptop_dDelta;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+    num_bonds = 0;
+    p_boc1 = g_params.l[0];
+    p_boc2 = g_params.l[1];
+    /* Calculate Deltaprime, Deltaprime_boc values */
+    //for( i = 0; i < system->N; ++i ) {
+    /*
+       if (i < N) {
+       type_i = atoms[i].type;
+       sbp_i = &(sbp[type_i]);
+       workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
+       workspace.Deltap_boc[i] = 
+       workspace.total_bond_order[i] - sbp_i->valency_val;
+       workspace.total_bond_order[i] = 0;
+       }
+       __syncthreads ();
+     */
+    // fprintf( stderr, "done with uncorrected bond orders\n" );
+    /* Corrected Bond Order calculations */
+    //for( i = 0; i < system->N; ++i ) {
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    val_i = sbp_i->valency;
+    Deltap_i = workspace.Deltap[i];
+    Deltap_boc_i = workspace.Deltap_boc[i];
+    start_i = Start_Index(i, &bonds);
+    end_i = End_Index(i, &bonds);
+    //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
+    //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
+    for( pj = start_i; pj < end_i; ++pj ) {
+        j = bonds.select.bond_list[pj].nbr;
+        type_j = atoms[j].type;
+        bo_ij = &( bonds.select.bond_list[pj].bo_data );
+        //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
+        if( i < j ) {
+            twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );          
+            Set_Start_Index( pj, top_dbo, &dBOs );
+            /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
+               workspace->reverse_map[i], workspace->reverse_map[j], 
+               twbp->ovc, twbp->v13cor, bo_ij->BO ); */
+            if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
+                /* There is no correction to bond orders nor to derivatives of 
+                   bond order prime! So we leave bond orders unchanged and 
+                   set derivative of bond order coefficients s.t. 
+                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
+                bo_ij->C1dbo = 1.000000;
+                bo_ij->C2dbo = 0.000000;
+                bo_ij->C3dbo = 0.000000; 
+                bo_ij->C1dbopi = bo_ij->BO_pi;
+                bo_ij->C2dbopi = 0.000000;
+                bo_ij->C3dbopi = 0.000000;
+                bo_ij->C4dbopi = 0.000000;
+                bo_ij->C1dbopi2 = bo_ij->BO_pi2; 
+                bo_ij->C2dbopi2 = 0.000000;
+                bo_ij->C3dbopi2 = 0.000000;
+                bo_ij->C4dbopi2 = 0.000000;
+                pdbo = &(dBOs.select.dbo_list[ top_dbo ]);
+                // compute dBO_ij/dr_i
+                pdbo->wrt = i;
+                rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
+                // compute dBO_ij/dr_j
+                pdbo++;
+                pdbo->wrt = j;
+                rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
+                rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
+                top_dbo += 2;
+            }
+            else {
+                val_j = sbp[type_j].valency;
+                Deltap_j = workspace.Deltap[j];
+                Deltap_boc_j = workspace.Deltap_boc[j];
+                /* on page 1 */
+                if( twbp->ovc >= 0.001 ) {
+                    /* Correction for overcoordination */        
+                    exp_p1i = EXP( -p_boc1 * Deltap_i );
+                    exp_p2i = EXP( -p_boc2 * Deltap_i );
+                    exp_p1j = EXP( -p_boc1 * Deltap_j );
+                    exp_p2j = EXP( -p_boc2 * Deltap_j );
+                    f2 = exp_p1i + exp_p1j;            
+                    f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+                    f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + 
+                            ( val_j + f2 )/( val_j + f2 + f3 ) );
+                    /*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
+                      i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
+                      fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n", 
+                      f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
+                    /* Now come the derivatives */
+                    /* Bond Order pages 5-7, derivative of f1 */
+                    temp = f2 + f3;
+                    u1_ij = val_i + temp;
+                    u1_ji = val_j + temp;
+                    Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
+                    Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + 
+                            ( u1_ji - f3 ) / SQR( u1_ji ));
+                    //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
+                    //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                    Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - 
+                            ((val_i+f2) / SQR(u1_ij)) * 
+                            ( -p_boc1 * exp_p1i + 
+                              exp_p2i / ( exp_p2i + exp_p2j ) ) + 
+                            -p_boc1 * exp_p1i / u1_ji - 
+                            ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +  
+                            exp_p2i / ( exp_p2i + exp_p2j ) ));
+                    Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + 
+                        Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); 
+                    //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
+                }
+                else {
+                    /* No overcoordination correction! */
+                    f1 = 1.0;
+                    Cf1_ij = Cf1_ji = 0.0;          
+                }
+                if( twbp->v13cor >= 0.001 ) {
+                    /* Correction for 1-3 bond orders */
+                    exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
+                                Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                    exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
+                                Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+                    f4 = 1. / (1. + exp_f4);
+                    f5 = 1. / (1. + exp_f5);
+                    f4f5 = f4 * f5;
+                    /* Bond Order pages 8-9, derivative of f4 and f5 */
+                    /*temp = twbp->p_boc5 - 
+                      twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
+                      u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
+                      u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
+                      Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
+                      Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
+                    Cf45_ij = -f4 * exp_f4;
+                    Cf45_ji = -f5 * exp_f5;
+                }
+                else {
+                    f4 = f5 = f4f5 = 1.0;
+                    Cf45_ij = Cf45_ji = 0.0;
+                }
+                /* Bond Order page 10, derivative of total bond order */
+                A0_ij = f1 * f4f5;
+                A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * 
+                    (Cf45_ij + Cf45_ji);
+                A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                A3_ij = A2_ij + Cf1_ij / f1;
+                A3_ji = A2_ji + Cf1_ji / f1;
+                /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f 
+A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
+bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
+                /* find corrected bond order values and their deriv coefs */
+                bo_ij->BO    = bo_ij->BO    * A0_ij;
+                bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
+                bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
+                bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+                bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                bo_ij->C3dbo = bo_ij->BO * A2_ji; 
+                bo_ij->C1dbopi = f1*f1*f4*f5;
+                bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+                bo_ij->C1dbopi2 = f1*f1*f4*f5;
+                bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", 
+                  i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
+                /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
+                workspace->orig_id[i], workspace->orig_id[j]
+                A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
+                bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
+                bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, 
+                bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
+                bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
+                ); */
+                Calculate_dBO( i, pj, workspace, lists, &top_dbo );
+            }
+            /* neglect bonds that are < 1e-10 */
+            if( bo_ij->BO < 1e-10 )
+                bo_ij->BO = 0.0;
+            if( bo_ij->BO_s < 1e-10 )
+                bo_ij->BO_s = 0.0;
+            if( bo_ij->BO_pi < 1e-10 )
+                bo_ij->BO_pi = 0.0;
+            if( bo_ij->BO_pi2 < 1e-10 )
+                bo_ij->BO_pi2 = 0.0;
+            workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+            /* fprintf( stderr, "%d %d\t%g %g %g %g\n
+Cdbo:\t%g %g %g\n
+Cdbopi:\t%g %g %g %g\n
+Cdbopi2:%g %g %g %g\n\n", 
+i+1, j+1, bonds->select.bond_list[ pj ].d, 
+bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, 
+bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
+bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi,
+bo_ij->C1dbopi2, bo_ij->C2dbopi2, 
+bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
+            /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
+               i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
+            Set_End_Index( pj, top_dbo, &dBOs );
+            //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+        }
+        /*
+           else {
+        // We only need to update bond orders from bo_ji
+        //   everything else is set in uncorrected_bo calculations 
+        sym_index = bonds.select.bond_list[pj].sym_index;
+        bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
+        bo_ij->BO = bo_ji->BO;
+        bo_ij->BO_s = bo_ji->BO_s;
+        bo_ij->BO_pi = bo_ji->BO_pi;
+        bo_ij->BO_pi2 = bo_ji->BO_pi2;
+        workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+        //Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta );
+         */
+    }
+#ifdef TEST_FORCES 
+    // fprintf( stderr, "dDelta computations\nj:" );
+    Set_Start_Index( i, top_dDelta, &dDeltas );
+    ptop_dDelta = &( dDeltas.select.dDelta_list[top_dDelta] );
+    for( pj = start_i; pj < end_i; ++pj ) {
+        j = bonds.select.bond_list[pj].nbr;
+        // fprintf( stderr, "%d  ", j );
+        if( !rvec_isZero( workspace.dDelta[j] ) ) {
+            ptop_dDelta->wrt = j;
+            rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] );
+            rvec_MakeZero( workspace.dDelta[j] );
+            ++top_dDelta, ++ptop_dDelta;
+        }
+        start_j = Start_Index(j, &bonds);
+        end_j = End_Index(j, &bonds);     
+        for( pk = start_j; pk < end_j; ++pk ) {
+            k = bonds.select.bond_list[pk].nbr;    
+            if( !rvec_isZero( workspace.dDelta[k] ) ) {
+                ptop_dDelta->wrt = k;
+                rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] );
+                rvec_MakeZero( workspace.dDelta[k] );
+                ++top_dDelta, ++ptop_dDelta;
+            }
+        }
+    }
+    Set_End_Index( i, top_dDelta, &dDeltas );
+    /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj )
+      fprintf( stdout, "dDel: %d %d [%g %g %g]\n",
+      i+1, dDeltas->select.dDelta_list[pj].wrt+1,
+      dDeltas->select.dDelta_list[pj].dVal[0], 
+      dDeltas->select.dDelta_list[pj].dVal[1], 
+      dDeltas->select.dDelta_list[pj].dVal[2] );*/
+    //}
+    /*fprintf(stderr,"\tCalculated actual bond orders ...\n" );
+      fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", 
+      "atom", "Delta", "Delta_e", "Delta_boc", "nlp", 
+      "Delta_lp", "Clp", "dDelta_lp" );*/
+    /*
+       p_lp1 = g_params.l[15];
+    //get the kernel ID for the following computation
+    j = i;
+    // Calculate some helper variables that are  used at many places 
+    //  throughout force calculations 
+    //for( j = 0; j < system->N; ++j ) {
+    type_j = atoms[j].type;
+    sbp_j = &(sbp[ type_j ]);
+    workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency;
+    workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e;
+    workspace.Delta_boc[j] = workspace.total_bond_order[j] - 
+    sbp_j->valency_boc;
+    workspace.vlpex[j] =  workspace.Delta_e[j] - 
+    2.0 * (int)(workspace.Delta_e[j]/2.0);
+    explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j]));
+    workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0);
+    workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j];
+    workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]);
+    // Adri uses different dDelta_lp values than the ones in notes... //
+    workspace.dDelta_lp[j] = workspace.Clp[j];
+    //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
+    //((fabs(workspace->Delta_e[j]/2.0 - 
+    //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
+    if( sbp_j->mass > 21.0 ) {
+    workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+    workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
+    workspace.dDelta_lp_temp[j] = 0.;
+    }
+    else {
+    workspace.nlp_temp[j] = workspace.nlp[j];
+    workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
+    workspace.dDelta_lp_temp[j] = workspace.Clp[j];
+    }
+    //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
+    //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
+    //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
+    //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
+    //}
+     */
+    //Print_Bonds( system, bonds, "sbonds.out" );
+#if defined(DEBUG)
+    //fprintf( stderr, "Number of bonds: %d\n", num_bonds );
+    //Print_Bond_Orders( system, control, data, workspace, lists, out_control );
+/* Kernel: one thread per atom i (i = blockIdx.x * blockDim.x + threadIdx.x;
+ * threads with i >= N exit immediately).
+ * For every bond entry (i, j) of atom i with i >= j, copies BO, BO_s, BO_pi
+ * and BO_pi2 from the symmetric entry (located through sym_index) into this
+ * entry and adds the copied BO to workspace.total_bond_order[i].
+ * Entries with i < j are not modified by this kernel. */
+GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage workspace, list bonds, int N )
+    int i, j, pj;
+    int start_i, end_i;
+    int sym_index;
+    bond_order_data *bo_ij, *bo_ji;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+    // range of bond-list entries that belong to atom i
+    start_i = Start_Index(i, &bonds);
+    end_i = End_Index(i, &bonds);
+    for( pj = start_i; pj < end_i; ++pj ) {
+        j = bonds.select.bond_list[pj].nbr;
+        bo_ij = &( bonds.select.bond_list[pj].bo_data );
+        if( i >= j ) {
+            // We only need to update bond orders from bo_ji
+            //   everything else is set in uncorrected_bo calculations 
+            sym_index = bonds.select.bond_list[pj].sym_index;
+            bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
+            bo_ij->BO = bo_ji->BO;
+            bo_ij->BO_s = bo_ji->BO_s;
+            bo_ij->BO_pi = bo_ji->BO_pi;
+            bo_ij->BO_pi2 = bo_ji->BO_pi2;
+            // each thread accumulates only into its own slot i
+            workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+        }
+    }
+/* Kernel: one thread per atom j (j = blockIdx.x * blockDim.x + threadIdx.x;
+ * threads with j >= N exit immediately).
+ * From workspace.total_bond_order[j] and the single-body parameters of the
+ * atom's type, fills the per-atom helper arrays Delta, Delta_e, Delta_boc,
+ * vlpex, nlp, Delta_lp, Clp, dDelta_lp, nlp_temp, Delta_lp_temp and
+ * dDelta_lp_temp in workspace. Each thread writes only index j. */
+GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
+        static_storage workspace, int N )
+    int j, type_j;
+    real explp1;
+    real p_lp1;
+    single_body_parameters *sbp_j;
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+    p_lp1 = g_params.l[15];
+    /* Calculate some helper variables that are  used at many places 
+       throughout force calculations */
+    //for( j = 0; j < system->N; ++j ) {
+    type_j = atoms[j].type;
+    sbp_j = &(sbp[ type_j ]);
+    workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency;
+    workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e;
+    workspace.Delta_boc[j] = workspace.total_bond_order[j] - 
+        sbp_j->valency_boc;
+    /* Delta_e minus twice the truncated value of Delta_e / 2 */
+    workspace.vlpex[j] =  workspace.Delta_e[j] - 
+        2.0 * (int)(workspace.Delta_e[j]/2.0);
+    explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j]));
+    workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0);
+    workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j];
+    workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]);
+    /* Adri uses different dDelta_lp values than the ones in notes... */
+    workspace.dDelta_lp[j] = workspace.Clp[j];
+    //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
+    //((fabs(workspace->Delta_e[j]/2.0 - 
+    //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
+    /* atom types with mass > 21.0 get a fixed nlp_temp computed from the
+       valencies and a zero derivative; all others reuse nlp and Clp */
+    if( sbp_j->mass > 21.0 ) {
+        workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+        workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
+        workspace.dDelta_lp_temp[j] = 0.;
+    }
+    else {
+        workspace.nlp_temp[j] = workspace.nlp[j];
+        workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
+        workspace.dDelta_lp_temp[j] = workspace.Clp[j];
+    }
+    //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
+    //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
+    //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
+    //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
+    //}
+//Import from the forces file. 
+/* Kernel: one thread per atom i (i = blockIdx.x * blockDim.x + threadIdx.x;
+ * threads with i >= N do nothing).
+ * Walks every bond-list entry pj of atom i and calls
+ * Cuda_Add_dBond_to_Forces when ensemble is NVE, NVT or bNVT, and
+ * Cuda_Add_dBond_to_Forces_NPT (which additionally receives data) for any
+ * other ensemble value. */
+GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, 
+        static_storage workspace, list p_bonds, int ensemble, int N)
+    int i, pj;
+    // the list helpers take a pointer, so address the by-value kernel argument
+    list *bonds = &p_bonds;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) 
+    {
+        for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj)
+        {
+            //int j = bonds->select.bond_list[pj].nbr;
+            if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
+                Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds );
+            else 
+                Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds );
+        }
+    }
+/* Kernel: one thread per atom i (i = blockIdx.x * blockDim.x + threadIdx.x;
+ * threads with i >= N do nothing).
+ * For the NVE, NVT and bNVT ensembles calls Cuda_dbond_to_Forces_postprocess
+ * for atom i; for any other ensemble value the kernel performs no work.
+ * The data and workspace arguments are not referenced in this kernel. */
+GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, 
+        static_storage workspace, list p_bonds, int ensemble, int N)
+    int i;
+    // the post-process helper takes a pointer, so address the by-value argument
+    list *bonds = &p_bonds;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) 
+    {
+        if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
+            Cuda_dbond_to_Forces_postprocess (i, atoms, bonds );
+    }
diff --git a/PuReMD-GPU/src/cuda_bond_orders.h b/PuReMD-GPU/src/cuda_bond_orders.h
new file mode 100644
index 0000000000000000000000000000000000000000..4015b9fa34aa1e66843efc8dc1a49f1c75da749c
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_bond_orders.h
@@ -0,0 +1,48 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_BOND_ORDERS_H_
+#define __CUDA_BOND_ORDERS_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *, global_parameters , single_body_parameters *,
+        static_storage , int , int );
+GLOBAL void Cuda_Calculate_Bond_Orders ( reax_atom *, global_parameters , single_body_parameters *,
+        two_body_parameters *, static_storage , list , list , list , int , int );
+GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage , list , int );
+GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *, global_parameters , single_body_parameters *,
+        static_storage , int );
+GLOBAL void Cuda_Compute_Total_Force (reax_atom *, simulation_data *, static_storage , list , int , int );
+GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *, simulation_data *, static_storage , list , int , int );
+//HOST_DEVICE void Cuda_Add_dBond_to_Forces( int, int, reax_atom *, static_storage*, list* );
+//HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int, int, reax_atom *, simulation_data*, static_storage*, list* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/helpers.cu b/PuReMD-GPU/src/cuda_box.cu
similarity index 87%
rename from PuReMD-GPU/src/helpers.cu
rename to PuReMD-GPU/src/cuda_box.cu
index 29ae31e305598009053100ae239ac6dbeed59d44..a9eb16faac9970f6bc89eeec4a42241cd30f0724 100644
--- a/PuReMD-GPU/src/helpers.cu
+++ b/PuReMD-GPU/src/cuda_box.cu
@@ -18,11 +18,13 @@
+#include "cuda_helpers.h"
-#include "helpers.h"
 #include "box.h"
-GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box *box, real d1, real d2, real d3)
+GLOBAL void k_compute_Inc_on_T3(reax_atom *atoms, unsigned int N,
+    simulation_box *box, real d1, real d2, real d3)
     int index = blockIdx.x * blockDim.x + threadIdx.x;
     rvec dx;
@@ -31,5 +33,7 @@ GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box
     dx[2] = d3;
     if (index < N )
+    {
         Inc_on_T3( atoms[index].x, dx, box );
+    }
diff --git a/PuReMD-GPU/src/matvec.h b/PuReMD-GPU/src/cuda_box.h
similarity index 83%
rename from PuReMD-GPU/src/matvec.h
rename to PuReMD-GPU/src/cuda_box.h
index e032febd4ef79968a7334e43c07bfb1db5e6f49c..913abc15867cf5ec9d8e87dad0fa56198de9b1ff 100644
--- a/PuReMD-GPU/src/matvec.h
+++ b/PuReMD-GPU/src/cuda_box.h
@@ -18,13 +18,14 @@
-#ifndef __MATVEC__H_
-#define __MATVEC__H_
+#ifndef __CUDA_BOX_H__
+#define __CUDA_BOX_H__
 #include "mytypes.h"
-GLOBAL void Cuda_Matvec (sparse_matrix , real *, real *, int );
-GLOBAL void Cuda_Matvec_csr (sparse_matrix , real *, real *, int );
+GLOBAL void k_compute_Inc_on_T3 (reax_atom *atoms, unsigned int N,
+    simulation_box *box, real d1, real d2, real d3);
diff --git a/PuReMD-GPU/src/center_mass.cu b/PuReMD-GPU/src/cuda_center_mass.cu
similarity index 91%
rename from PuReMD-GPU/src/center_mass.cu
rename to PuReMD-GPU/src/cuda_center_mass.cu
index ea8f799846b0d8c794007762ba18869bc9686787..158d3a16489f20362bd854312eb39aa0dcec57e8 100644
--- a/PuReMD-GPU/src/center_mass.cu
+++ b/PuReMD-GPU/src/cuda_center_mass.cu
@@ -18,17 +18,13 @@
+#include "cuda_center_mass.h"
-#include "center_mass.h"
 #include "vector.h"
-GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-        rvec *res_xcm, 
-        rvec *res_vcm, 
-        rvec *res_amcm, 
-        size_t n)
+GLOBAL void k_center_of_mass_blocks( single_body_parameters *sbp, reax_atom *atoms,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
     extern __shared__ rvec xcm[];
     extern __shared__ rvec vcm[];
@@ -76,13 +72,9 @@ GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms
-GLOBAL void center_of_mass (rvec *xcm, 
-        rvec *vcm, 
-        rvec *amcm, 
-        rvec *res_xcm,
-        rvec *res_vcm,
-        rvec *res_amcm,
-        size_t n)
+GLOBAL void k_center_of_mass( rvec *xcm, rvec *vcm, rvec *amcm,
+        rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n )
     extern __shared__ rvec sh_xcm[];
     extern __shared__ rvec sh_vcm[];
@@ -131,11 +123,9 @@ GLOBAL void center_of_mass (rvec *xcm,
-GLOBAL void compute_center_mass (single_body_parameters *sbp, 
-        reax_atom *atoms,
-        real *results, 
-        real xcm0, real xcm1, real xcm2,
-        size_t n)
+GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp, reax_atom *atoms,
+        real *results, real xcm0, real xcm1, real xcm2, size_t n )
     extern __shared__ real xx[];
     extern __shared__ real xy[];
@@ -160,11 +150,11 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp,
     xcm[1] = xcm1;
     xcm[2] = xcm2;
     xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
         yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-    if (i < n){
+    if (i < n)
+    {
         m = sbp[ atoms[i].type ].mass;
         rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
         xx[ xx_i ] = diff[0] * diff[0] * m;
@@ -176,8 +166,10 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp,
     __syncthreads ();
-    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-        if (threadIdx.x < offset){
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset) 
+        {
             index = threadIdx.x + offset;
             xx[ threadIdx.x ] += xx[ index ];
             xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
@@ -189,7 +181,8 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp,
         __syncthreads ();
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         results [ blockIdx.x*6 ] = xx [ 0 ];
         results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
         results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
@@ -199,7 +192,8 @@ GLOBAL void compute_center_mass (single_body_parameters *sbp,
-GLOBAL void compute_center_mass (real *input, real *output, size_t n)
+GLOBAL void k_compute_center_mass( real *input, real *output, size_t n )
     extern __shared__ real xx[];
     extern __shared__ real xy[];
diff --git a/PuReMD-GPU/src/cuda_center_mass.h b/PuReMD-GPU/src/cuda_center_mass.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c1d76ec63defe3cb6a0738a2843b10d02104bb5
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_center_mass.h
@@ -0,0 +1,44 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_CENTER_MASS_H__
+#define __CUDA_CENTER_MASS_H__
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void k_center_of_mass_blocks( single_body_parameters *, reax_atom *,
+    rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n ); 
+GLOBAL void k_center_of_mass( rvec *xcm,
+    rvec *vcm, rvec *amcm, rvec *res_xcm, rvec *res_vcm, rvec *res_amcm, size_t n );
+GLOBAL void k_compute_center_mass_sbp( single_body_parameters *sbp,
+    reax_atom *atoms, real *results, real xcm0, real xcm1, real xcm2, size_t n );
+GLOBAL void k_compute_center_mass( real *input, real *output, size_t n );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu
index 2db79e3718e0c5ea3c54913e29147c71735552f9..1f50dbf3c74fae4d9ab73149d02c51dbec8ca10e 100644
--- a/PuReMD-GPU/src/cuda_copy.cu
+++ b/PuReMD-GPU/src/cuda_copy.cu
@@ -18,91 +18,96 @@
+#include "cuda_copy.h"
+#include "cuda_list.h"
-#include "cuda_copy.h"
 #include "vector.h"
-void Sync_Host_Device (grid *host, grid *dev, enum cudaMemcpyKind dir)
+void Sync_Host_Device_Grid( grid *host, grid *dev, enum cudaMemcpyKind dir )
-    copy_host_device (host->top, dev->top, 
-            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP);
+    copy_host_device( host->top, dev->top, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP );
-    copy_host_device (host->mark, dev->mark, 
-            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK);
+    copy_host_device( host->mark, dev->mark, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK );
-    copy_host_device (host->start, dev->start, 
-            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START);
+    copy_host_device( host->start, dev->start, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START );
-    copy_host_device (host->end, dev->end, 
-            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END);
+    copy_host_device( host->end, dev->end, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END );
-    copy_host_device (host->atoms, dev->atoms, 
-            INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS);
+    copy_host_device( host->atoms, dev->atoms, 
+            INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS );
-    copy_host_device (host->nbrs, dev->nbrs, 
-            IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS);
+    copy_host_device( host->nbrs, dev->nbrs, 
+            IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS );
-    copy_host_device (host->nbrs_cp, dev->nbrs_cp, 
-            RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP);
+    copy_host_device( host->nbrs_cp, dev->nbrs_cp, 
+            RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP );
-void Sync_Host_Device (reax_system *sys, enum cudaMemcpyKind dir)
+void Sync_Host_Device_Sys( reax_system *sys, enum cudaMemcpyKind dir )
-    copy_host_device (sys->atoms, sys->d_atoms, 
-            REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS);
+    copy_host_device( sys->atoms, sys->d_atoms, 
+            REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS );
-    copy_host_device (&(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX );
+    copy_host_device( &(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX );
     //synch bonds here.
-    copy_host_device (sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, 
+    copy_host_device( sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, 
             dir, RES_REAX_INT_SBP );
-    copy_host_device (sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), 
+    copy_host_device( sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), 
             dir, RES_REAX_INT_TBP );
-    copy_host_device (sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
+    copy_host_device( sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
             dir, RES_REAX_INT_THBP );
-    copy_host_device (sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
+    copy_host_device( sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
             dir, RES_REAX_INT_HBP );
-    copy_host_device (sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4),
+    copy_host_device( sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4),
             dir, RES_REAX_INT_FBP );
-    copy_host_device (sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, 
+    copy_host_device( sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, 
             dir, RES_GLOBAL_PARAMS );
     sys->reaxprm.d_gp.n_global = sys->reaxprm.gp.n_global; 
     sys->reaxprm.d_gp.vdw_type = sys->reaxprm.gp.vdw_type; 
-void Sync_Host_Device (simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir)
+void Sync_Host_Device_Data( simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir )
-    copy_host_device (host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA );
+    copy_host_device( host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA );
-void Sync_Host_Device (sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
+void Sync_Host_Device_Mat( sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
-    copy_host_device ( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-    copy_host_device ( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-    copy_host_device ( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY );
+    copy_host_device( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY );
-    copy_host_device ( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-    copy_host_device ( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-    copy_host_device ( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY );
+    copy_host_device( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY );
-void Sync_Host_Device (output_controls *, control_params *, enum cudaMemcpyKind)
+void Sync_Host_Device_Control( output_controls *, control_params *, enum cudaMemcpyKind )
-void Sync_Host_Device (control_params *host, control_params *device, enum cudaMemcpyKind)
+/* Copies CONTROL_PARAMS_SIZE bytes of control parameters from host to device.
+ * NOTE(review): the unnamed cudaMemcpyKind argument is ignored -- the copy is
+ * always issued with cudaMemcpyHostToDevice. Confirm that no caller relies on
+ * a device-to-host transfer through this function. */
+void Sync_Host_Device_Params( control_params *host, control_params *device, enum cudaMemcpyKind )
-    copy_host_device (host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
+    copy_host_device( host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
-void Prep_Device_For_Output (reax_system *system, simulation_data *data )
+void Prep_Device_For_Output( reax_system *system, simulation_data *data )
     //int size = sizeof (simulation_data) - (2*sizeof (reax_timing) + sizeof (void *));
     //unsigned long start_address = (unsigned long)data->d_simulation_data + (unsigned long) (2 * INT_SIZE + REAL_SIZE);
@@ -112,7 +117,7 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data )
     //fprintf (stderr, "size to copy --> %d \n", size );
     //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
        copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, 
        REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
@@ -126,7 +131,7 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data )
     simulation_data local_data;
-    copy_host_device (&local_data, (simulation_data *)data->d_simulation_data, 
+    copy_host_device( &local_data, (simulation_data *)data->d_simulation_data, 
             SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
     data->E_BE = local_data.E_BE;
     data->E_Ov = local_data.E_Ov;
@@ -141,43 +146,46 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data )
     data->E_vdW = local_data.E_vdW;
     data->E_Ele = local_data.E_Ele;
     data->E_Kin = local_data.E_Kin;
-    rvec_Copy (data->int_press, local_data.int_press);
-    rvec_Copy (data->ext_press, local_data.ext_press);
+    rvec_Copy( data->int_press, local_data.int_press);
+    rvec_Copy( data->ext_press, local_data.ext_press);
     data->kin_press =  local_data.kin_press;
     data->therm.T = local_data.therm.T;
-    //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost );
-    Sync_Host_Device (system, cudaMemcpyDeviceToHost );
+    //Sync_Host_Device_Sys( &system.g, &system.d_g, cudaMemcpyDeviceToHost );
+    Sync_Host_Device_Sys( system, cudaMemcpyDeviceToHost );
-void Sync_Host_Device (list *host, list *device, int type)
+/* Rebuilds the list `host` from the list `device`: discards the existing host
+ * list when host->n > 0, recreates it with device->n entries and
+ * device->num_intrs interactions, then copies index, end_index and the
+ * type-specific interaction array with cudaMemcpyDeviceToHost.
+ * NOTE(review): the previous code used Delete_List/Make_List with TYP_HOST;
+ * the Cuda_Delete_List/Cuda_Make_List replacements take no host/device
+ * selector. Since the copies below write into host->... as the host side of
+ * a device-to-host transfer, confirm that Cuda_Make_List really provides
+ * host-accessible memory for `host`. */
+void Sync_Host_Device_List( list *host, list *device, int type )
     //list is already allocated -- discard it first
     if (host->n > 0)
-        Delete_List (host, TYP_HOST);
+    {
+        Cuda_Delete_List( host );
+    }
     //memory is allocated on the host
-    Make_List(device->n, device->num_intrs, type, host, TYP_HOST );
+    Cuda_Make_List( device->n, device->num_intrs, type, host );
     //memcpy the entries from device to host
-    copy_host_device (host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX );
-    copy_host_device (host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX );
+    copy_host_device( host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX );
+    copy_host_device( host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX );
     switch (type)
         case TYP_BOND:
-            copy_host_device (host->select.bond_list, device->select.bond_list, 
+            copy_host_device( host->select.bond_list, device->select.bond_list, 
                     BOND_DATA_SIZE * device->num_intrs, cudaMemcpyDeviceToHost, LIST_BOND_DATA );
         case TYP_THREE_BODY:
-            copy_host_device (host->select.three_body_list, device->select.three_body_list, 
-                    sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA );
+            copy_host_device( host->select.three_body_list, device->select.three_body_list, 
+                    sizeof( three_body_interaction_data ) * device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA );
-            fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type );
-            exit (1);
+            fprintf( stderr, "Unknown list synching from device to host ---- > %d \n", type );
+            exit( 1 );
diff --git a/PuReMD-GPU/src/cuda_copy.h b/PuReMD-GPU/src/cuda_copy.h
index 561a49fbcce0fbb7b4ff03a38874b0fcadb5d7e8..6b6f38ae3d12d7d00d8cd38eef996d61cb765bd5 100644
--- a/PuReMD-GPU/src/cuda_copy.h
+++ b/PuReMD-GPU/src/cuda_copy.h
@@ -18,24 +18,32 @@
 #ifndef __CUDA_COPY_H_
 #define __CUDA_COPY_H_
 #include "cuda_utils.h"
-#include "cuda.h"
 #include "mytypes.h"
 #include "list.h"
-void Sync_Host_Device (grid *, grid *, enum cudaMemcpyKind);
-void Sync_Host_Device (reax_system *, enum cudaMemcpyKind);
-void Sync_Host_Device (control_params *, control_params *, enum cudaMemcpyKind);
-void Sync_Host_Device (simulation_data *, simulation_data *, enum cudaMemcpyKind);
-void Sync_Host_Device (sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind);
-void Sync_Host_Device (output_controls *, enum cudaMemcpyKind);
-void Prep_Device_For_Output (reax_system *, simulation_data *);
-void Sync_Host_Device (list *host, list *device, int type);
+#ifdef __cplusplus
+extern "C"  {
+void Sync_Host_Device_Grid( grid *, grid *, enum cudaMemcpyKind );
+void Sync_Host_Device_Sys( reax_system *, enum cudaMemcpyKind );
+void Sync_Host_Device_Params( control_params *, control_params *, enum cudaMemcpyKind );
+void Sync_Host_Device_Data( simulation_data *, simulation_data *, enum cudaMemcpyKind );
+void Sync_Host_Device_Mat( sparse_matrix *, sparse_matrix *, enum cudaMemcpyKind );
+/* Parameter list matches the definition in cuda_copy.cu, which takes both an
+ * output_controls and a control_params pointer; a differing prototype inside
+ * extern "C" would declare a separate function that is never defined. */
+void Sync_Host_Device_Control( output_controls *, control_params *, enum cudaMemcpyKind );
+void Prep_Device_For_Output( reax_system *, simulation_data * );
+void Sync_Host_Device_List( list *host, list *device, int type );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_environment.cu b/PuReMD-GPU/src/cuda_environment.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cd6ae50d82b4716aef1d16f2f957dc5f4976cca7
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_environment.cu
@@ -0,0 +1,71 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_environment.h"
+#include "cuda_utils.h"
+/* Bind the calling process to a CUDA device and create the cuBLAS / cuSPARSE
+ * objects.
+ *
+ * rank:          process index; the device used is rank % (number of devices)
+ * nprocs:        not read by this routine
+ * gpus_per_node: not read by this routine
+ *
+ * Terminates the program with exit( 1 ) when no device is found or when the
+ * selected device cannot be made current.
+ *
+ * NOTE(review): the cuBLAS handle, cuSPARSE handle and matrix descriptor are
+ * created into local variables and are neither stored nor destroyed here;
+ * confirm whether they should live in globals shared with the solver code. */
+void Setup_Cuda_Environment( int rank, int nprocs, int gpus_per_node )
+    int deviceCount = 0;
+    cudaError_t flag;
+    cublasHandle_t cublasHandle;
+    cusparseHandle_t cusparseHandle;
+    cusparseMatDescr_t matdescriptor;
+    flag = cudaGetDeviceCount( &deviceCount );
+    if ( flag != cudaSuccess || deviceCount < 1 )
+    {
+        fprintf( stderr, "ERROR: no CUDA capable device(s) found. Terminating...\n" );
+        exit( 1 );
+    }
+    //Calculate the # of GPUs per processor
+    //and assign the GPU for each process
+    //TODO: handle condition where # CPU procs > # GPUs
+    // The selection can fail (e.g. a negative rank yields a negative device
+    // index), so the status is checked instead of being dropped.
+    flag = cudaSetDevice( rank % deviceCount );
+    if ( flag != cudaSuccess )
+    {
+        fprintf( stderr, "ERROR: failed to select CUDA device %d: %s. Terminating...\n",
+                rank % deviceCount, cudaGetErrorString( flag ) );
+        exit( 1 );
+    }
+#if defined(__CUDA_DEBUG__)
+    fprintf( stderr, "p:%d is using GPU: %d \n", rank, rank % deviceCount );
+    //cudaDeviceSetLimit( cudaLimitStackSize, 8192 );
+    //cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
+    //cudaCheckError( );
+    cublasCheckError( cublasCreate(&cublasHandle) );
+    cusparseCheckError( cusparseCreate(&cusparseHandle) );
+    cusparseCheckError( cusparseCreateMatDescr(&matdescriptor) );
+    // Descriptor setters also return a status; check them like the calls above.
+    cusparseCheckError( cusparseSetMatType( matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL ) );
+    cusparseCheckError( cusparseSetMatIndexBase( matdescriptor, CUSPARSE_INDEX_BASE_ZERO ) );
+/* Release the CUDA device used by this process.
+ *
+ * Outstanding device work is drained first and the device is reset last:
+ * synchronizing after the reset would touch the device again once its
+ * resources have already been destroyed. */
+void Cleanup_Cuda_Environment( )
+    cudaDeviceSynchronize( );
+    cudaDeviceReset( );
diff --git a/PuReMD-GPU/src/cuda_environment.h b/PuReMD-GPU/src/cuda_environment.h
new file mode 100644
index 0000000000000000000000000000000000000000..61f811db3354628a3067a3423442f875f16e6871
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_environment.h
@@ -0,0 +1,39 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "mytypes.h"
+#ifdef __cplusplus /* the linkage specification below is only visible to a C++ compiler */
+extern "C"  { /* give the two entry points C linkage */
+void Setup_Cuda_Environment( int, int, int ); /* arguments are (rank, nprocs, gpus_per_node) in the definition in cuda_environment.cu */
+void Cleanup_Cuda_Environment( ); /* takes no arguments in the definition in cuda_environment.cu */
+#ifdef __cplusplus /* second guard; presumably wraps the closing brace of the extern "C" block -- confirm */
diff --git a/PuReMD-GPU/src/cuda_forces.cu b/PuReMD-GPU/src/cuda_forces.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bf277b391ce0df0c5336ea0a0653b6863ca14fec
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_forces.cu
@@ -0,0 +1,2002 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_forces.h"
+#include "box.h"
+#include "forces.h"
+#include "index_utils.h"
+#include "list.h"
+#include "print_utils.h"
+#include "system_props.h"
+#include "vector.h"
+#include "cuda_utils.h"
+#include "cuda_init.h"
+#include "cuda_bond_orders.h"
+#include "cuda_single_body_interactions.h"
+#include "cuda_two_body_interactions.h"
+#include "cuda_three_body_interactions.h"
+#include "cuda_four_body_interactions.h"
+#include "cuda_list.h"
+#include "cuda_QEq.h"
+#include "cuda_reduction.h"
+#include "cuda_system_props.h"
+#include "validation.h"
+#include "cudaProfiler.h"
+/* Launch the device kernels for the bonded interactions and reduce their
+ * per-atom energy contributions into the device-side simulation_data.
+ *
+ * Stages, in launch order:
+ *   1. bond orders
+ *   2. bond energy                                   (E_BE)
+ *   3. lone pair / over- / under-coordination        (E_Lp, E_Ov, E_Un)
+ *   4. three-body estimate, list (re)allocation and
+ *      three-body interactions                       (E_Ang, E_Pen, E_Coa, ext_press)
+ *   5. four-body interactions                        (E_Tor, E_Con, ext_press)
+ *   6. hydrogen bonds, only when control->hb_cut > 0 (E_HB, ext_press)
+ *
+ * system:      reads d_atoms, reaxprm.* and N; writes num_thbodies and init_thblist
+ * control:     reads d_control and hb_cut
+ * data:        reads d_simulation_data (and step in the __CUDA_MEM__ print)
+ * workspace:   not read here; the global dev_workspace is used instead
+ * lists:       not read here; the global dev_lists is used instead
+ * out_control: not read here
+ *
+ * The global `scratch` buffer is reused by every stage as `spad`; each stage
+ * clears the part it uses before launching its kernels.
+ * Every reduction is a two-launch sequence: BLOCKS_POW_2 blocks write into
+ * the second half of a 2*N region, then a single block folds BLOCKS_POW_2
+ * entries into the destination field (presumably one partial per block --
+ * confirm in cuda_reduction). */
+void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
+    real t_start, t_elapsed;
+    real *spad = (real *)scratch;
+    rvec *rvec_spad;
+    //Compute the bonded interactions here.
+    //Step 1: bond orders.
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE);
+    Cuda_Calculate_Bond_Orders_Init<<< BLOCKS, BLOCK_SIZE >>>
+        (  system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
+           *dev_workspace, system->reaxprm.num_atom_types, system->N);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    Cuda_Calculate_Bond_Orders<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
+          system->reaxprm.d_tbp, *dev_workspace, 
+          *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), 
+          system->reaxprm.num_atom_types, system->N );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    Cuda_Update_Uncorrected_BO<<<BLOCKS, BLOCK_SIZE>>>
+        (*dev_workspace, *(dev_lists + BONDS), system->N);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    Cuda_Update_Workspace_After_Bond_Orders<<<BLOCKS, BLOCK_SIZE>>>
+        (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
+         *dev_workspace, system->N);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    fprintf( stderr, "Cuda_Calculate_Bond_Orders Done... \n" );
+    //Step 2: bond energy.
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    //cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH );
+    // scratch layout: one 2*N region of reals (values, then reduction output)
+    cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH );
+    Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
+        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp,
+          (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), 
+          system->N, system->reaxprm.num_atom_types, spad );
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_BE
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad, spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        //(spad + system->N, spad + system->N + 16, 16);
+        (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    fprintf (stderr, "Cuda_Bond_Energy Done... \n");
+    //Step 3: lone pair, over- and under-coordination energies.
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    // scratch layout: three 2*N regions of reals at offsets 0, 2*N and 4*N
+    cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH );
+    test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
+            system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+            *dev_workspace, (simulation_data *)data->d_simulation_data,
+            *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
+            spad, spad + 2 * system->N, spad + 4*system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
+            system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+            *dev_workspace, (simulation_data *)data->d_simulation_data,
+            *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
+            spad, spad + 2 * system->N, spad + 4*system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    test_LonePair_Postprocess        <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, 
+            system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+            *dev_workspace, (simulation_data *)data->d_simulation_data,
+            *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_Lp
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad, spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_Ov
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad + 2*system->N, spad + 3*system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_Un
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad + 4*system->N, spad + 5*system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    fprintf (stderr, "test_LonePair_postprocess Done... \n");
+    //Step 4: three-body estimate, list sizing and three-body interactions.
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    // scratch is reinterpreted as one int per bond-list entry for the estimate
+    cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH);
+    k_Three_Body_Estimate<<<BLOCKS, BLOCK_SIZE>>>
+        (system->d_atoms, 
+         (control_params *)control->d_control, 
+         *(dev_lists + BONDS),
+         system->N, (int *)spad);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf (stderr, "Three_Body_Estimate... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs);
+    // The buffer is written and indexed right below; stop on allocation failure
+    // instead of dereferencing a null pointer.
+    // NOTE(review): an empty bond list (num_intrs == 0) is still not handled;
+    // thbody[0] and thbody[num_intrs-1] below would be out of range.
+    if (thbody == NULL) {
+        fprintf( stderr, "Not enough memory for the three-body estimate buffer. Terminating!\n" );
+        exit( INIT_ERR );
+    }
+    memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs);
+    copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH);
+    // total_3body: sum of the per-entry estimates, each scaled by SAFE_ZONE (list capacity).
+    // thbody[x]:   turned in place into the running (inclusive) sum of the raw estimates.
+    int total_3body = thbody [0] * SAFE_ZONE;
+    for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) {
+        total_3body += thbody [x]*SAFE_ZONE;
+        thbody [x] += thbody [x-1];
+    }
+    system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1];
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs);
+    if (!system->init_thblist) 
+    {
+        // first call: create the device three-body list
+        system->init_thblist = TRUE;
+        if(!Cuda_Make_List( (dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES )) {
+            fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
+            exit( INIT_ERR );
+        }
+#ifdef __CUDA_MEM__
+        // the size expression has type size_t, so it is printed with %zu
+        fprintf (stderr, "Device memory allocated: three body list = %zu MB\n", 
+                sizeof (three_body_interaction_data) * total_3body / (1024*1024));
+    } else {
+        // later calls: rebuild the list when bonds were reallocated or the
+        // required count exceeds the current capacity
+        if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) {
+            int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs);
+            /*Delete Three-body list*/
+            Cuda_Delete_List( dev_lists + THREE_BODIES );
+#ifdef __CUDA_MEM__
+            fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
+                    data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
+            /*Recreate Three-body list */
+            if(!Cuda_Make_List( size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES )) {
+                fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
+                exit( INIT_ERR );
+            }
+        }
+    }
+    //copy the indexes into the thb list;
+    // entry x+1 of index / end_index receives the running sum of entries 0..x
+    copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
+            cudaMemcpyHostToDevice, LIST_INDEX);
+    copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
+            cudaMemcpyHostToDevice, LIST_END_INDEX);
+    free (thbody );
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    // scratch layout: three 2*N regions of reals, then 2*N rvecs starting at real offset 6*N
+    cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
+    k_Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms,
+          system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, 
+          (control_params *)control->d_control,
+          (simulation_data *)data->d_simulation_data,
+          *dev_workspace, 
+          *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
+          system->N, system->reaxprm.num_atom_types, 
+          spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Not necessary to validate three-body list anymore, 
+    // Estimate is already done at the beginning which makes sure that 
+    // we have sufficient size for this list
+    //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step );
+    //Reduction for E_Ang
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad, spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_Pen
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad + 2*system->N, spad + 3*system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_Coa
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad + 4*system->N, spad + 5*system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    //Reduction for ext_press
+    rvec_spad = (rvec *) (spad + 6*system->N);
+    Cuda_reduction_rvec<<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+        (rvec_spad, rvec_spad + system->N,  system->N);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    Cuda_reduction_rvec<<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+        (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    real t_1, t_2;
+    t_1 = Get_Time( );
+    //Sum up the f vector for each atom and collect the CdDelta from all the bonds
+    k_Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (control_params *)control->d_control,
+            *dev_workspace, *(dev_lists + BONDS), system->N );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    t_2 = Get_Timing_Info( t_1 );
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf( stderr, "Three_Body_Interactions post process Timing %lf \n", t_2 );
+    fprintf( stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
+    fprintf( stderr, "Three_Body_Interactions Done... \n" );
+    //Step 5: four-body interactions.
+#ifdef __DEBUG_CUDA__
+    t_start = Get_Time( );
+    // scratch layout: two 2*N regions of reals, then 2*N rvecs starting at real offset 4*N
+    cuda_memset( spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
+    //k_Four_Body_Interactions<<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
+    k_Four_Body_Interactions<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_fbp,
+          (control_params *)control->d_control, *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
+          (simulation_box *)system->d_box, (simulation_data *)data->d_simulation_data,
+          *dev_workspace, system->N, system->reaxprm.num_atom_types, 
+          spad, spad + 2*system->N, (rvec *) (spad + 4*system->N) );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+    //Reduction for E_Tor
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad, spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for E_Con
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (spad + 2*system->N, spad + 3*system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+        (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Reduction for ext_press
+    rvec_spad = (rvec *) (spad + 4*system->N);
+    Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+        (rvec_spad, rvec_spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+        (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //Post process here
+    k_Four_Body_Postprocess<<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_atoms, *dev_workspace, *(dev_lists + BONDS),
+            system->N );
+    cudaDeviceSynchronize( );
+    cudaCheckError( );
+#ifdef __DEBUG_CUDA__
+    t_elapsed = Get_Timing_Info( t_start );
+    fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
+    fprintf (stderr, " Four_Body_ Done... \n");
+    //Step 6: hydrogen bonds (skipped entirely when hb_cut is not positive).
+    if (control->hb_cut > 0) {
+#ifdef __DEBUG_CUDA__
+        t_start = Get_Time( );
+        // scratch layout: one 2*N region of reals, then 2*N rvecs starting at real offset 2*N
+        cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH );
+        /*
+           k_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
+           (  system->d_atoms, 
+           system->reaxprm.d_sbp,
+           system->reaxprm.d_hbp,
+           (control_params *)control->d_control,
+           (simulation_data *)data->d_simulation_data,
+         *dev_workspace, 
+         *(dev_lists + BONDS), *(dev_lists + HBONDS),
+         system->N, system->reaxprm.num_atom_types, 
+         spad, (rvec *) (spad + 2*system->N), NULL);
+         cudaThreadSynchronize ();
+         cudaCheckError ();
+         */
+#ifdef __DEBUG_CUDA__
+        real test1,test2;
+        test1 = Get_Time ();
+        // hbs = ceil( N * HBONDS_THREADS_PER_ATOM / HBONDS_BLOCK_SIZE )
+        int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + 
+            (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1);
+        k_Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE )  >>>
+            (  system->d_atoms, 
+               system->reaxprm.d_sbp,
+               system->reaxprm.d_hbp,
+               (control_params *)control->d_control,
+               (simulation_data *)data->d_simulation_data,
+               *dev_workspace, 
+               *(dev_lists + BONDS), *(dev_lists + HBONDS),
+               system->N, system->reaxprm.num_atom_types, 
+               spad, (rvec *) (spad + 2*system->N), NULL);
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+#ifdef __DEBUG_CUDA__
+        test2 = Get_Timing_Info (test1);
+        fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2);
+        //Reduction for E_HB
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad, spad + system->N,  system->N);
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2);
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+        //Reduction for ext_press
+        rvec_spad = (rvec *) (spad + 2*system->N);
+        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+            (rvec_spad, rvec_spad + system->N,  system->N);
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+        //Post process here
+#ifdef __DEBUG_CUDA__
+        real t_1, t_2;
+        t_1 = Get_Time ();
+        k_Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
+            (     system->d_atoms, 
+                system->reaxprm.d_sbp, 
+                *dev_workspace, 
+                *(dev_lists + BONDS),
+                *(dev_lists + HBONDS), 
+                *(dev_lists + FAR_NBRS),
+                system->N, 
+                spad); //this is for the fix to use the shared memory
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+#ifdef __DEBUG_CUDA__
+        t_2 = Get_Timing_Info ( t_1 );
+        fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
+        t_1 = Get_Time ();
+        //k_Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+        // launched with one block of 32 threads per atom (system->N blocks)
+        k_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+            (     system->d_atoms, 
+                system->reaxprm.d_sbp, 
+                *dev_workspace, 
+                *(dev_lists + BONDS),
+                *(dev_lists + HBONDS), 
+                *(dev_lists + FAR_NBRS),
+                system->N );
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+        t_2 = Get_Timing_Info ( t_1 );
+#ifdef __DEBUG_CUDA__
+        fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
+        fprintf (stderr, "Hydrogen_Bond Done... \n");
+    }
+    return; 
+/* Run charge equilibration, then launch the van der Waals / Coulomb kernel
+ * and reduce its per-atom output into the device-side simulation_data
+ * (E_vdW, E_Ele and ext_press).
+ *
+ * system:      reads d_atoms, reaxprm.* and N
+ * control:     reads d_control and tabulate (0 selects the analytic kernel,
+ *              anything else the tabulated one)
+ * data:        reads d_simulation_data
+ * workspace:   forwarded to Cuda_QEq
+ * lists:       lists[FAR_NBRS] is forwarded to Cuda_QEq; the kernels use the
+ *              global dev_lists
+ * out_control: forwarded to Cuda_QEq; energy_update_freq is passed to the
+ *              tabulated kernel
+ *
+ * NOTE(review): lists is indexed here as lists[FAR_NBRS], while
+ * Cuda_Validate_Lists in this file uses *lists + BONDS; confirm that both
+ * conventions address the same storage. */
+void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
+        simulation_data *data,static_storage *workspace,
+        list** lists, output_controls *out_control )
+    real t_start, t_elapsed;
+    real t1 = 0, t2 = 0;
+    real *spad = (real *) scratch;
+    rvec *rvec_spad;
+    int cblks;
+    t_start = Get_Time( );
+    Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    d_timing.QEq += t_elapsed;
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed );
+    // scratch layout: two 2*N regions of reals (vdW, then Coulomb) followed by
+    // 2*N rvecs starting at real offset 4*N
+    cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH );
+    // cblks = ceil( N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE ).
+    // The remainder decides whether a partial block is needed; testing the
+    // quotient instead yields 0 blocks (an invalid launch) whenever the work
+    // is smaller than one block.
+    cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
+        (((system->N * VDW_THREADS_PER_ATOM) % VDW_BLOCK_SIZE) == 0 ? 0 : 1);
+    t_start = Get_Time ();
+    if ( control->tabulate == 0)
+    {
+        Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>>
+            ( system->d_atoms,   
+              system->reaxprm.d_tbp,
+              system->reaxprm.d_gp, 
+              (control_params *)control->d_control, 
+              (simulation_data *)data->d_simulation_data,  
+              *(dev_lists + FAR_NBRS), 
+              spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
+              system->reaxprm.num_atom_types,
+              system->N ) ;
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+    }
+    else
+    {
+        Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>>
+            (   (reax_atom *)system->d_atoms, 
+                (control_params *)control->d_control,
+                (simulation_data *)data->d_simulation_data, 
+                *(dev_lists + FAR_NBRS), 
+                spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
+                d_LR,
+                system->reaxprm.num_atom_types,
+                out_control->energy_update_freq,
+                system->N ) ;
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+    }
+    t_elapsed = Get_Timing_Info (t_start );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2));
+    fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed));
+    // Each reduction below is a two-launch sequence: BLOCKS_POW_2 blocks write
+    // into the second half of the region, then one block folds BLOCKS_POW_2
+    // entries into the destination field.
+    //Reduction on E_vdW
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+        (spad, spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    //reduction on E_Ele
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+        (spad + 2*system->N, spad + 3*system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    rvec_spad = (rvec *) (spad + 4*system->N);
+    //reduction on ext_press
+    Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> 
+        (rvec_spad, rvec_spad + system->N,  system->N);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+    Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> 
+        (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+    cudaDeviceSynchronize ();
+    cudaCheckError ();
+void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n,
+        int num_bonds, int num_hbonds )
+    int i, flag;
+    list *bonds, *hbonds, *thblist;
+    int *bonds_start, *bonds_end;
+    int *hbonds_start, *hbonds_end;
+    int *mat_start, *mat_end;
+    int max_sparse_entries = 0;
+    bonds = *lists + BONDS;
+    hbonds = *lists + HBONDS;
+    bonds_start = (int *) calloc (bonds->n, INT_SIZE);
+    bonds_end = (int *) calloc (bonds->n, INT_SIZE);
+    hbonds_start = (int *) calloc (hbonds->n, INT_SIZE );
+    hbonds_end = (int *) calloc (hbonds->n, INT_SIZE );
+    mat_start = (int *) calloc (workspace->H.n, INT_SIZE );
+    mat_end = (int *) calloc (workspace->H.n, INT_SIZE );
+    copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    /* Sparse Matrix entries */
+#ifdef __CUDA_TEST__
+    /*
+       workspace->realloc.Htop = 0;
+       for (i = 0; i < workspace->H.n-1; i++) {
+       if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){
+       workspace->realloc.Htop = mat_end[i] - mat_start[i];
+       }
+       }
+     */
+    flag = -1;
+    workspace->realloc.Htop = 0;
+    for ( i = 0; i < n-1; i ++){
+        if( (mat_end[i] - mat_start[i]) > 
+                (system->max_sparse_matrix_entries * DANGER_ZONE )) {
+            //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", 
+            //                                step, i, mat_start[i], mat_end[i]);
+            if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
+                workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
+        }
+        if ( (mat_end[i] > mat_start[i+1]) ){
+            fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n",
+                    step, flag, mat_end[i], mat_start[i+1]);
+            exit(INSUFFICIENT_SPACE);
+        }
+    }
+    if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) {
+        if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
+            workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
+        //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d)  -- %d \n", 
+        //                                step, i, mat_start[i], mat_end[i], 
+        //                                (int) (system->max_sparse_matrix_entries * DANGER_ZONE));
+        if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) {
+            fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n",
+                    step, flag, mat_end[i], system->N * system->max_sparse_matrix_entries);
+            exit(INSUFFICIENT_SPACE);
+        }
+    }
+    /* bond list */
+#ifdef __CUDA_TEST__
+    //workspace->realloc.bonds = 1;
+    flag = -1;
+    workspace->realloc.num_bonds = 0;
+    for( i = 0; i < n-1; ++i ) {
+        workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
+        if( bonds_end[i] >= bonds_start[i+1]-2 ) {
+            workspace->realloc.bonds = 1;
+            //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
+            //                        step, i, bonds_start [i], bonds_end[i]);
+            if( bonds_end[i] > bonds_start[i+1] )
+                flag = i;
+        }
+    }
+    if( flag > -1 ) {
+        fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                step, flag, bonds_end[flag], bonds_start[flag+1] );
+        exit(INSUFFICIENT_SPACE);
+    }    
+    workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
+    if( bonds_end[i] >= bonds->num_intrs-2 ) {
+        workspace->realloc.bonds = 1;
+        //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
+        //                        step, i, bonds_start [i], bonds_end[i]);
+        if( bonds_end[i] > bonds->num_intrs ) {
+            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
+                    step, flag, bonds_end[i], bonds->num_intrs );
+            exit(INSUFFICIENT_SPACE);
+        }
+    }
+    //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds);
+    /* hbonds list */
+    if( workspace->num_H > 0 ) {
+#ifdef __CUDA_TEST__
+        //workspace->realloc.hbonds = 1;
+        flag = -1;
+        workspace->realloc.num_hbonds = 0;
+        for( i = 0; i < workspace->num_H-1; ++i ) {
+            workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
+            if( (hbonds_end[i] - hbonds_start[i]) >= 
+                    (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) {
+                workspace->realloc.hbonds = 1;
+                //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
+                //                        step, i, hbonds_start [i], hbonds_end[i]);
+                if( hbonds_end[i] > hbonds_start[i+1] )
+                    flag = i;
+            }
+        }
+        if( flag > -1 ) {
+            fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n",
+                    step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] );
+            exit(INSUFFICIENT_SPACE);
+        }
+        workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
+        if( (hbonds_end[i] - hbonds_start[i]) >= 
+                (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) {
+            workspace->realloc.hbonds = 1;
+            //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
+            //                        step, i, hbonds_start [i], hbonds_end[i]);
+            if( hbonds_end[i] > hbonds->num_intrs ) {
+                fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
+                        step, flag, hbonds_end[i], hbonds->num_intrs );
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+    }
+    //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds);
+    free (bonds_start);
+    free (bonds_end );
+    free (hbonds_start );
+    free (hbonds_end  );
+    free (mat_start );
+    free (mat_end );
+/* Host-side validation of the device-resident three-body list.
+ *
+ * Copies the per-entry start (thblist->index) and end (thblist->end_index)
+ * arrays back from the device and then
+ *   - sets workspace->realloc.thbody when an entry has filled at least
+ *     DANGER_ZONE of the room available to it, and
+ *   - prints a diagnostic and exits with INSUFFICIENT_SPACE when such an
+ *     entry has also run past the end of the following entry or past
+ *     thblist->num_intrs.
+ * workspace->realloc.num_3body is reset to 0 as a side effect.
+ *
+ * system:    not used here; kept so existing callers are unaffected
+ * workspace: receives the reallocation flags
+ * thblist:   three-body list whose index arrays are read back through
+ *            copy_host_device( ..., cudaMemcpyDeviceToHost, ... )
+ * step:      step number, used only in the diagnostics
+ */
+void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step )
+{
+    int *thb_start, *thb_end;
+    int i, flag;
+    workspace->realloc.num_3body = 0;
+    /* an empty list has nothing to validate; return before entry 0 is touched */
+    if( thblist->n <= 0 )
+        return;
+    thb_start = (int *) calloc( thblist->n, INT_SIZE );
+    thb_end = (int *) calloc( thblist->n, INT_SIZE );
+    if( thb_start == NULL || thb_end == NULL ) {
+        /* INSUFFICIENT_SPACE is the only exit code visible in this routine */
+        fprintf( stderr, "step%d-thbchk failed: cannot allocate host buffers for %d entries\n",
+                step, thblist->n );
+        exit( INSUFFICIENT_SPACE );
+    }
+    copy_host_device( thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device( thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    /* three_body list: every entry except the last is bounded by its successor */
+    flag = -1;
+    for( i = 0; i < thblist->n-1; ++i ) {
+        if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) {
+            workspace->realloc.thbody = 1;
+            if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) {
+                flag = i;
+                break;
+            }
+        }
+    }
+    if( flag > -1 ) {
+        /* the preceding entry is reported for context, but only if there is one */
+        if( flag > 0 )
+            fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
+                    step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs );
+        fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
+                step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs );
+        exit( INSUFFICIENT_SPACE );
+    }
+    /* the loop ran to completion, so i == thblist->n - 1: the last entry is
+       bounded by the total capacity of the list instead of by a successor */
+    if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) {
+        workspace->realloc.thbody = 1;
+        if( thb_end[i] > thblist->num_intrs ) {
+            if( i > 0 )
+                fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
+                        step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs );
+            fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
+                    step, i, thb_start[i], thb_end[i], thblist->num_intrs );
+            exit( INSUFFICIENT_SPACE );
+        }
+    }
+    free( thb_start );
+    free( thb_end );
+}
+/* Counts, per atom, how many sparse-matrix entries its row will need.
+ *
+ * Thread mapping: i = blockIdx.x * blockDim.x + threadIdx.x selects the atom;
+ * threads with i >= N return without writing anything.
+ *
+ * indices[i] receives the number of far neighbors of atom i that pass the
+ * control->r_cut test, plus one (presumably the slot for the diagonal entry
+ * that k_Init_Forces appends -- confirm).
+ *
+ * Side effect on far_nbrs: when (data->step - data->prev_steps) is not a
+ * multiple of control->reneighbor, d and dvec of every visited neighbor entry
+ * are recomputed via Sq_Distance_on_T3; d is replaced by its square root only
+ * for pairs inside the cutoff and otherwise keeps the squared distance.
+ */
+GLOBAL void k_Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, 
+        simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices )
+{
+    int i, pj;
+    int start_i, end_i;
+    int Htop;
+    int use_stored_d;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= N ) return;
+    atom_i = &(atoms[i]);
+    start_i = Start_Index( i, &far_nbrs );
+    end_i = End_Index( i, &far_nbrs );
+    /* loop-invariant: are the distances stored in the neighbor list current? */
+    use_stored_d = ( (data->step - data->prev_steps) % control->reneighbor == 0 );
+    Htop = 0;
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        if( use_stored_d ) {
+            if( nbr_pj->d <= control->r_cut )
+                ++Htop;
+        }
+        else {
+            atom_j = &(atoms[nbr_pj->nbr]);
+            /* compare squared distances first; take the root only when needed */
+            nbr_pj->d = Sq_Distance_on_T3( atom_i->x, atom_j->x, box, nbr_pj->dvec );
+            if( nbr_pj->d <= SQR(control->r_cut) ) {
+                nbr_pj->d = sqrt( nbr_pj->d );
+                ++Htop;
+            }
+        }
+    }
+    /* one extra slot beyond the counted neighbors */
+    ++Htop;
+    indices[i] = Htop;
+}
+GLOBAL void k_Init_Forces( reax_atom *atoms,         global_parameters g_params, control_params *control, 
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        simulation_data *data, simulation_box *box,    static_storage workspace,
+        list far_nbrs,             list bonds,                list hbonds, 
+        int N,                         int max_sparse_entries, int num_atom_types ) /*
+ * k_Init_Forces: per-atom construction of one row of workspace.H plus the
+ * atom's hydrogen-bond and bond list entries, driven by the far-neighbor list.
+ *
+ * Thread mapping: i = blockIdx.x * blockDim.x + threadIdx.x selects the atom;
+ * threads with i >= N return immediately.
+ *
+ * For every far neighbor j of atom i that passes the control->r_cut test:
+ *   - an entry (j, value built from the Tap polynomial) is appended to row i
+ *     of workspace.H; row i starts at offset i * max_sparse_entries,
+ *   - if atom i has p_hbond 1 or 2 and the pair is within control->hb_cut, an
+ *     entry may be appended to the hbonds slice at workspace.hbond_index[i],
+ *   - if the pair is within control->nbr_cut and the uncorrected bond order
+ *     reaches control->bo_cut, a bond_data record is appended to atom i's
+ *     slice of bonds, and workspace.dDeltap_self[i] and
+ *     workspace.total_bond_order[i] are accumulated.
+ * After the loop a diagonal entry (i, sbp[type_i].eta) is appended,
+ * H->end[i] is recorded and the end indices of the bond/hbond slices are
+ * stored.
+ *
+ * Side effect on far_nbrs: when (data->step - data->prev_steps) is not a
+ * multiple of control->reneighbor, d and dvec of each visited neighbor entry
+ * with j != i are overwritten.
+ *
+ * NOTE(review): nothing here checks Htop, btop_i or ihb_top against the
+ * capacity of their slices -- presumably validated on the host afterwards;
+ * confirm.
+ */
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, btop_j, num_bonds, num_hbonds; // num_bonds/num_hbonds are only incremented below, never read
+    int ihb, jhb, ihb_top, jhb_top; // jhb_top is never used
+    int flag;
+    real r_ij, r2, self_coef;
+    real dr3gamij_1, dr3gamij_3, Tap;
+    //real val, dif, base;
+    real C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2;   // assigned below but not used afterwards
+    sparse_matrix *H;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    //LR_lookup_table *t;
+    reax_atom *atom_i, *atom_j;
+    bond_data *ibond, *jbond;
+    bond_order_data *bo_ij, *bo_ji;
+    i = blockIdx.x * blockDim.x + threadIdx.x; // one thread per atom
+    if (i >= N) return;
+    H = &( workspace.H );
+    //Htop = 0;
+    Htop = i * max_sparse_entries; // row i writes into its own fixed-stride region of H->entries
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = btop_j = 0;
+    p_boc1 = g_params.l[0];
+    p_boc2 = g_params.l[1];
+    //for( i = 0; i < system->N; ++i ) 
+    atom_i = &(atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Start_Index(i, &far_nbrs);
+    end_i   = End_Index(i, &far_nbrs);
+    H->start[i] = Htop; // row starts out empty: start == end
+    H->end[i] = Htop;
+    btop_i = End_Index( i, &bonds ); // new bonds are appended after the current end of atom i's slice
+    sbp_i = &(sbp[type_i]);
+    ihb = ihb_top = -1;
+    ihb = sbp_i->p_hbond;
+    if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) // otherwise ihb_top stays -1
+        ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(atoms[j]);
+        flag = 0;
+        if((data->step-data->prev_steps) % control->reneighbor == 0) { // stored d is used as-is on these steps
+            if( nbr_pj->d <= control->r_cut)
+                flag = 1;
+            else flag = 0;
+        }
+        else if (i > j) { // otherwise d and dvec are recomputed; argument order depends on i vs j
+            if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+        } else if (i < j) { // i == j matches neither branch, so flag stays 0 and the entry is skipped
+            if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+        }
+        if( flag ){    
+            type_j = atoms[j].type;
+            r_ij = nbr_pj->d;
+            sbp_j = &(sbp[type_j]);
+            twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]);
+            self_coef = (i == j) ? 0.5 : 1.0;
+            /* H matrix entry */
+            //CHANGE ORIGINAL
+            //if (i > j) {
+            Tap = control->Tap7 * r_ij + control->Tap6; // Horner evaluation of the degree-7 polynomial Tap7..Tap0 in r_ij
+            Tap = Tap * r_ij + control->Tap5;
+            Tap = Tap * r_ij + control->Tap4;
+            Tap = Tap * r_ij + control->Tap3;
+            Tap = Tap * r_ij + control->Tap2;
+            Tap = Tap * r_ij + control->Tap1;
+            Tap = Tap * r_ij + control->Tap0;          
+            dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+            dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); // approximately the cube root of (r_ij^3 + gamma)
+            H->entries[Htop].j = j;
+            H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+            ++Htop;
+            //}
+            //CHANGE ORIGINAL
+            /* hydrogen bond lists */ 
+            if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && 
+                    nbr_pj->d <= control->hb_cut ) {
+                // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                jhb = sbp_j->p_hbond;
+                if (ihb == 1 && jhb == 2) { // the two branches below differ only in scl (+1 for i > j, -1 otherwise)
+                    if (i > j) {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = 1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        //Auxiliary data structures
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    } else {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = -1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        //Auxiliary data structures
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    }
+                } else if (ihb == 2 && jhb == 1) { 
+                    hbonds.select.hbond_list[ihb_top].nbr = j; 
+                    hbonds.select.hbond_list[ihb_top].scl = 1; 
+                    hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                    //TODO
+                    rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                    hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                    ++ihb_top;
+                    ++num_hbonds;
+                } 
+            }
+            /* uncorrected bond orders */
+            if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                r2 = SQR(r_ij);
+                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else BO_s = C12 = 0.0;
+                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else BO_pi = C34 = 0.0;
+                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                    BO_pi2= EXP( C56 );
+                }
+                else BO_pi2 = C56 = 0.0;
+                /* Initially BO values are the uncorrected ones, page 1 */
+                BO = BO_s + BO_pi + BO_pi2;
+                if( BO >= control->bo_cut ) {
+                    //CHANGE ORIGINAL
+                    num_bonds += 1;
+                    //CHANGE ORIGINAL
+                    /****** bonds i-j and j-i ******/
+                    /* Bond Order page2-3, derivative of total bond order prime */
+                    Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                    Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                    Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+                    if (i > j) 
+                    {
+                        ibond = &( bonds.select.bond_list[btop_i] );
+                        ibond->nbr = j;
+                        ibond->d = r_ij;
+                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+                        //ibond->dbond_index = btop_i;
+                        //ibond->sym_index = btop_j;
+                        ++btop_i;
+                        bo_ij = &( ibond->bo_data );
+                        bo_ij->BO = BO;
+                        bo_ij->BO_s = BO_s;
+                        bo_ij->BO_pi = BO_pi;
+                        bo_ij->BO_pi2 = BO_pi2;
+                        //Auxiliary data structures
+                        ibond->scratch = 0;
+                        ibond->CdDelta_ij = 0;
+                        rvec_MakeZero (ibond->f);
+                        ibond->l = -1;
+                        ibond->CdDelta_jk = 0;
+                        ibond->Cdbo_kl = 0;
+                        rvec_MakeZero (ibond->i_f);
+                        rvec_MakeZero (ibond->k_f);
+                        rvec_MakeZero (ibond->h_f);
+                        rvec_MakeZero (ibond->t_f);
+                        // Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        //     dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
+                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi2,
+                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                        // Only dBOp wrt. dr_i is stored here, note that 
+                        //    dBOp/dr_i = -dBOp/dr_j and all others are 0 
+                        rvec_Scale( bo_ij->dBOp, 
+                                -(bo_ij->BO_s * Cln_BOp_s + 
+                                    bo_ij->BO_pi * Cln_BOp_pi + 
+                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
+                        bo_ij->BO_s -= control->bo_cut; // bo_cut is subtracted only after the derivatives above used the full values
+                        bo_ij->BO -= control->bo_cut;
+                        workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
+                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                    } else if ( i < j ) // NOTE(review): i == j stores no bond here, whereas k_Init_Forces_Tab uses a plain else -- confirm which is intended
+                    {
+                        rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+                        rvec dBOp;
+                        btop_j = btop_i;
+                        jbond = &(bonds.select.bond_list[btop_j]);
+                        jbond->nbr = j;
+                        jbond->d = r_ij;
+                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); // dvec and rel_box are stored negated in this branch
+                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                        btop_i ++;
+                        //jbond->dbond_index = btop_i;
+                        //jbond->sym_index = btop_i;
+                        bo_ji = &( jbond->bo_data );
+                        bo_ji->BO = BO;
+                        bo_ji->BO_s = BO_s;
+                        bo_ji->BO_pi = BO_pi;
+                        bo_ji->BO_pi2 = BO_pi2;
+                        //Auxiliary data structures
+                        jbond->scratch = 0;
+                        jbond->CdDelta_ij = 0;
+                        rvec_MakeZero (jbond->f);
+                        jbond->l = -1;
+                        jbond->CdDelta_jk = 0;
+                        jbond->Cdbo_kl = 0;
+                        rvec_MakeZero (jbond->i_f);
+                        rvec_MakeZero (jbond->k_f);
+                        rvec_MakeZero (jbond->h_f);
+                        rvec_MakeZero (jbond->t_f);
+                        // Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0
+                        rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_pi2,
+                                -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
+                        rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
+                        rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
+                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
+                        // Only dBOp wrt. dr_i is stored here, note that 
+                        //    dBOp/dr_i = -dBOp/dr_j and all others are 0 
+                        rvec_Scale( dBOp, 
+                                -(BO_s * Cln_BOp_s + 
+                                    BO_pi * Cln_BOp_pi + 
+                                    BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec );
+                        rvec_Scale( bo_ji->dBOp, -1., dBOp );
+                        rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp );
+                        bo_ji->BO_s -= control->bo_cut;
+                        bo_ji->BO -= control->bo_cut;
+                        workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
+                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                    }
+                } 
+            }
+        }
+    }
+    H->entries[Htop].j = i; // diagonal entry closes the row
+    H->entries[Htop].val = sbp[type_i].eta;
+    ++Htop;
+    H->end[i] = Htop;
+    Set_End_Index( i, btop_i, &bonds );
+    if( ihb == 1 || ihb == 2) // NOTE(review): when control->hb_cut <= 0, ihb_top is still -1 here and is stored as the end index -- confirm this is intended
+        Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
+    //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
+    //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+    //}
+    // mark the end of j list
+    //H->start[i] = Htop; 
+    /* validate lists - decide if reallocation is required! */
+    //Validate_Lists( workspace, lists, 
+    //      data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+GLOBAL void k_Init_Forces_Tab ( reax_atom *atoms, global_parameters g_params, control_params *control, 
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        simulation_data *data, simulation_box *box, static_storage workspace,
+        list far_nbrs, list bonds, list hbonds, 
+        int N, int max_sparse_entries, int num_atom_types, 
+        LR_lookup_table *d_LR) /*
+ * k_Init_Forces_Tab: same per-atom setup as k_Init_Forces, except that the
+ * value stored in workspace.H is interpolated from the lookup table d_LR
+ * instead of being computed from the Tap polynomial.
+ *
+ * Thread mapping: i = blockIdx.x * blockDim.x + threadIdx.x selects the atom;
+ * threads with i >= N return immediately.
+ *
+ * For every far neighbor j of atom i that passes the control->r_cut test:
+ *   - an entry (j, interpolated value) is appended to row i of workspace.H;
+ *     row i starts at offset i * max_sparse_entries,
+ *   - if atom i has p_hbond 1 or 2 and the pair is within control->hb_cut, an
+ *     entry may be appended to the hbonds slice at workspace.hbond_index[i],
+ *   - if the pair is within control->nbr_cut and the uncorrected bond order
+ *     reaches control->bo_cut, a bond_data record is appended to atom i's
+ *     slice of bonds, and workspace.dDeltap_self[i] and
+ *     workspace.total_bond_order[i] are accumulated.
+ * After the loop a diagonal entry (i, sbp[type_i].eta) is appended,
+ * H->end[i] is recorded and the end indices of the bond/hbond slices are
+ * stored.
+ *
+ * Side effect on far_nbrs: when (data->step - data->prev_steps) is not a
+ * multiple of control->reneighbor, d and dvec of each visited neighbor entry
+ * with j != i are overwritten.
+ *
+ * d_LR is indexed with index_lr( min type, max type, num_atom_types );
+ * presumably a device-resident table -- TODO confirm against the caller.
+ */
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, btop_j, num_bonds, num_hbonds; // num_bonds/num_hbonds are only incremented below, never read
+    int tmin, tmax, r;
+    int ihb, jhb, ihb_top, jhb_top; // jhb_top is never used
+    int flag;
+    real r_ij, r2, self_coef;
+    real val, dif, base;
+    real C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2;   // assigned below but not used afterwards
+    sparse_matrix *H;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    reax_atom *atom_i, *atom_j;
+    bond_data *ibond, *jbond;
+    bond_order_data *bo_ij, *bo_ji;
+    i = blockIdx.x * blockDim.x + threadIdx.x; // one thread per atom
+    if (i >= N) return;
+    H = &(workspace.H);
+    Htop = i * max_sparse_entries; // row i writes into its own fixed-stride region of H->entries
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = btop_j = 0;
+    p_boc1 = g_params.l[0];
+    p_boc2 = g_params.l[1];
+    //for( i = 0; i < system->N; ++i )
+    atom_i = &(atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Start_Index(i, &far_nbrs);
+    end_i   = End_Index(i, &far_nbrs);
+    H->start[i] = Htop; // row starts out empty: start == end
+    H->end[i] = Htop;
+    btop_i = End_Index( i, &bonds ); // new bonds are appended after the current end of atom i's slice
+    sbp_i = &(sbp[type_i]);
+    ihb = ihb_top = -1;
+    ihb = sbp_i->p_hbond;
+    if( control->hb_cut > 0 && (ihb==1 || ihb == 2)) // otherwise ihb_top stays -1
+        ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(atoms[j]);
+        flag = 0;
+        if((data->step-data->prev_steps) % control->reneighbor == 0) { // stored d is used as-is on these steps
+            if(nbr_pj->d <= control->r_cut)
+                flag = 1;
+            else flag = 0;
+        }
+        else if (i > j) { // otherwise d and dvec are recomputed; argument order depends on i vs j
+            if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+        }
+        else if ( i < j) { // i == j matches neither branch, so flag stays 0 and the entry is skipped
+            if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+        }
+        if( flag ){    
+            type_j = atoms[j].type;
+            r_ij = nbr_pj->d;
+            sbp_j = &(sbp[type_j]);
+            twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+            self_coef = (i == j) ? 0.5 : 1.0;
+            tmin  = MIN( type_i, type_j );
+            tmax  = MAX( type_i, type_j );
+            t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]);      
+            /* cubic spline interpolation */
+            //CHANGE ORIGINAL
+            //if (i > j) {
+            r = (int)(r_ij * t->inv_dx); // table bin for this distance
+            if( r == 0 )  ++r; // bin 0 is never used; it is bumped to bin 1
+            base = (real)(r+1) * t->dx;
+            dif = r_ij - base;
+            val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + // cubic in dif with coefficients a..d of bin r
+                t->ele[r].a;
+            val *= EV_to_KCALpMOL / C_ele;
+            H->entries[Htop].j = j;
+            H->entries[Htop].val = self_coef * val;
+            //H->j [Htop] = j;
+            //H->val [Htop] = self_coef * val;
+            ++Htop;
+            //}
+            //CHANGE ORIGINAL
+            /* hydrogen bond lists */ 
+            if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                    nbr_pj->d <= control->hb_cut ) {
+                // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                jhb = sbp_j->p_hbond;
+                if ( ihb == 1 && jhb == 2 ) { // the two branches below differ only in scl (+1 for i > j, -1 otherwise)
+                    if (i > j) {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = 1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        //Auxiliary data structures
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    } else {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = -1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        //Auxiliary data structures
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    }
+                } else if (ihb == 2 && jhb == 1) {
+                    hbonds.select.hbond_list[ihb_top].nbr = j;
+                    hbonds.select.hbond_list[ihb_top].scl = 1;
+                    hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                    //Auxiliary data structures
+                    rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                    hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                    ++ihb_top;
+                    ++num_hbonds;
+                }
+            }
+            /* uncorrected bond orders */
+            if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                r2 = SQR(r_ij);
+                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else BO_s = C12 = 0.0;
+                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else BO_pi = C34 = 0.0;
+                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                    BO_pi2= EXP( C56 );
+                }
+                else BO_pi2 = C56 = 0.0;
+                /* Initially BO values are the uncorrected ones, page 1 */
+                BO = BO_s + BO_pi + BO_pi2;
+                if( BO >= control->bo_cut ) {
+                    //CHANGE ORIGINAL
+                    num_bonds += 1;
+                    //CHANGE ORIGINAL
+                    /****** bonds i-j and j-i ******/
+                    if ( i > j )
+                    {
+                        ibond = &( bonds.select.bond_list[btop_i] );
+                        ibond->nbr = j;
+                        ibond->d = r_ij;
+                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+                        //ibond->dbond_index = btop_i;
+                        //ibond->sym_index = btop_j;
+                        ++btop_i;
+                        bo_ij = &( ibond->bo_data );
+                        bo_ij->BO = BO;
+                        bo_ij->BO_s = BO_s;
+                        bo_ij->BO_pi = BO_pi;
+                        bo_ij->BO_pi2 = BO_pi2;
+                        //Auxiliary data structures to resolve dependencies
+                        ibond->scratch = 0;
+                        ibond->CdDelta_ij = 0;
+                        rvec_MakeZero (ibond->f);
+                        ibond->l = -1;
+                        ibond->CdDelta_jk = 0;
+                        ibond->Cdbo_kl = 0;
+                        rvec_MakeZero (ibond->i_f);
+                        rvec_MakeZero (ibond->k_f);
+                        rvec_MakeZero (ibond->h_f);
+                        rvec_MakeZero (ibond->t_f);
+                        /* Bond Order page2-3, derivative of total bond order prime */
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi2,
+                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                        /* Only dBOp wrt. dr_i is stored here, note that 
+                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+                        rvec_Scale( bo_ij->dBOp, 
+                                -(bo_ij->BO_s * Cln_BOp_s + 
+                                    bo_ij->BO_pi * Cln_BOp_pi + 
+                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
+                        bo_ij->BO_s -= control->bo_cut; // bo_cut is subtracted only after the derivatives above used the full values
+                        bo_ij->BO -= control->bo_cut;
+                        workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
+                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                    } 
+                    else { // NOTE(review): also taken when i == j, whereas k_Init_Forces uses else if (i < j) -- confirm which is intended
+                        rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+                        rvec dBOp;
+                        btop_j = btop_i;
+                        jbond = &( bonds.select.bond_list[btop_j] );
+                        jbond->nbr = j; 
+                        jbond->d = r_ij;
+                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec ); // dvec and rel_box are stored negated in this branch
+                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                        //jbond->dbond_index = btop_i;
+                        //jbond->sym_index = btop_i;
+                        ++btop_i;
+                        bo_ji = &( jbond->bo_data );
+                        bo_ji->BO = BO;
+                        bo_ji->BO_s = BO_s;
+                        bo_ji->BO_pi = BO_pi;
+                        bo_ji->BO_pi2 = BO_pi2;
+                        // Auxiliary data structures to resolve dependencies
+                        jbond->scratch = 0;
+                        jbond->CdDelta_ij = 0;
+                        rvec_MakeZero (jbond->f);
+                        jbond->l = -1;
+                        jbond->CdDelta_jk = 0;
+                        jbond->Cdbo_kl = 0;
+                        rvec_MakeZero (jbond->i_f);
+                        rvec_MakeZero (jbond->k_f);
+                        rvec_MakeZero (jbond->h_f);
+                        rvec_MakeZero (jbond->t_f);
+                        // Bond Order page2-3, derivative of total bond order prime
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+                        // Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        //   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
+                        rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
+                        rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
+                        rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
+                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
+                        // Only dBOp wrt. dr_i is stored here, note that 
+                        //   dBOp/dr_i = -dBOp/dr_j and all others are 0
+                        //CHANGE ORIGINAL
+                        rvec_Scale( dBOp, 
+                                -(BO_s * Cln_BOp_s + 
+                                    BO_pi * Cln_BOp_pi + 
+                                    BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec);
+                        rvec_Scale( bo_ji->dBOp, -1., dBOp);
+                        //CHANGE ORIGINAL
+                        rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp );
+                        bo_ji->BO_s -= control->bo_cut;
+                        bo_ji->BO -= control->bo_cut;
+                        workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
+                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                    }
+                }
+            }
+        }
+    }
+    H->entries[Htop].j = i; // diagonal entry closes the row
+    H->entries[Htop].val = sbp[type_i].eta;
+    //H->j [Htop] = i;
+    //H->val [Htop] = sbp[type_i].eta;
+    ++Htop;
+    H->end[i] = Htop;
+    Set_End_Index( i, btop_i, &bonds );
+    if( ihb == 1  || ihb == 2) // NOTE(review): when control->hb_cut <= 0, ihb_top is still -1 here and is stored as the end index -- confirm this is intended
+        Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
+GLOBAL void k_fix_sym_dbond_indices (list pbonds, int N) // links each bond entry with the mirror entry found in its neighbour's range
+    int i, nbr;
+    bond_data *ibond, *jbond;
+    int atom_j;
+    list *bonds = &pbonds; // the list arrives by value; the index helpers take a pointer, so use the address of the local copy
+    i = blockIdx.x * blockDim.x + threadIdx.x; // one thread per list index i
+    if (i >= N) return; // surplus threads of the last block do nothing
+    for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) // walk every bond entry owned by i
+    {
+        ibond = &( bonds->select.bond_list [j] );    
+        nbr = ibond->nbr;
+        for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++) // scan nbr's entries for the one pointing back at i
+        {
+            jbond = &( bonds->select.bond_list[ k ] );
+            atom_j = jbond->nbr;
+            if ( (atom_j == i) )
+            {
+                if (i > nbr) { // only the thread with the larger index writes, so threads i and nbr never both update this pair
+                    ibond->dbond_index = j; 
+                    jbond->dbond_index = j; // both entries record slot j (the entry in i's range) as dbond_index
+                    ibond->sym_index = k; // where the mirror entry sits in nbr's range
+                    jbond->sym_index = j; // and the reverse link
+                }
+            }
+        }
+    }
+/* Links mutually referencing hydrogen-bond entries through sym_index.
+ *
+ * Launch layout: one block per index i (i = blockIdx.x); the threads of the
+ * block share i's hbond range, each taking every blockDim.x-th entry.
+ *
+ * p_workspace : workspace copy; hbond_index[] maps an index to its slot in
+ *               the hbonds list
+ * hbonds      : hydrogen-bond list whose sym_index fields are filled in
+ * N           : number of indices to process; blocks beyond N exit at once
+ *
+ * For every entry j of i, nbr's range is searched for the first entry that
+ * points back at i; the two entries then record each other's position.
+ */
+GLOBAL void k_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N)
+    static_storage *workspace = &p_workspace;
+    hbond_data *ihbond, *jhbond;
+    int nbr;
+    int i = blockIdx.x;
+    // Never index hbond_index[] past N when the grid is larger than N.
+    if (i >= N) return;
+    int start = Start_Index (workspace->hbond_index[i], &hbonds);
+    int end = End_Index (workspace->hbond_index[i], &hbonds);
+    // Stride by the real block width instead of a hard-coded 32, so no entry
+    // is skipped or visited twice when the launch uses another block size.
+    for (int j = start + threadIdx.x; j < end; j += blockDim.x)
+    {
+        ihbond = &( hbonds.select.hbond_list [j] );
+        nbr = ihbond->nbr;
+        int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
+        int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
+        for (int k = nbrstart; k < nbrend; k++)
+        {
+            jhbond = &( hbonds.select.hbond_list [k] );
+            if (jhbond->nbr == i){
+                ihbond->sym_index = k;
+                jhbond->sym_index = j;
+                break; // first back-reference wins
+            }
+        }
+    }
+GLOBAL void k_New_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N ) // sets sym_index on mutually referencing hbond entries, __THREADS_PER_ATOM__ threads per index i
+    static_storage *workspace = &p_workspace;
+    hbond_data *ihbond, *jhbond;
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id = thread_id / __THREADS_PER_ATOM__; // index of the thread group; becomes i below
+    int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); // position inside the group; matches thread_id % __THREADS_PER_ATOM__ only when that size is a power of two
+    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; // never read in this kernel; only mentioned in a commented-out line below
+    if (warp_id >= N) return; // groups past the last index have no work
+    int i = warp_id;
+    int nbr;
+    int k;
+    int start = Start_Index (workspace->hbond_index[i], &hbonds);
+    int end = End_Index (workspace->hbond_index[i], &hbonds);
+    int j = start + lane_id; // each lane starts at its own offset in i's range
+    //for (int j = start; j < end; j++)
+    while (j < end)
+    {
+        ihbond = &( hbonds.select.hbond_list [j] );
+        nbr = ihbond->nbr;
+        int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
+        int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
+        //k = nbrstart + lane_id;
+        //if (lane_id == 0) found [my_bucket] = 0;
+        //while (k < nbrend)
+        for (k = nbrstart; k < nbrend; k++) // sequential scan of nbr's range by this single lane
+        {
+            jhbond = &( hbonds.select.hbond_list [k] );
+            if (jhbond->nbr == i){ // entry pointing back at i: record each other's position
+                ihbond->sym_index = k;
+                jhbond->sym_index = j;
+                break; // stop at the first back-reference
+            }
+        }
+        j += __THREADS_PER_ATOM__; // next entry assigned to this lane
+    }
+GLOBAL void k_Estimate_Storage_Sizes(reax_atom *atoms, // one thread per index i: counts matrix, hbond and bond entries into results[]
+    int N, single_body_parameters *sbp,
+    two_body_parameters *tbp,
+    global_parameters gp, 
+    control_params *control, 
+    list far_nbrs,
+    int num_atom_types, int *results)
+    int *Htop = &results[0]; // results[0]: single counter shared by all threads
+    int *num_3body  = &results[1]; // results[1]: only the address is taken here; this kernel never writes it
+    int *hb_top = &results [ 2 ]; // results[2 .. 2+N): per-index hydrogen-bond counts
+    int *bond_top = &results [ 2 + N ]; // results[2+N ..): per-index bond counts
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    real r_ij, r2;
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2; 
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    p_boc1 = gp.l[0]; // assigned but not read again in this kernel
+    p_boc2 = gp.l[1]; // assigned but not read again in this kernel
+    //for( i = 0; i < N; ++i ) {
+    i = blockIdx.x * blockDim.x + threadIdx.x; // the thread index replaces the loop over i shown above
+    if (i >= N ) return ;
+    atom_i = &(atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Start_Index(i, &far_nbrs);
+    end_i   = End_Index(i, &far_nbrs);
+    sbp_i = &(sbp[type_i]);
+    ihb = sbp_i->p_hbond;
+    for( pj = start_i; pj < end_i; ++pj ) { // every far-neighbour entry of i
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &( atoms[j] );
+        type_j = atom_j->type;
+        sbp_j = &( sbp[type_j] );
+        twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
+        if( nbr_pj->d <= control->r_cut ) {
+            //++(*Htop);
+            atomicAdd(Htop, 1); // every thread bumps the same counter, hence the atomic
+            /* hydrogen bond lists */ 
+            //TODO - CHANGE ORIGINAL
+            if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                    nbr_pj->d <= control->hb_cut ) {
+                jhb = sbp_j->p_hbond;
+                if( ihb == 1 && jhb == 2 )
+                    //++hb_top[i];
+                    atomicAdd(&hb_top[i], 1);
+                else if( ihb == 2 && jhb == 1 )
+                    //++hb_top[j];
+                    //atomicAdd(&hb_top[j], 1);
+                    atomicAdd(&hb_top[i], 1); // counted on i here, where the commented-out lines counted on j
+            }
+            //TODO -- CHANGE ORIGINAL
+            //CHANGE ORIGINAL
+            if (i < j) continue; // the bond estimate below runs only for i >= j and credits both i and j
+            //CHANGE ORIGINAL
+            /* uncorrected bond orders */
+            if( nbr_pj->d <= control->nbr_cut ) {
+                r_ij = nbr_pj->d;
+                r2 = SQR(r_ij); // r2 is not read again in this kernel
+                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else BO_s = C12 = 0.0;
+                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else BO_pi = C34 = 0.0;
+                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                    BO_pi2= EXP( C56 );
+                }
+                else BO_pi2 = C56 = 0.0;
+                /* Initially BO values are the uncorrected ones, page 1 */
+                BO = BO_s + BO_pi + BO_pi2;
+                if( BO >= control->bo_cut ) {
+                    //++bond_top[i];
+                    //++bond_top[j];
+                    atomicAdd(&bond_top[i], 1);
+                    atomicAdd(&bond_top[j], 1); // slot j belongs to another thread's index, so the update must be atomic
+                }
+            }
+        }
+    }
+    //}
+/* Host driver for k_Estimate_Storage_Sizes.
+ *
+ * Zeroes a counter buffer in the device scratch area, launches the kernel,
+ * copies the counters into `output` and pads them on the host.
+ *
+ * system  : supplies N, the device atom array and the force-field tables
+ * control : its d_control member is the device copy handed to the kernel
+ * output  : host buffer of 2 * N + 2 ints, laid out as
+ *             output[0]         Htop      (plus N, times SAFE_ZONE)
+ *             output[1]         num_3body (largest squared bond count,
+ *                                          times SAFE_ZONE)
+ *             output[2 .. 2+N)  hb_top    (at least MIN_HBONDS each)
+ *             output[2+N .. )   bond_top  (doubled, at least MIN_BONDS each)
+ *
+ * NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime in
+ * favour of cudaDeviceSynchronize; kept here because the rest of this file
+ * uses it.
+ */
+void Cuda_Estimate_Storage_Sizes (reax_system *system, control_params *control, int *output)
+    int *Htop, *num_3body, input_size;
+    int *hb_top, *bond_top;
+    int *input = (int *) scratch;
+    int max_3body = 0;
+    // Two scalar counters followed by N hb_top and N bond_top slots.
+    input_size = INT_SIZE * (2 * system->N + 1 + 1);
+    cuda_memset (input, 0, input_size, RES_SCRATCH );
+    k_Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>>
+        (system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+         system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), 
+         system->reaxprm.num_atom_types, input);
+    cudaThreadSynchronize();
+    cudaCheckError();
+    copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ );
+    Htop = &output[0];
+    num_3body  = &output[1];
+    hb_top = &output[ 2 ];
+    bond_top = &output[ 2 + system->N ];
+    *Htop += system->N;
+    *Htop *= SAFE_ZONE;
+    for( int i = 0; i < system->N; ++i ) {
+        hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
+        // Track the largest squared raw bond count; it must be read before
+        // bond_top[i] is padded below.
+        if (max_3body <= SQR (bond_top[i]))
+            max_3body = SQR (bond_top[i]);
+        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
+    }
+    // The three-body estimate comes from the worst index only; the running
+    // sum the loop used to keep was overwritten here and has been dropped.
+    *num_3body = max_3body * SAFE_ZONE;
+void Cuda_Compute_Forces( reax_system *system, control_params *control, // host driver: init kernels, index fix-ups, list validation, bonded/non-bonded forces, total force
+        simulation_data *data, static_storage *workspace, 
+        list** lists, output_controls *out_control )
+    real t_start, t_elapsed;
+    real t_1, t_2;
+    int *indices; // not referenced again in the visible body
+    int *Htop; // not referenced again in the visible body
+    int max_sparse_entries = 0; // not referenced again in the visible body
+    list *far_nbrs = dev_lists + FAR_NBRS; // not referenced again; the launches below spell out *(dev_lists + FAR_NBRS)
+    int hblocks;
+    t_start = Get_Time ();
+    if ( !control->tabulate ) { // non-tabulated initialisation kernel
+        k_Init_Forces <<<BLOCKS, BLOCK_SIZE>>>
+            (system->d_atoms,         system->reaxprm.d_gp, (control_params *)control->d_control, 
+             system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+             (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace,
+             *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), 
+             system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); 
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+    else 
+    {
+        k_Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>> // same arguments as above plus the d_LR table
+            ( system->d_atoms,         system->reaxprm.d_gp, (control_params *)control->d_control, 
+              system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+              (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box,  *dev_workspace,
+              *(dev_lists + FAR_NBRS),     *(dev_lists + BONDS), *(dev_lists + HBONDS), 
+              system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, 
+              d_LR );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+    /*This is for bonds processing to fix dbond and sym_indexes */
+    t_1 = Get_Time ();
+    k_fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    t_2 = Get_Timing_Info ( t_1 ); // t_2 is stored but never read in the visible body
+    //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds.
+    if ((control->hb_cut > 0) && (dev_workspace->num_H > 0))
+    {
+        hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + // ceil(N * threads-per-index / block size)
+            ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1);
+        t_1 = Get_Time ();
+        /*
+           int bs = system->N;
+           int ss = 32;
+           fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
+         */
+        k_New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+    t_2 = Get_Timing_Info ( t_1 ); // runs even when the hbond branch is skipped; t_1 then still holds the pre-dbond timestamp
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.init_forces+= t_elapsed; // covers the init kernel and both index fix-ups
+    Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N,
+            system->num_bonds, system->num_hbonds );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Done with Cuda List Validation \n");
+    //Bonded Force Calculations here.
+    t_start = Get_Time ();
+    Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.bonded += t_elapsed;
+    //Compute the Non Bonded Forces here. 
+    t_start = Get_Time ();
+    Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.nonb += t_elapsed;
+    //Compute Total Forces here
+    Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>> // not included in any d_timing bucket
+        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
+         *(dev_lists + BONDS), control->ensemble, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>> // launched with the same argument list as the kernel above
+        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
+         *(dev_lists + BONDS), control->ensemble, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+/* Runs the device-vs-host validation routines and ORs their results.
+ *
+ * system, data, workspace, lists : forwarded unchanged to the validate_* helpers
+ * returns : the OR of all helper results; starts as FALSE
+ *
+ * NOTE(review): the mismatch message is printed when the combined result is
+ * zero, which is only right if the helpers return non-zero on success --
+ * confirm against their definitions.
+ */
+int validate_device (reax_system *system, simulation_data *data, static_storage *workspace, list **lists )
+    int retval = FALSE;
+#ifdef __BUILD_DEBUG__
+    retval |= validate_neighbors (system, lists);
+    retval |= validate_sym_dbond_indices (system, workspace, lists);
+    retval |= validate_bonds (system, workspace, lists);
+    retval |= validate_sparse_matrix (system, workspace);
+    retval |= validate_three_bodies (system, workspace, lists );
+    retval |= validate_hbonds (system, workspace, lists);
+    retval |= validate_workspace (system, workspace, lists);
+    retval |= validate_data (system, data);
+    retval |= validate_atoms (system, lists);
+    //analyze_hbonds (system, workspace, lists);
+    if (!retval) {
+        // Spelling of "match" corrected in the diagnostic below.
+        fprintf (stderr, "Results *DOES NOT* match between device and host \n");
+    }
+    return retval;
diff --git a/PuReMD-GPU/src/cuda_forces.h b/PuReMD-GPU/src/cuda_forces.h
new file mode 100644
index 0000000000000000000000000000000000000000..b017e63ebeee45c03d8926a79f1a9a0dd7a4771d
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_forces.h
@@ -0,0 +1,48 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_FORCES_H_
+#define __CUDA_FORCES_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void k_Estimate_Sparse_Matrix_Entries ( reax_atom *, control_params *, 
+        simulation_data *, simulation_box *, list, int, int * );
+void Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
+    static_storage*, list**, output_controls* );
+void Cuda_Estimate_Storage_Sizes (reax_system *, control_params *, int *);
+void Cuda_Threebody_List( reax_system *, static_storage *, list *, int );
+int validate_device (reax_system *, simulation_data *, static_storage *, list **);
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/cuda_four_body_interactions.cu
similarity index 53%
rename from PuReMD-GPU/src/four_body_interactions.cu
rename to PuReMD-GPU/src/cuda_four_body_interactions.cu
index d7bf757eff65253989cfe58d1ac4dfabd63d602a..60d9973482ddd61a34d874301f1ed360443625b9 100644
--- a/PuReMD-GPU/src/four_body_interactions.cu
+++ b/PuReMD-GPU/src/cuda_four_body_interactions.cu
@@ -18,20 +18,19 @@
-#include "four_body_interactions.h"
-#include "bond_orders.h"
+#include "cuda_four_body_interactions.h"
 #include "box.h"
+#include "index_utils.h"
 #include "list.h"
-#include "lookup.h"
 #include "vector.h"
-#include "math.h"
-#include "index_utils.h"
 #include "cuda_helpers.h"
 #define MIN_SINE 1e-10
-HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
+DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
         rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
         three_body_interaction_data *p_ijk, 
         three_body_interaction_data *p_jkl, 
@@ -72,7 +71,6 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_
     hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
     hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
     poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
     if( poem < 1e-20 ) poem = 1e-20;
@@ -81,9 +79,14 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_
                 r_jk * r_kl * cos_jkl );
     arg  = tel / poem;
-    if( arg >  1.0 ) arg =  1.0;
-    if( arg < -1.0 ) arg = -1.0;
+    if( arg >  1.0 )
+    {
+        arg =  1.0;
+    }
+    if( arg < -1.0 )
+    {
+        arg = -1.0;
+    }
     /*fprintf( out_control->etor, 
@@ -111,10 +114,22 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_
        -p_jkl->dcos_dk[2]/sin_jkl );*/
-    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE;
-    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
-    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE;
-    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
+    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE )
+    {
+        sin_ijk = MIN_SINE;
+    }
+    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    {
+        sin_ijk = -MIN_SINE;
+    }
+    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE )
+    {
+        sin_jkl = MIN_SINE;
+    }
+    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    {
+        sin_jkl = -MIN_SINE;
+    }
     // dcos_omega_di
     rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
@@ -145,532 +160,7 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_
-void Four_Body_Interactions( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
-    int i, j, k, l, pi, pj, pk, pl, pij, plk;
-    int type_i, type_j, type_k, type_l;
-    int start_j, end_j, start_k, end_k;
-    int start_pj, end_pj, start_pk, end_pk;
-    int num_frb_intrs = 0;
-    real Delta_j, Delta_k;
-    real r_ij, r_jk, r_kl, r_li;
-    real BOA_ij, BOA_jk, BOA_kl;
-    real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
-    real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
-    real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
-    real fn10, f11_DjDk, dfn11, fn12;
-    real theta_ijk, theta_jkl;
-    real sin_ijk, sin_jkl;
-    real cos_ijk, cos_jkl;
-    real tan_ijk_i, tan_jkl_i;
-    real omega, cos_omega, cos2omega, cos3omega;
-    rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
-    real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
-    real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
-    real Cconj, CEconj1, CEconj2, CEconj3;
-    real CEconj4, CEconj5, CEconj6;
-    real e_tor, e_con;
-    rvec dvec_li;
-    rvec force, ext_press;
-    ivec rel_box_jl;
-    // rtensor total_rtensor, temp_rtensor;
-    four_body_header *fbh;
-    four_body_parameters *fbp;
-    bond_data *pbond_ij, *pbond_jk, *pbond_kl;
-    bond_order_data *bo_ij, *bo_jk, *bo_kl;
-    three_body_interaction_data *p_ijk, *p_jkl;
-    real p_tor2 = system->reaxprm.gp.l[23];
-    real p_tor3 = system->reaxprm.gp.l[24];
-    real p_tor4 = system->reaxprm.gp.l[25];
-    real p_cot2 = system->reaxprm.gp.l[27];
-    list *bonds = (*lists) + BONDS;
-    list *thb_intrs = (*lists) + THREE_BODIES;
-    for( j = 0; j < system->N; ++j ) {
-        type_j = system->atoms[j].type;
-        Delta_j = workspace->Delta_boc[j];
-        start_j = Start_Index(j, bonds);
-        end_j = End_Index(j, bonds);
-        for( pk = start_j; pk < end_j; ++pk ) {
-            pbond_jk = &( bonds->select.bond_list[pk] );
-            k = pbond_jk->nbr;
-            bo_jk = &( pbond_jk->bo_data );
-            BOA_jk = bo_jk->BO - control->thb_cut;
-            /* see if there are any 3-body interactions involving j&k
-               where j is the central atom. Otherwise there is no point in
-               trying to form a 4-body interaction out of this neighborhood */    
-            if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
-                    Num_Entries(pk, thb_intrs) ) {
-                start_k = Start_Index(k, bonds);
-                end_k = End_Index(k, bonds);                   
-                pj = pbond_jk->sym_index; // pj points to j on k's list
-                /* do the same check as above: are there any 3-body interactions 
-                   involving k&j where k is the central atom */
-                if( Num_Entries(pj, thb_intrs) ) {
-                    type_k = system->atoms[k].type;
-                    Delta_k = workspace->Delta_boc[k];
-                    r_jk = pbond_jk->d;
-                    start_pk = Start_Index(pk, thb_intrs );
-                    end_pk = End_Index(pk, thb_intrs );
-                    start_pj = Start_Index(pj, thb_intrs );
-                    end_pj = End_Index(pj, thb_intrs );        
-                    exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
-                    exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
-                    exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
-                    exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
-                    exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
-                    f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
-                    /* pick i up from j-k interaction where j is the centre atom */
-                    for( pi = start_pk; pi < end_pk; ++pi ) {
-                        p_ijk = &( thb_intrs->select.three_body_list[pi] );
-                        pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
-                        pbond_ij = &( bonds->select.bond_list[pij] );
-                        bo_ij = &( pbond_ij->bo_data );
-                        if( bo_ij->BO > control->thb_cut/*0*/ ) {
-                            i = p_ijk->thb;
-                            type_i = system->atoms[i].type;
-                            r_ij = pbond_ij->d;
-                            BOA_ij = bo_ij->BO - control->thb_cut;
-                            theta_ijk = p_ijk->theta;
-                            sin_ijk = SIN( theta_ijk );
-                            cos_ijk = COS( theta_ijk );
-                            //tan_ijk_i = 1. / TAN( theta_ijk );
-                            if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
-                                tan_ijk_i = cos_ijk / MIN_SINE;
-                            else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
-                                tan_ijk_i = cos_ijk / -MIN_SINE;
-                            else tan_ijk_i = cos_ijk / sin_ijk;
-                            exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
-                            /* pick l up from j-k intr. where k is the centre */
-                            for( pl = start_pj; pl < end_pj; ++pl ) {
-                                p_jkl = &( thb_intrs->select.three_body_list[pl] );
-                                l = p_jkl->thb;
-                                plk = p_jkl->pthb; //pointer to l on k's bond_list!
-                                pbond_kl = &( bonds->select.bond_list[plk] );
-                                bo_kl = &( pbond_kl->bo_data );
-                                type_l = system->atoms[l].type;
-                                fbh = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm ) ]);
-                                fbp = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm )].prm[0]);
-                                if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
-                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
-                                    ++num_frb_intrs;
-                                    r_kl = pbond_kl->d;
-                                    BOA_kl = bo_kl->BO - control->thb_cut;
-                                    theta_jkl = p_jkl->theta;
-                                    sin_jkl = SIN( theta_jkl );
-                                    cos_jkl = COS( theta_jkl );
-                                    //tan_jkl_i = 1. / TAN( theta_jkl );
-                                    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
-                                        tan_jkl_i = cos_jkl / MIN_SINE;
-                                    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
-                                        tan_jkl_i = cos_jkl / -MIN_SINE;
-                                    else tan_jkl_i = cos_jkl /sin_jkl;
-                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, 
-                                            &(system->box), dvec_li );
-                                    r_li = rvec_Norm( dvec_li );
-                                    /* omega and its derivative */
-                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
-                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
-                                            r_jk, pbond_kl->dvec, r_kl,
-                                            dvec_li, r_li, p_ijk, p_jkl,
-                                            dcos_omega_di, dcos_omega_dj,
-                                            dcos_omega_dk, dcos_omega_dl,
-                                            out_control);
-                                    cos_omega = COS( omega );
-                                    cos2omega = COS( 2. * omega );
-                                    cos3omega = COS( 3. * omega );
-                                    /* end omega calculations */
-                                    /* torsion energy */
-                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
-                                    exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
-                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
-                                        (1.0 - exp_tor2_kl);
-                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
-                                            fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-                                            fbp->V3 * (1.0 + cos3omega) );
-                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
-                                    //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
-                                    //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
-                                    data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
-                                    dfn11 = (-p_tor3 * exp_tor3_DjDk +
-                                            (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-                                            (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
-                                    CEtors1 = sin_ijk * sin_jkl * CV;
-                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
-                                        (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
-                                        sin_ijk * sin_jkl; 
-                                    CEtors3 = CEtors2 * dfn11;
-                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
-                                        (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
-                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
-                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
-                                    CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
-                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
-                                    cmn = -fn10 * CV;
-                                    CEtors7 = cmn * sin_jkl * tan_ijk_i;
-                                    CEtors8 = cmn * sin_ijk * tan_jkl_i;
-                                    CEtors9 = fn10 * sin_ijk * sin_jkl * 
-                                        (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-                                         1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
-                                    //cmn = -fn10 * CV;
-                                    //CEtors7 = cmn * sin_jkl * cos_ijk;
-                                    //CEtors8 = cmn * sin_ijk * cos_jkl;
-                                    //CEtors9 = fn10 * sin_ijk * sin_jkl * 
-                                    //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-                                    //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
-                                    /* end  of torsion energy */
-                                    /* 4-body conjugation energy */
-                                    fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-                                    data->E_Con += e_con = fbp->p_cot1 * fn12 * 
-                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
-                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-                                    CEconj1 = Cconj * (BOA_ij - 1.5e0);
-                                    CEconj2 = Cconj * (BOA_jk - 1.5e0);
-                                    CEconj3 = Cconj * (BOA_kl - 1.5e0);
-                                    CEconj4 = -fbp->p_cot1 * fn12 * 
-                                        (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-                                    CEconj5 = -fbp->p_cot1 * fn12 * 
-                                        (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-                                    //CEconj4 = -fbp->p_cot1 * fn12 * 
-                                    //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
-                                    //CEconj5 = -fbp->p_cot1 * fn12 * 
-                                    //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
-                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
-                                        cos_omega * sin_ijk * sin_jkl;
-                                    /* end 4-body conjugation energy */
-                                    //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
-                                    //   workspace->orig_id[i], workspace->orig_id[j],
-                                    //       workspace->orig_id[k], workspace->orig_id[l], 
-                                    //    omega, cos_omega, cos2omega, cos3omega );
-                                    //fprintf(stdout, 
-                                    //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    CEtors2, CEtors3, CEtors4, CEtors5, 
-                                    //    CEtors6, CEtors7, CEtors8, CEtors9 );
-                                    //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //    theta_ijk, theta_jkl, sin_ijk, 
-                                    //    sin_jkl, cos_jkl, tan_jkl_i );
-                                    /* forces */
-                                    bo_jk->Cdbopi += CEtors2;
-                                    workspace->CdDelta[j] += CEtors3;
-                                    workspace->CdDelta[k] += CEtors3;
-                                    bo_ij->Cdbo += (CEtors4 + CEconj1);
-                                    bo_jk->Cdbo += (CEtors5 + CEconj2);
-                                    bo_kl->Cdbo += (CEtors6 + CEconj3);
-                                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                                        /* dcos_theta_ijk */
-                                        rvec_ScaledAdd( system->atoms[i].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_di );
-                                        /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_di );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                        rvec_ScaledAdd( system->atoms[l].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_dk );
-                                        /* dcos_omega */
-                                        rvec_ScaledAdd( system->atoms[i].f, 
-                                                CEtors9 + CEconj6, dcos_omega_di );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dj );
-                                        rvec_ScaledAdd( system->atoms[k].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dk );
-                                        rvec_ScaledAdd( system->atoms[l].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dl );
-                                    }
-                                    else {
-                                        ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
-                                        /* dcos_theta_ijk */
-                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                        rvec_Add( system->atoms[i].f, force );
-                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-                                        rvec_Add( system->atoms[k].f, force );
-                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        /* dcos_theta_jkl */
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors8 + CEconj5, p_jkl->dcos_di );
-                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                        rvec_Add( system->atoms[k].f, force );
-                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-                                        rvec_Add( system->atoms[l].f, force );
-                                        rvec_iMultiply( ext_press, rel_box_jl, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        /* dcos_omega */                      
-                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-                                        rvec_Add( system->atoms[i].f, force );
-                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        rvec_ScaledAdd( system->atoms[j].f, 
-                                                CEtors9 + CEconj6, dcos_omega_dj );
-                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-                                        rvec_Add( system->atoms[k].f, force );
-                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-                                        rvec_Add( system->atoms[l].f, force );
-                                        rvec_iMultiply( ext_press, rel_box_jl, force );
-                                        rvec_Add( data->ext_press, ext_press );
-                                        /* This part is intended for a fully-flexible box */
-                                        /* rvec_ScaledSum( temp_rvec, 
-                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
-                                           CEtors9 + CEconj6, dcos_omega_di );
-                                           rvec_OuterProduct( temp_rtensor, 
-                                           temp_rvec, system->atoms[i].x );
-                                           rtensor_Copy( total_rtensor, temp_rtensor );
-                                           rvec_ScaledSum( temp_rvec, 
-                                           CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
-                                           CEtors8 + CEconj5, p_jkl->dcos_di );
-                                           rvec_ScaledAdd( temp_rvec, 
-                                           CEtors9 + CEconj6, dcos_omega_dj );
-                                           rvec_OuterProduct( temp_rtensor, 
-                                           temp_rvec, system->atoms[j].x );
-                                           rtensor_Add( total_rtensor, temp_rtensor );
-                                           rvec_ScaledSum( temp_rvec, 
-                                           CEtors7 + CEconj4, p_ijk->dcos_di,      // k
-                                           CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                           rvec_ScaledAdd( temp_rvec, 
-                                           CEtors9 + CEconj6, dcos_omega_dk );
-                                           rvec_OuterProduct( temp_rtensor, 
-                                           temp_rvec, system->atoms[k].x );
-                                           rtensor_Add( total_rtensor, temp_rtensor );
-                                           rvec_ScaledSum( temp_rvec, 
-                                           CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
-                                           CEtors9 + CEconj6, dcos_omega_dl );
-                                           rvec_OuterProduct( temp_rtensor, 
-                                           temp_rvec, system->atoms[l].x );
-                                           rtensor_Copy( total_rtensor, temp_rtensor );
-                                           if( pbond_ij->imaginary || pbond_jk->imaginary || 
-                                           pbond_kl->imaginary )
-                                           rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
-                                           else
-                                           rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                                    }
-                                    /*fprintf( out_control->etor, 
-                                    //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                    //r_ij, r_jk, r_kl, 
-                                    "%12.8f%12.8f%12.8f%12.8f\n",
-                                    cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
-                                    // fprintf( out_control->etor, "%12.8f\n", dfn11 );
-                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
-                                            fn10, cos_omega, CV );
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                            CEtors2, CEtors3, CEtors4, CEtors5, 
-                                            CEtors6, CEtors7, CEtors8, CEtors9 );
-                                    /* fprintf( out_control->etor, 
-                                       "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-                                            CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
-                                    /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
-                                       fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
-                                    fprintf( out_control->etor, 
-                                            //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
-                                            "%6d%6d%6d%6d%12.8f%12.8f\n", 
-                                            workspace->orig_id[i], workspace->orig_id[j], 
-                                            workspace->orig_id[k], workspace->orig_id[l], 
-                                            e_tor, e_con );
-                                    //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
-                                    fprintf( out_control->econ, 
-                                            "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-                                            workspace->orig_id[i], workspace->orig_id[j], 
-                                            workspace->orig_id[k], workspace->orig_id[l], 
-                                            RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-                                            e_con,data->E_Con );
-                                    /* fprintf( out_control->etor, 
-                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",       
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
-                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
-                                    /* fprintf( out_control->etor, 
-                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
-                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
-                                    fprintf( out_control->etor, 
-                                            "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-                                            dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
-                                            dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
-                                            dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
-                                            dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
-                                    // Torsion Forces 
-                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
-                                            workspace->f_tor, workspace->f_tor);
-                                    Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
-                                    Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
-                                    Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
-                                    Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
-                                    Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
-                                    rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
-                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
-                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
-                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
-                                    rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
-                                    rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
-                                    rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
-                                    rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
-                                    // Conjugation Forces 
-                                    Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
-                                    Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
-                                    Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
-                                    rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
-                                    rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
-                                    rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
-                                    rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
-                                    rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
-                                    rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
-                                    rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
-                                    rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
-                                    rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
-                                } // pl check ends
-                            } // pl loop ends
-                        } // pi check ends
-                    } // pi loop ends
-                } // k-j neighbor check ends
-            } // j<k && j-k neighbor check ends
-        } // pk loop ends
-    } // j loop
-    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
-       data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
-    fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
-    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
-            data->E_Tor, data->E_Con );
-//Cuda Functions
-GLOBAL void Four_Body_Interactions ( reax_atom *atoms, 
+GLOBAL void k_Four_Body_Interactions ( reax_atom *atoms, 
         global_parameters g_params,
         four_body_header *d_fbp,
         control_params *control,
@@ -741,7 +231,6 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
     list *thb_intrs = &p_thb_intrs;
     static_storage *workspace = &p_workspace;
     //for( j = 0; j < system->N; ++j ) {
     type_j = atoms[j].type;
     Delta_j = workspace->Delta_boc[j];
@@ -836,8 +325,8 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                             pbond_kl = &( bonds->select.bond_list[plk] );
                             bo_kl = &( pbond_kl->bo_data );
                             type_l = atoms[l].type;
-                            fbh = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types) ]);
-                            fbp = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
+                            fbh = &(d_fbp[ index_fbp(type_i,type_j,type_k,type_l,num_atom_types) ]);
+                            fbp = &(d_fbp[ index_fbp(type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
                             if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
                                     bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
@@ -889,7 +378,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 //PERFORMANCE IMPACT
                                 e_tor = fn10 * sin_ijk * sin_jkl * CV;
-                                //atomicAdd (&data->E_Tor ,e_tor );
+                                //MYATOMICADD(&data->E_Tor ,e_tor );
                                 E_Tor [j] += e_tor;
                                 //sh_tor [threadIdx.x] += e_tor;
@@ -933,7 +422,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
                                 //PERFORMANCE IMPACT
                                 e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-                                //atomicAdd (&data->E_Con ,e_con );
+                                //MYATOMICADD(&data->E_Con ,e_con );
                                 E_Con [j] += e_con ;
                                 //sh_con [threadIdx.x] += e_con;
@@ -971,12 +460,12 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 /* forces */
                                 //PERFORMANCE IMPACT
-                                   atomicAdd ( &bo_jk->Cdbopi, CEtors2 );
-                                   atomicAdd ( &workspace->CdDelta[j], CEtors3 );
-                                   atomicAdd ( &workspace->CdDelta[k], CEtors3 );
-                                   atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) );
-                                   atomicAdd ( &bo_jk->Cdbo, (CEtors5 + CEconj2) );
-                                   atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) );
+                                   MYATOMICADD( &bo_jk->Cdbopi, CEtors2 );
+                                   MYATOMICADD( &workspace->CdDelta[j], CEtors3 );
+                                   MYATOMICADD( &workspace->CdDelta[k], CEtors3 );
+                                   MYATOMICADD( &bo_ij->Cdbo, (CEtors4 + CEconj1) );
+                                   MYATOMICADD( &bo_jk->Cdbo, (CEtors5 + CEconj2) );
+                                   MYATOMICADD( &bo_kl->Cdbo, (CEtors6 + CEconj3) );
                                 //PERFORMANCE IMPACT
@@ -987,39 +476,29 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                 bo_jk->Cdbo += CEtors5 + CEconj2;
                                 //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
-                                atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 );
+                                MYATOMICADD(&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 );
                                 //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
                                 if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) {
                                     /* dcos_theta_ijk */
                                     //PERFORMANCE IMPACT
-                                    atomic_rvecScaledAdd (pbond_ij->i_f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_dk );
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
-                                    atomic_rvecScaledAdd( pbond_jk->k_f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_di );
+                                    atomic_rvecScaledAdd( pbond_ij->i_f, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                    atomic_rvecScaledAdd( pbond_jk->k_f, CEtors7 + CEconj4, p_ijk->dcos_di );
                                     /* dcos_theta_jkl */
                                     //PERFORMANCE IMPACT
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_di );
-                                    atomic_rvecScaledAdd( pbond_jk->i_f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_dj );
-                                    atomic_rvecScaledAdd( pbond_kl->k_f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors8 + CEconj5, p_jkl->dcos_di );
+                                    atomic_rvecScaledAdd( pbond_jk->i_f, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    atomic_rvecScaledAdd( pbond_kl->k_f, CEtors8 + CEconj5, p_jkl->dcos_dk );
                                     /* dcos_omega */
                                     //PERFORMANCE IMPACT
-                                    atomic_rvecScaledAdd( pbond_ij->i_f, 
-                                            CEtors9 + CEconj6, dcos_omega_di );
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors9 + CEconj6, dcos_omega_dj );
-                                    atomic_rvecScaledAdd( pbond_jk->i_f, 
-                                            CEtors9 + CEconj6, dcos_omega_dk );
-                                    atomic_rvecScaledAdd( pbond_kl->k_f, 
-                                            CEtors9 + CEconj6, dcos_omega_dl );
+                                    atomic_rvecScaledAdd( pbond_ij->i_f, CEtors9 + CEconj6, dcos_omega_di );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors9 + CEconj6, dcos_omega_dj );
+                                    atomic_rvecScaledAdd( pbond_jk->i_f, CEtors9 + CEconj6, dcos_omega_dk );
+                                    atomic_rvecScaledAdd( pbond_kl->k_f, CEtors9 + CEconj6, dcos_omega_dl );
                                 else {
                                     ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
@@ -1033,8 +512,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                     //rvec_Add (sh_press [threadIdx.x], ext_press);
                                     //PERFORMANCE IMPACT
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors7 + CEconj4, p_ijk->dcos_dj );
                                     rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
                                     //PERFORMANCE IMPACT
@@ -1047,8 +525,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
                                     /* dcos_theta_jkl */
                                     //PERFORMANCE IMPACT
-                                    rvec_ScaledAdd( atoms[j].f, 
-                                            CEtors8 + CEconj5, p_jkl->dcos_di );
+                                    rvec_ScaledAdd( atoms[j].f, CEtors8 + CEconj5, p_jkl->dcos_di );
                                     rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
                                     //PERFORMANCE IMPACT
@@ -1327,7 +804,7 @@ GLOBAL void Four_Body_Interactions ( reax_atom *atoms,
-GLOBAL void Four_Body_Postprocess ( reax_atom *atoms, 
+GLOBAL void k_Four_Body_Postprocess( reax_atom *atoms, 
         static_storage p_workspace, 
         list p_bonds, int N )
diff --git a/PuReMD-GPU/src/cuda_four_body_interactions.h b/PuReMD-GPU/src/cuda_four_body_interactions.h
new file mode 100644
index 0000000000000000000000000000000000000000..088e24f4f15a1e83405b83427ba56d1e5d4e67ba
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_four_body_interactions.h
@@ -0,0 +1,42 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void k_Four_Body_Interactions( reax_atom *, global_parameters ,
+    four_body_header *, control_params *, list , list , simulation_box *,
+    simulation_data *, static_storage , int , int , real *, real *, rvec * );
+GLOBAL void k_Four_Body_Postprocess( reax_atom *, static_storage, list , int );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_grid.cu b/PuReMD-GPU/src/cuda_grid.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ca9556d6b43ffcac8a828d90f87e743d2647152c
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_grid.cu
@@ -0,0 +1,48 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_grid.h"
+#include "grid.h"
+#include "index_utils.h"
+#include "vector.h"
+#include "cuda_utils.h"
+#include "cuda_reset_utils.h"
+void Cuda_Bin_Atoms (reax_system *system, static_storage *workspace )
+    Cuda_Reset_Grid ( &system->d_g);
+    Bin_Atoms ( system, workspace );
+    dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms;
+void Cuda_Bin_Atoms_Sync (reax_system *system)
+    copy_host_device (system->g.top, system->d_g.top, 
+            INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP);
+    copy_host_device (system->g.atoms, system->d_g.atoms, 
+            INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS);
diff --git a/PuReMD-GPU/src/cuda_grid.h b/PuReMD-GPU/src/cuda_grid.h
new file mode 100644
index 0000000000000000000000000000000000000000..28a20797a56bda9682354f967634eeb6301e44ae
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_grid.h
@@ -0,0 +1,39 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_GRID_H_
+#define __CUDA_GRID_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+void Cuda_Bin_Atoms( reax_system*, static_storage* );
+void Cuda_Bin_Atoms_Sync (reax_system *);
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_helpers.h b/PuReMD-GPU/src/cuda_helpers.h
index e021cf84a99b22713fa2fdc4a00af81db0ef5672..e306c673f8aec585bbda7b92250e2f0ec91eb2e3 100644
--- a/PuReMD-GPU/src/cuda_helpers.h
+++ b/PuReMD-GPU/src/cuda_helpers.h
@@ -21,9 +21,15 @@
 #ifndef __CUDA_HELPERS__
 #define __CUDA_HELPERS__
 #include "mytypes.h"
-DEVICE inline int cuda_strcmp (char *a, char *b, int len)
+#ifdef __cplusplus
+extern "C"  {
+static inline DEVICE int cuda_strcmp(char *a, char *b, int len)
     char *src, *dst;
@@ -32,20 +38,25 @@ DEVICE inline int cuda_strcmp (char *a, char *b, int len)
     for (int i = 0; i < len; i++)
         if (*dst == '\0')
+        {
             return 0;
+        }
-        if (*src != *dst)  return 1;
+        if (*src != *dst)
+        {
+            return 1;
+        }
-        src ++;
-        dst ++;
+        src++;
+        dst++;
     return 0;
-DEVICE inline real atomicAdd(real* address, real val)
+static inline DEVICE double myAtomicAdd(double* address, double val)
     unsigned long long int* address_as_ull =
         (unsigned long long int*)address;
@@ -54,24 +65,31 @@ DEVICE inline real atomicAdd(real* address, real val)
         assumed = old;
         old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val + __longlong_as_double(assumed)));
+                __double_as_longlong(val + __longlong_as_double(assumed)));
     while (assumed != old);
     return __longlong_as_double(old);
-DEVICE inline void atomic_rvecAdd( rvec ret, rvec v )
+static inline DEVICE void atomic_rvecAdd( rvec ret, rvec v )
-    atomicAdd ( &ret[0], v[0] );
-    atomicAdd ( &ret[1], v[1] );
-    atomicAdd ( &ret[2], v[2] );
+    MYATOMICADD( (double*)&ret[0], (double)v[0] );
+    MYATOMICADD( (double*)&ret[1], (double)v[1] );
+    MYATOMICADD( (double*)&ret[2], (double)v[2] );
-DEVICE inline void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
+static inline DEVICE void atomic_rvecScaledAdd( rvec ret, real c, rvec v )
-    atomicAdd ( &ret[0], c * v[0] );
-    atomicAdd ( &ret[1], c * v[1] );
-    atomicAdd ( &ret[2], c * v[2] );
+    MYATOMICADD( (double*)&ret[0], (double)(c * v[0]) );
+    MYATOMICADD( (double*)&ret[1], (double)(c * v[1]) );
+    MYATOMICADD( (double*)&ret[2], (double)(c * v[2]) );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_init.cu b/PuReMD-GPU/src/cuda_init.cu
index 09515038daa71b15ef2c5796f54f3c79c29063df..4ca4bac18d08052311d0bdcd83290b5617b66b55 100644
--- a/PuReMD-GPU/src/cuda_init.cu
+++ b/PuReMD-GPU/src/cuda_init.cu
@@ -18,59 +18,65 @@
 #include "cuda_init.h"
 #include "cuda_utils.h"
 #include "cuda_copy.h"
+#include "cuda_reset_utils.h"
 #include "vector.h"
-#include "reset_utils.h"
-void Cuda_Init_System ( reax_system *system)
+void Cuda_Init_System( reax_system *system)
-    cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS );    
+    cuda_malloc( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS );    
-    cuda_malloc ( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX );
+    cuda_malloc( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX );
     //interaction parameters
-    cuda_malloc ((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE,
+    cuda_malloc((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE,
             1, RES_REAX_INT_SBP );
-    cuda_malloc ((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, 
+    cuda_malloc((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, 
             1, RES_REAX_INT_TBP );
-    cuda_malloc ((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE,
+    cuda_malloc((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE,
             1, RES_REAX_INT_THBP );
-    cuda_malloc ((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE,
+    cuda_malloc((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE,
             1, RES_REAX_INT_HBP );
-    cuda_malloc ((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE,
+    cuda_malloc((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE,
             1, RES_REAX_INT_FBP );
-    cuda_malloc ((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS );
+    cuda_malloc((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS );
     system->reaxprm.d_gp.n_global = 0;
     system->reaxprm.d_gp.vdw_type = 0;
-void Cuda_Init_Control (control_params *control)
+void Cuda_Init_Control(control_params *control)
-    cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS );
-    copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
+    cuda_malloc((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS );
+    copy_host_device(control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
 void Cuda_Init_Simulation_Data (simulation_data *data)
-    cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA );
+    cuda_malloc((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA );
-GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N)
+GLOBAL void Initialize_Grid(ivec *nbrs, rvec *nbrs_cp, int N)
     int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index >= N) return;
+    if (index >= N)
+    {
+        return;
+    }
     nbrs[index][0] = -1;
     nbrs[index][1] = -1;
@@ -80,6 +86,7 @@ GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N)
     nbrs_cp[index][2] = -1;
 void Cuda_Init_Grid (grid *host, grid *dev)
     int total = host->ncell[0] * host->ncell[1] * host->ncell[2];
@@ -89,30 +96,31 @@ void Cuda_Init_Grid (grid *host, grid *dev)
     dev->max_cuda_nbrs = host->max_cuda_nbrs;
     dev->cell_size = host->cell_size;
-    ivec_Copy (dev->spread, host->spread);
-    ivec_Copy (dev->ncell, host->ncell);
-    rvec_Copy (dev->len, host->len);
-    rvec_Copy (dev->inv_len, host->inv_len);
+    ivec_Copy( dev->spread, host->spread );
+    ivec_Copy( dev->ncell, host->ncell );
+    rvec_Copy( dev->len, host->len );
+    rvec_Copy( dev->inv_len, host->inv_len );
-    cuda_malloc ((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP );
-    cuda_malloc ((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK );
-    cuda_malloc ((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START );
-    cuda_malloc ((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END );
+    cuda_malloc((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP );
+    cuda_malloc((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK );
+    cuda_malloc((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START );
+    cuda_malloc((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END );
-    cuda_malloc ((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS );
-    cuda_malloc ((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS );
-    cuda_malloc ((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP );
+    cuda_malloc((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS );
+    cuda_malloc((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS );
+    cuda_malloc((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP );
     int block_size = 512;
     int blocks = (total*dev->max_nbrs) / block_size + ((total*dev->max_nbrs) % block_size == 0 ? 0 : 1);
-    Initialize_Grid <<<blocks, block_size>>>
-        (dev->nbrs, dev->nbrs_cp, total * host->max_nbrs );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    Initialize_Grid<<<blocks, block_size>>>
+        ( dev->nbrs, dev->nbrs_cp, total * host->max_nbrs );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
-GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms, 
+GLOBAL void Init_Workspace_Arrays(single_body_parameters *sbp, reax_atom *atoms, 
         static_storage workspace, int N)
@@ -127,6 +135,7 @@ GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms
     workspace.b[i+N] = -1.0;
 GLOBAL void Init_Map_Serials (int *input, int N)
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -135,6 +144,7 @@ GLOBAL void Init_Map_Serials (int *input, int N)
     input[i] = -1;
 void Cuda_Init_Workspace_System (reax_system *system, static_storage *workspace )
     int blocks, block_size = BLOCK_SIZE;
@@ -262,6 +272,7 @@ void Cuda_Init_Workspace( reax_system *system, control_params *control,
     Cuda_Reset_Workspace( system, workspace );
 void Cuda_Init_Workspace_Device ( static_storage *workspace )
     workspace->realloc.estimate_nbrs = -1;
@@ -273,6 +284,7 @@ void Cuda_Init_Workspace_Device ( static_storage *workspace )
     workspace->realloc.gcell_atoms = -1;
 void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N)
     cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX );
@@ -284,7 +296,8 @@ void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N)
-void Cuda_Init_Scratch ()
+void Cuda_Init_Scratch()
     cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH );
diff --git a/PuReMD-GPU/src/cuda_init.h b/PuReMD-GPU/src/cuda_init.h
index 233761691fe56bc8b1290c8972c4b413cc876f54..cd9c568130730d298fbaf781f4f6b18b0f02f65a 100644
--- a/PuReMD-GPU/src/cuda_init.h
+++ b/PuReMD-GPU/src/cuda_init.h
@@ -18,22 +18,31 @@
 #ifndef __CUDA_INIT_H__
 #define __CUDA_INIT_H__
 #include "mytypes.h"
-void    Cuda_Init_System ( reax_system* );
-void    Cuda_Init_Simulation_Data (simulation_data *);
-void    Cuda_Init_Workspace_System ( reax_system *, static_storage *);
-void    Cuda_Init_Workspace ( reax_system *, control_params *, static_storage *);
-void    Cuda_Init_Workspace_Device ( static_storage *);
-void  Cuda_Init_Control (control_params *);
-void  Cuda_Init_Grid (grid *, grid *);
-void    Cuda_Init_Sparse_Matrix (sparse_matrix *, int, int);
+#ifdef __cplusplus
+extern "C"  {
+void Cuda_Init_System( reax_system* );
+void Cuda_Init_Simulation_Data( simulation_data * );
+void Cuda_Init_Workspace_System( reax_system *, static_storage * );
+void Cuda_Init_Workspace( reax_system *, control_params *, static_storage * );
+void Cuda_Init_Workspace_Device( static_storage * );
+void Cuda_Init_Control( control_params * );
+void Cuda_Init_Grid( grid *, grid * );
+void Cuda_Init_Sparse_Matrix( sparse_matrix *, int, int );
+void Cuda_Init_Scratch( );
+#ifdef __cplusplus
-void Cuda_Init_Scratch ();
diff --git a/PuReMD-GPU/src/cuda_init_md.cu b/PuReMD-GPU/src/cuda_init_md.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1a205506e4c5ff767e02398a3859f838818c1e1a
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_init_md.cu
@@ -0,0 +1,586 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_init_md.h"
+#include "allocate.h"
+#include "box.h"
+#include "forces.h"
+#include "grid.h"
+#include "index_utils.h"
+#include "init_md.h"
+#include "integrate.h"
+#include "lookup.h"
+#include "print_utils.h"
+#include "reset_utils.h"
+#include "system_props.h"
+#include "traj.h"
+#include "vector.h"
+#include "cuda_allocate.h"
+#include "cuda_utils.h"
+#include "cuda_init.h"
+#include "cuda_copy.h"
+#include "cuda_box.h"
+#include "cuda_forces.h"
+#include "cuda_grid.h"
+#include "cuda_integrate.h"
+#include "cuda_lin_alg.h"
+#include "cuda_list.h"
+#include "cuda_lookup.h"
+#include "cuda_neighbors.h"
+#include "cuda_reduction.h"
+#include "cuda_reset_utils.h"
+#include "cuda_system_props.h"
+#include "validation.h"
+/* Device-path system setup.
+ * Resets atoms unless restarting, computes total mass and center of mass,
+ * shifts every atom by the offset chosen via control->reposition_atoms,
+ * copies the atoms back to the host, optionally generates initial
+ * velocities, and finally calls Setup_Grid.
+ * Terminates with UNKNOWN_OPTION on an unrecognized reposition_atoms value. */
+void Cuda_Init_System( reax_system *system, control_params *control, 
+        simulation_data *data )
+    int i; // NOTE(review): not referenced in the visible body
+    rvec dx; // displacement applied to every atom by the kernel below
+    if( !control->restart )
+    {
+        Cuda_Reset_Atoms( system );
+    }
+    Cuda_Compute_Total_Mass( system, data );
+    Cuda_Compute_Center_of_Mass( system, data, stderr );
+    /* reposition atoms */
+    // just fit the atoms to the periodic box
+    if( control->reposition_atoms == 0 )
+    {
+        rvec_MakeZero( dx );
+    }
+    // put the center of mass to the center of the box
+    else if( control->reposition_atoms == 1 )
+    {
+        rvec_Scale( dx, 0.5, system->box.box_norms );
+        rvec_ScaledAdd( dx, -1., data->xcm );
+    }
+    // put the center of mass to the origin
+    else if( control->reposition_atoms == 2 )
+    {
+        rvec_Scale( dx, -1., data->xcm );
+    }
+    else
+    {
+        fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
+        exit( UNKNOWN_OPTION );
+    }
+    // dx is passed by value, component by component, since it lives in host memory
+    k_compute_Inc_on_T3<<<BLOCKS_POW_2, BLOCK_SIZE>>>
+        (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    //copy back the atoms from device to the host
+    copy_host_device( system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
+            cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+    /* Initialize velocities so that desired init T can be attained */
+    if( !control->restart || (control->restart && control->random_vel) )  {
+        Generate_Initial_Velocities( system, control->T_init );
+    }
+    Setup_Grid( system );
+/* Initializes the simulation data for a device run.
+ * Resets the host data, picks the degrees of freedom (N_f) and the
+ * integrator stored in *Evolve according to control->ensemble, calls
+ * Cuda_Compute_Kinetic_Energy, derives the temperature from E_Kin, and
+ * zeroes the host (data->timing) and device (d_timing) timers.
+ * The NPT, sNPT and iNPT ensembles print a message and exit. */
+void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, 
+        simulation_data *data, output_controls *out_control, 
+        evolve_function *Evolve )
+    Reset_Simulation_Data( data );
+    if( !control->restart )  
+        data->step = data->prev_steps = 0;
+    switch( control->ensemble ) {
+        case NVE:
+            data->N_f = 3 * system->N;
+            *Evolve = Cuda_Velocity_Verlet_NVE;
+            break;
+        case NVT:
+            data->N_f = 3 * system->N + 1;
+            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
+            // thermostat state is only (re)initialized when velocities are fresh
+            if( !control->restart || (control->restart && control->random_vel) ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->therm.v_xi_old = 0;
+                data->therm.xi = 0;
+#if defined(DEBUG_FOCUS)
+                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
+                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
+                        data->N_f, data->therm.v_xi );
+            }
+            *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
+            break;
+        case NPT: // Anisotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            // exit() above makes the rest of this case unreachable
+            data->N_f = 3 * system->N + 9;
+            if( !control->restart ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->iso_bar.eps = 0.33333 * log(system->box.volume);
+                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
+                //Compute_Pressure( system, data, workspace );
+            }
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+        case sNPT: // Semi-Isotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            // exit() above makes the rest of this case unreachable
+            data->N_f = 3 * system->N + 4;
+            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
+            break;
+        case iNPT: // Isotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            // exit() above makes the rest of this case unreachable
+            data->N_f = 3 * system->N + 2;
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+        case bNVT: //berendensen NVT
+            data->N_f = 3 * system->N + 1; 
+            *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
+            break;
+        default:
+            // NOTE(review): unknown ensembles leave N_f and *Evolve untouched
+            break;
+    }
+    Cuda_Compute_Kinetic_Energy( system, data );
+#ifdef __BUILD_DEBUG__
+    // keep the host-side value so it can be compared with the device result
+    real t_E_Kin = 0;
+    t_E_Kin = data->E_Kin;
+    // fetch E_Kin from the device copy of the simulation data
+    copy_host_device( &data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
+            REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+    if( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! 
+        data->therm.T = ALMOST_ZERO;
+#ifdef __BUILD_DEBUG__
+    if (check_zero( t_E_Kin, data->E_Kin)){
+        fprintf( stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin );
+        exit( 1 );
+    }
+    //validate_data ( system, data );
+    /* init timing info for the host*/
+    data->timing.start = Get_Time( );
+    data->timing.total = data->timing.start;
+    data->timing.nbrs = 0;
+    data->timing.init_forces = 0;
+    data->timing.bonded = 0;
+    data->timing.nonb = 0;
+    data->timing.QEq = 0;
+    data->timing.matvecs = 0;
+    /* init timing info for the device */
+    d_timing.start = Get_Time( );
+    // NOTE(review): total is seeded from the host timer's start, not d_timing.start
+    d_timing.total = data->timing.start;
+    d_timing.nbrs = 0;
+    d_timing.init_forces = 0;
+    d_timing.bonded = 0;
+    d_timing.nonb = 0;
+    d_timing.QEq = 0;
+    d_timing.matvecs = 0;
+/* Estimates the per-row capacity needed for the device sparse matrix.
+ * Launches k_Estimate_Sparse_Matrix_Entries, which fills one int per atom
+ * in the scratch buffer, copies those values to the host and returns the
+ * largest one multiplied by SAFE_ZONE.
+ * data, workspace, lists and out_control are only partly used in the
+ * visible body (data supplies the device simulation-data pointer). */
+int Estimate_Device_Matrix( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
+    int *indices, *Htop;
+    list *far_nbrs = dev_lists + FAR_NBRS;
+    int max_sparse_entries = 0;
+    real t1, t2;
+    // the global scratch buffer is reused as an int array of system->N entries
+    indices = (int *) scratch;
+    cuda_memset( indices, 0, INT_SIZE * system->N, RES_SCRATCH );
+    t1 = Get_Time( );
+    k_Estimate_Sparse_Matrix_Entries<<<BLOCKS, BLOCK_SIZE>>>
+        ( system->d_atoms, (control_params *)control->d_control, 
+          (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, 
+          *far_nbrs, system->N, indices );
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    t2 = Get_Timing_Info( t1 );
+    //fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 );
+    // NOTE(review): no free( Htop ) is visible in this excerpt - confirm the
+    // host buffer is released before returning
+    Htop = (int *) malloc( INT_SIZE * (system->N + 1) );
+    memset( Htop, 0, INT_SIZE * (system->N + 1) );
+    copy_host_device( Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+    // take the maximum over all atoms
+    for (int i = 0; i < system->N; i++) 
+    {
+        if (max_sparse_entries < Htop[i]) {
+            max_sparse_entries = Htop[i];
+        }    
+    }
+#ifdef __DEBUG_CUDA__
+    fprintf( stderr,
+        " Max sparse entries for this run are ---> %d \n", max_sparse_entries );
+    return max_sparse_entries * SAFE_ZONE;
+    //return max_sparse_entries;
+/* Sizes and allocates the device sparse matrix dev_workspace->H.
+ * The per-row capacity comes from Estimate_Device_Matrix and is stored in
+ * system->max_sparse_matrix_entries; H gets n = N rows and room for
+ * N * max_sparse_matrix_entries entries. */
+void Allocate_Device_Matrix (reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
+    //Allocate space for the sparse Matrix entries here. 
+    system->max_sparse_matrix_entries = 
+        Estimate_Device_Matrix( system, control, data, workspace, lists, out_control );
+    dev_workspace->H.n = system->N ;
+    dev_workspace->H.m = system->N * system->max_sparse_matrix_entries;
+    Cuda_Init_Sparse_Matrix( &dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N );
+#ifdef __CUDA_MEM__
+    fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", 
+            system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) );
+/* Builds the device-side interaction lists.
+ * Steps, in order: bin atoms and push the grid to the device, estimate the
+ * number of far neighbors per atom, allocate and generate the far-neighbor
+ * list, estimate storage sizes, allocate the sparse matrix, and allocate
+ * the hydrogen-bond (only when control->hb_cut > 0) and bond lists.
+ * Records the totals in system->num_nbrs, system->num_hbonds and
+ * system->num_bonds. Exits on list-allocation failure. */
+void Cuda_Init_Lists( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
+    int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
+    int *hb_top, *bond_top;
+    real t_start, t_elapsed;
+    grid *g = &( system->g );
+    // the global scratch buffer is reused as an int array of system->N entries
+    int *d_indices = (int *) scratch;
+    int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
+    cuda_memset( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
+#ifdef __BUILD_DEBUG__
+    for (int i = 0; i < g->max_nbrs; i ++)
+    {
+        if ((g->nbrs[i][0] >= g->ncell[0]) ||
+                (g->nbrs[i][1] >= g->ncell[1]) ||
+                (g->nbrs[i][2] >= g->ncell[2]) )
+        {
+            fprintf( stderr, " Grid Incorrectly built.... \n" );
+            exit( 1 );
+        }
+    }
+    // launch layout: one block per grid cell, max_atoms threads per block
+    dim3 blockspergrid( system->g.ncell[0], system->g.ncell[1], system->g.ncell[2] );
+    dim3 threadsperblock( system->g.max_atoms );
+#ifdef __BUILD_DEBUG__
+    fprintf( stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2] );
+    fprintf( stderr, "Estimate Num  Neighbors with threads per block as %d \n", system->d_g.max_atoms );
+    fprintf( stderr, "Max nbrs %d \n", system->d_g.max_nbrs );
+    //First Bin atoms and they sync the host and the device for the grid.
+    //This will copy the atoms from host to device.
+    Cuda_Bin_Atoms( system, workspace );
+    Sync_Host_Device_Grid( &system->g, &system->d_g, cudaMemcpyHostToDevice );
+    k_Estimate_NumNeighbors<<<blockspergrid, threadsperblock >>>
+        (system->d_atoms, system->d_g, system->d_box, 
+         (control_params *)control->d_control, d_indices);
+    cudaThreadSynchronize( );
+    cudaCheckError( );
+    // counts land in nbrs_indices[1..N]; the running sum below turns them into
+    // start offsets, with nbrs_indices[N] holding the total
+    int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
+    memset( nbrs_indices , 0, INT_SIZE * (system->N + 1) );
+    nbrs_indices [0] = 0;
+    copy_host_device( &nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ ); 
+    for (int i = 1; i <= system->N; i++)
+    {
+        nbrs_indices [i] += nbrs_indices [i-1];
+    }
+    num_nbrs = nbrs_indices [system->N] ;
+    system->num_nbrs = num_nbrs;
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]);
+    fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs);
+    list *far_nbrs = (dev_lists + FAR_NBRS);
+    if( !Cuda_Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs) ) {
+        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+        exit( INIT_ERR );
+    }
+#ifdef __CUDA_MEM__
+    fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", 
+            num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
+    // both index arrays start out at each atom's start offset
+    copy_host_device( nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__  );
+    copy_host_device( nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__  );
+    Cuda_Generate_Neighbor_Lists( system, workspace, control, FALSE );
+#ifdef __BUILD_DEBUG__
+    int *end = (int *)malloc( sizeof (int) * system->N );
+    int *start = (int *) malloc( sizeof (int) * system->N );
+    copy_host_device( start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0 );
+    copy_host_device( end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0 );
+    far_neighbor_data *far_data = (far_neighbor_data *) 
+        malloc( FAR_NEIGHBOR_SIZE * num_nbrs );
+    copy_host_device( far_data, far_nbrs->select.far_nbr_list, 
+            FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0 );
+    compare_far_neighbors( nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N );
+    free( start );
+    free( end );
+    int *output, size;
+    // output layout: [0] = Htop, [1] = num_3body, [2 .. 2+N) = hb_top,
+    // [2+N .. 2+2N) = bond_top, i.e. 2*N + 2 ints in total. The byte count must
+    // scale the whole element count by INT_SIZE, not just the 2*N part.
+    size = INT_SIZE * (2 * system->N + 2);
+    output = (int *) malloc (size);
+    Cuda_Estimate_Storage_Sizes( system, control, output );
+    Htop = output[0];
+    num_3body  = output[1];
+    hb_top = &output[ 2 ]; 
+    bond_top = &output[ 2 + system->N ];
+#ifdef __DEBUG_CUDA__
+    int max_hbonds = 0;
+    int min_hbonds = 1000;
+    int max_bonds = 0;
+    int min_bonds = 1000;
+    for (int i = 0; i < system->N; i++)
+    {
+        if ( max_hbonds < hb_top[i])
+        {
+            max_hbonds = hb_top[i];
+        }
+        if (min_hbonds > hb_top[i])
+        {
+            min_hbonds = hb_top[i];
+        }
+        if (max_bonds < bond_top [i])
+        {
+            max_bonds = bond_top[i];
+        }
+        if (min_bonds > bond_top[i])
+        {
+            min_bonds = bond_top[i];
+        }
+    }
+    fprintf( stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds );
+    fprintf( stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds );
+    fprintf( stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body );
+    Allocate_Device_Matrix( system, control, data, workspace, lists, out_control );
+    dev_workspace->num_H = 0;
+    if( control->hb_cut > 0 )
+    {
+        int *hbond_index = (int *) malloc ( INT_SIZE * system->N );
+        // init H indexes 
+        num_hbonds = 0;
+        for( i = 0; i < system->N; ++i )
+        {
+            if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || 
+                    system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2  ) // H atom
+            {
+                //hbond_index[i] = workspace->num_H++;
+                hbond_index[i] = num_hbonds ++;
+            }
+            else 
+            {
+                hbond_index[i] = -1;
+            }
+        }
+        copy_host_device( hbond_index, dev_workspace->hbond_index, 
+                system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX );
+        dev_workspace->num_H = num_hbonds;
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Device num_H --> %d \n", dev_workspace->num_H );
+        Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index, 
+                hb_top, (dev_lists+HBONDS) );
+        // NOTE(review): reads the total from the last hb_top entry - presumably
+        // Cuda_Allocate_HBond_List leaves a running total there; confirm
+        num_hbonds = hb_top[system->N-1];
+        system->num_hbonds = num_hbonds;
+#ifdef __CUDA_MEM__
+        fprintf( stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", 
+                sizeof (hbond_data) * num_hbonds / (1024*1024) );
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Device Total number of HBonds --> %d \n", num_hbonds );
+        free( hbond_index );
+    }
+    // bonds list 
+    Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS );
+    num_bonds = bond_top[system->N-1];
+    system->num_bonds = num_bonds;
+#ifdef __CUDA_MEM__
+    fprintf( stderr, "Device memory allocated: Bonds list: %ld (MB) \n", 
+            sizeof (bond_data) * num_bonds / (1024*1024));
+#ifdef __DEBUG_CUDA__
+   fprintf( stderr, "Device Total Bonds --> %d \n", num_bonds );
+    //    system->max_thb_intrs = num_3body;
+    // 3bodies list 
+    //if(!Cuda_Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES)) {
+    //  fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+    //  exit( INIT_ERR );
+    //}
+    //fprintf( stderr, "***memory allocated: three_body = %ldMB\n", 
+    //   num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) );
+    //fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data));
+    free( output );
+    free( nbrs_indices );
+/* Top-level device initialization.
+ * Computes the launch configuration, seeds the RNG, allocates the scratch
+ * buffer, then initializes - in this order - the system, simulation data,
+ * workspace, control parameters, grid and lists on the device, sets up the
+ * output controls, and builds/copies the long-range lookup table when
+ * control->tabulate is set. The selected integrator is returned through
+ * Evolve (set by Cuda_Init_Simulation_Data). */
+void Cuda_Initialize( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, list **lists, 
+        output_controls *out_control, evolve_function *Evolve )
+    compute_blocks( &BLOCKS, &BLOCK_SIZE, system->N );
+    compute_nearest_pow_2( BLOCKS, &BLOCKS_POW_2 );
+    //MATVEC_BLOCKS = system.N;
+    //MATVEC_BLOCK_SIZE = 32;
+        ((system->N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
+    // NOTE(review): the start of the statement above is not visible in this
+    // excerpt; if it is a round-up block count, the test presumably should use
+    // '%' (remainder) rather than '/' - confirm against the full source
+#ifdef __DEBUG_CUDA__
+    fprintf( stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
+    fprintf( stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE );
+    // sizeof yields size_t; cast so the argument matches the %d conversion
+    fprintf( stderr, " Size of far neighbor data %d \n", (int) sizeof (far_neighbor_data) );
+    fprintf( stderr, " Size of reax_atom %d \n", (int) sizeof (reax_atom) );
+    fprintf( stderr, " size of sparse matrix entry %d \n", (int) sizeof (sparse_matrix_entry) );
+    // system is a pointer here, so the member must be reached with ->
+    fprintf( stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system->N );
+    Randomize( );
+    Cuda_Init_Scratch( );
+    //System
+    // allocate device storage, push host state, then run the device-side setup
+    Cuda_Init_System( system );
+    Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice );
+    Cuda_Init_System( system, control, data );
+    //Simulation Data
+    // the 3-argument Cuda_Init_System above modifies system->atoms on the host
+    // (initial velocities), so the atoms are pushed to the device again
+    copy_host_device( system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
+            cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
+    Cuda_Init_Simulation_Data( data );
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
+    Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
+    Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
+    //static storage
+    Cuda_Init_Workspace_System( system, dev_workspace );
+    Cuda_Init_Workspace( system, control, dev_workspace );
+    Cuda_Init_Workspace_Device( workspace );
+    //control
+    Cuda_Init_Control( control );
+    //Grid
+    Cuda_Init_Grid( &system->g, &system->d_g );
+    //lists
+    Cuda_Init_Lists( system, control, data, workspace, lists, out_control );
+    Init_Out_Controls( system, control, workspace, out_control );
+    if( control->tabulate )
+    {
+        real start, end;
+        start = Get_Time( );
+        Make_LR_Lookup_Table( system, control );
+        copy_LR_table_to_device( system, control );
+        end = Get_Timing_Info( start );
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Done copying the LR table to the device ---> %f \n", end );
+    }
diff --git a/PuReMD-GPU/src/cuda_init_md.h b/PuReMD-GPU/src/cuda_init_md.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a6544b017d31ca0a82651840682c3aae49d1b11
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_init_md.h
@@ -0,0 +1,40 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_INIT_MD_H_
+#define __CUDA_INIT_MD_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
+       static_storage*, list**, output_controls*, evolve_function* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_integrate.cu b/PuReMD-GPU/src/cuda_integrate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cba0b79c39b4f9b66e5b506d11dcffb81adc488d
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_integrate.cu
@@ -0,0 +1,517 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_integrate.h"
+#include "allocate.h"
+#include "box.h"
+#include "forces.h"
+#include "grid.h"
+#include "print_utils.h"
+#include "reset_utils.h"
+#include "system_props.h"
+#include "vector.h"
+#include "list.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+#include "cuda_allocate.h"
+#include "cuda_forces.h"
+#include "cuda_grid.h"
+#include "cuda_neighbors.h"
+#include "cuda_QEq.h"
+#include "cuda_reset_utils.h"
+#include "cuda_system_props.h"
+#include "validation.h"
+/* First half of the velocity-Verlet NVE step, one thread per atom.
+ * Threads whose global index is >= N do nothing.
+ *   atoms : per-atom records; x, v, f and type are accessed
+ *   sbp   : single-body parameters, indexed by atom type to read the mass
+ *   box   : simulation box, forwarded to Inc_on_T3
+ *   N     : number of atoms to process
+ *   dt    : time step */
+GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms,
+        single_body_parameters *sbp,
+        simulation_box *box,
+        int N, real dt)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N)
+    {
+        reax_atom *atom = &atoms[idx];
+        rvec disp;
+        const real dt_sq = SQR(dt);
+        const real mass_inv = 1.0 / sbp[atom->type].mass;
+        /* displacement for x(t + dt), built from v and f */
+        rvec_ScaledSum( disp, dt, atom->v,
+                0.5 * dt_sq * -F_CONV * mass_inv, atom->f );
+        /* the position itself is updated by Inc_on_T3 */
+        Inc_on_T3( atom->x, disp, box );
+        /* half-step velocity update with the current forces */
+        rvec_ScaledAdd( atom->v,
+                0.5 * dt * -F_CONV * mass_inv, atom->f );
+    }
+/* Second half of the velocity-Verlet NVE step, one thread per atom:
+ * adds the half-step force contribution to each atom's velocity.
+ * Threads whose global index is >= N do nothing. */
+GLOBAL void Cuda_Velocity_Verlet_NVE_atoms2 (reax_atom *atoms, single_body_parameters *sbp, int N, real dt)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N)
+    {
+        reax_atom *atom = &atoms[idx];
+        const real mass_inv = 1.0 / sbp[atom->type].mass;
+        rvec_ScaledAdd( atom->v,
+                0.5 * dt * -F_CONV * mass_inv, atom->f );
+    }
+/* Advance the system by one velocity-Verlet step (NVE) on the device.
+ * Order of operations: first-half kernel, device reallocation and reset,
+ * optional neighbor-list regeneration, force computation, second-half kernel.
+ * Neighbor lists are regenerated when the number of steps taken since
+ * data->prev_steps is a multiple of control->reneighbor.
+ * dev_workspace and dev_lists are declared outside this function. */
+void Cuda_Velocity_Verlet_NVE(reax_system* system, control_params* control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
+    int steps, renbr;
+    real dt;
+    int blocks, block_size;
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "step%d: ", data->step );
+    compute_blocks (&blocks, &block_size, system->N);
+    Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>>
+        (system->d_atoms, system->reaxprm.d_sbp, 
+         (simulation_box *)system->d_box, system->N, dt);
+    cudaThreadSynchronize ();
+    /* report launch/execution failures right away, as the NVT integrators do */
+    cudaCheckError ();
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "verlet1 - ");
+    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
+    Cuda_Reset( system, control, data, workspace, lists );
+    if( renbr ) {
+        Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, TRUE);
+    }
+    Cuda_Compute_Forces( system, control, data, workspace, lists, out_control );
+    Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>>
+        (system->d_atoms, system->reaxprm.d_sbp, system->N, dt);
+    cudaThreadSynchronize ();
+    /* same check for the second-half kernel */
+    cudaCheckError ();
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "verlet2\n");
+/* Nose-Hoover step, part 1 (one thread per atom): compute x(t + dt) and
+ * save the current force of each atom into p_workspace.f_old.
+ * The thermostat and workspace structs are received by value; the pointers
+ * they contain are dereferenced inside the kernel.
+ * Threads whose global index is >= N do nothing. */
+GLOBAL void Compute_X_t_dt (real dt, real dt_sqr, thermostat p_therm,
+        reax_atom *atoms, single_body_parameters *sbp,
+        simulation_box *box,
+        static_storage p_workspace, int N)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N)
+    {
+        reax_atom *atom = &atoms[idx];
+        rvec disp;
+        const real mass_inv = 1.0 / sbp[atom->type].mass;
+        /* displacement; the velocity coefficient includes the thermostat term v_xi */
+        rvec_ScaledSum( disp, dt - 0.5 * dt_sqr * p_therm.v_xi, atom->v,
+                0.5 * dt_sqr * mass_inv * -F_CONV, atom->f );
+        Inc_on_T3( atom->x, disp, box );
+        /* keep the old force for the later velocity update */
+        rvec_Copy( p_workspace.f_old[idx], atom->f );
+    }
+/* Nose-Hoover step, part 2 (one thread per atom): build the per-atom
+ * iteration constant p_workspace.v_const[i] from the current velocity,
+ * the saved old force (f_old) and the current force.
+ * Threads whose global index is >= N do nothing. */
+GLOBAL void Update_Velocity (reax_atom *atoms, single_body_parameters *sbp,
+        static_storage p_workspace, real dt, thermostat p_therm,
+        int N)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N)
+    {
+        reax_atom *atom = &atoms[idx];
+        const real mass_inv = 1.0 / sbp[atom->type].mass;
+        /* velocity term, scaled with the thermostat velocity v_xi */
+        rvec_Scale( p_workspace.v_const[idx],
+                1.0 - 0.5 * dt * p_therm.v_xi, atom->v );
+        /* contribution of the force saved before the force recomputation */
+        rvec_ScaledAdd( p_workspace.v_const[idx],
+                0.5 * dt * mass_inv * -F_CONV, p_workspace.f_old[idx] );
+        /* contribution of the current force */
+        rvec_ScaledAdd( p_workspace.v_const[idx],
+                0.5 * dt * mass_inv * -F_CONV, atom->f );
+    }
+/* Sets v = coef_v * v_const for every atom and reduces the kinetic energy
+ * 0.5 * m * (v . v) over each thread block.
+ * Each block writes its partial sum to per_block_results[blockIdx.x].
+ * Uses one real of dynamic shared memory per thread (sdata[threadIdx.x]).
+ * NOTE(review): the halving loop covers every thread only when blockDim.x
+ * is a power of two -- confirm against the launch configuration. */
+GLOBAL void E_Kin_Reduction (reax_atom *atoms, static_storage p_workspace,
+        single_body_parameters *sbp,
+        real *per_block_results, real coef_v, const size_t n)
+    extern __shared__ real sdata[];
+    const unsigned int tid = threadIdx.x;
+    const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    real e_kin = 0;
+    if (gid < n)
+    {
+        reax_atom *atom = &atoms[gid];
+        rvec_Scale( atom->v, coef_v, p_workspace.v_const[gid] );
+        e_kin = ( 0.5 * sbp[atom->type].mass *
+                rvec_Dot( atom->v, atom->v ) );
+    }
+    /* threads past the end contribute 0 */
+    sdata[tid] = e_kin;
+    __syncthreads();
+    unsigned int stride = blockDim.x / 2;
+    while (stride > 0)
+    {
+        if (tid < stride)
+        {
+            sdata[tid] += sdata[tid + stride];
+        }
+        /* barrier is outside the branch so every thread reaches it */
+        __syncthreads();
+        stride >>= 1;
+    }
+    if (tid == 0)
+    {
+        per_block_results[blockIdx.x] = sdata[0];
+    }
+/* One velocity-Verlet step with a Nose-Hoover thermostat (NVT) on the device.
+ * Steps: compute x(t + dt) on the device, update xi on the host, reallocate /
+ * reset / optionally regenerate neighbor lists, compute forces, build the
+ * per-atom velocity constants, then iterate on the thermostat velocity until
+ * two successive estimates differ by at most 1e-5.
+ * Each iteration rescales the velocities and reduces the kinetic energy on
+ * the device; the result is copied back to the host through the scratch area.
+ * scratch, dev_workspace and dev_lists are declared outside this function. */
+void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
+        control_params* control, 
+        simulation_data *data, 
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
+    int itr, steps, renbr;
+    real coef_v, dt, dt_sqr;
+    real E_kin_new, G_xi_new, v_xi_new, v_xi_old;
+    thermostat *therm;
+    real *results = (real *)scratch;
+    dt = control->dt;
+    dt_sqr = SQR( dt );
+    therm = &( data->therm );
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old);
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "step%d: ", data->step );
+    /* the box pointer is cast the same way as in the other integrators */
+    Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>>
+        (dt, dt_sqr, data->therm, system->d_atoms, 
+         system->reaxprm.d_sbp, (simulation_box *)system->d_box,
+         *dev_workspace, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    /* Compute xi(t + dt) */
+    therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "verlet1 - " );
+    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
+    Cuda_Reset( system, control, data, workspace, lists );
+    if( renbr )
+    {
+        //generate_neighbor_lists here
+        Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, TRUE);
+    }
+    /* Calculate Forces at time (t + dt) */
+    Cuda_Compute_Forces( system,control,data, workspace, lists, out_control );
+    /* Compute iteration constants for each atom's velocity */
+    Update_Velocity <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace,
+         dt, *therm, system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+    E_kin_new = G_xi_new = v_xi_old = 0;
+    itr = 0;
+    do {
+        itr++;      
+        /* new values become old in this iteration */
+        v_xi_old = v_xi_new;
+        coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+        E_kin_new = 0;
+#ifdef __DEBUG_CUDA__
+        fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
+        /*reduction for the E_Kin_new here*/
+        cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH );
+        /* stage 1: per-block partial sums into results[0 .. BLOCKS_POW_2) */
+        E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
+            (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, 
+             results, coef_v, system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        /* stage 2: final sum into results[BLOCKS_POW_2] */
+        Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+            (results, results + BLOCKS_POW_2, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); 
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
+                data->N_f * K_B * control->T );
+        v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+#if defined(DEBUG)
+        fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
+                itr, G_xi_new, v_xi_new, v_xi_old );
+    }
+    while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " Iteration Count in NVT --> %d \n", itr );
+    therm->v_xi_old = therm->v_xi;
+    therm->v_xi = v_xi_new;
+    therm->G_xi = G_xi_new;  
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr,"vel scale\n" );
+/* Berendsen NVT, velocity verlet 1st part (one thread per atom):
+ * compute x(t + dt) and v(t + dt/2).
+ * Threads whose global index is >= N do nothing.
+ *   atoms : per-atom records; x, v, f and type are accessed
+ *   sbp   : single-body parameters, indexed by atom type to read the mass
+ *   dt    : time step
+ *   box   : simulation box, forwarded to Inc_on_T3
+ *   N     : number of atoms to process */
+GLOBAL void ker_update_velocity_1 (reax_atom *atoms,
+        single_body_parameters *sbp,
+        real dt,
+        simulation_box *box,
+        int N)
+    real inv_m;
+    rvec dx;
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N ) return;
+    /* velocity verlet, 1st part */
+    atom = &(atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute x(t + dt) */
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    /* Apply dx exactly once. The NVE and Nose-Hoover kernels in this file
+     * rely on Inc_on_T3 alone to move the atom, so the extra
+     * rvec_Add( atom->x, dx ) that used to precede this call displaced every
+     * atom by 2 * dx per step. */
+    Inc_on_T3( atom->x, dx, box );
+    /* Compute v(t + dt/2) */
+    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+/* Host wrapper: launches ker_update_velocity_1 over system->N atoms with the
+ * BLOCKS / BLOCK_SIZE configuration, waits for it to finish and checks for
+ * CUDA errors. box is handed to the kernel unchanged. */
+void bNVT_update_velocity_part1 (reax_system *system, simulation_box *box, real dt)
+    const int n_atoms = system->N;
+    ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE >>> (
+            system->d_atoms,
+            system->reaxprm.d_sbp,
+            dt,
+            box,
+            n_atoms );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+/* Berendsen NVT, velocity verlet 2nd part (one thread per atom):
+ * compute v(t + dt) from the current forces.
+ * Threads whose global index is >= N do nothing. */
+GLOBAL void ker_update_velocity_2 (reax_atom *atoms,
+        single_body_parameters *sbp,
+        real dt,
+        int N)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( idx < N )
+    {
+        reax_atom *cur = &(atoms[idx]);
+        const real mass_inv = 1.0 / sbp[cur->type].mass;
+        /* Compute v(t + dt) */
+        rvec_ScaledAdd( cur->v, 0.5 * dt * -F_CONV * mass_inv, cur->f );
+    }
+/* Host wrapper: launches ker_update_velocity_2 over system->N atoms with the
+ * BLOCKS / BLOCK_SIZE configuration, waits for it to finish and checks for
+ * CUDA errors. */
+void bNVT_update_velocity_part2 (reax_system *system, real dt)
+    const int n_atoms = system->N;
+    ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>> (
+            system->d_atoms,
+            system->reaxprm.d_sbp,
+            dt,
+            n_atoms );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+/* Scales the velocity of every atom in place by lambda, one thread per atom.
+ * Threads whose global index is >= N do nothing. */
+GLOBAL void ker_scale_velocities (reax_atom *atoms, real lambda, int N)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( idx < N )
+    {
+        reax_atom *cur = &(atoms[idx]);
+        /* source and destination are the same vector */
+        rvec_Scale( cur->v, lambda, cur->v );
+    }
+/* Host wrapper: launches ker_scale_velocities over system->N atoms with the
+ * BLOCKS / BLOCK_SIZE configuration, waits for it to finish and checks for
+ * CUDA errors. */
+void bNVT_scale_velocities (reax_system *system, real lambda)
+    const int n_atoms = system->N;
+    ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>> (
+            system->d_atoms,
+            lambda,
+            n_atoms );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+/* One velocity-Verlet step followed by a velocity rescale (Berendsen NVT),
+ * executed on the device.
+ * Steps: 1st velocity-verlet part, reallocate / reset / optionally regenerate
+ * neighbor lists, compute forces, 2nd velocity-verlet part, compute the
+ * kinetic energy, copy the thermostat data back to the host, derive the
+ * scaling factor lambda (clamped to [MIN_dT, MAX_dT] before the square
+ * root), scale the velocities and recompute the kinetic energy.
+ * dev_workspace and dev_lists are declared outside this function. */
+void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system,
+        control_params* control,
+        simulation_data *data,
+        static_storage *workspace,
+        list **lists,
+        output_controls *out_control
+        )
+    int steps, renbr;
+    real dt, lambda;
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "step%d\n", data->step );
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
+    /* velocity verlet, 1st part: x(t + dt) and v(t + dt/2), on the device */
+    bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt);
+#if defined(DEBUG_FOCUS)
+    fprintf(stderr, "step%d: verlet1 done\n", data->step);
+    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
+    Cuda_Reset( system, control, data, workspace, lists );
+    if( renbr ) {
+        /* pass dev_workspace, as the NVE and Nose-Hoover integrators do;
+         * this call used to receive the host-side workspace instead */
+        Cuda_Generate_Neighbor_Lists( system, dev_workspace, control, TRUE);
+    }
+    Cuda_Compute_Forces( system, control, data, workspace,
+            lists, out_control );
+    /* velocity verlet, 2nd part: v(t + dt), on the device */
+    bNVT_update_velocity_part2 (system, dt);
+#if defined(DEBUG_FOCUS)  
+    fprintf(stderr, "step%d: verlet2 done\n", data->step);
+    /* temperature scaler */
+    Cuda_Compute_Kinetic_Energy( system, data );
+    //get the latest temperature from the device to the host.
+    copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm,
+            sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if( lambda < MIN_dT )
+        lambda = MIN_dT;
+    else if (lambda > MAX_dT )
+        lambda = MAX_dT;
+    lambda = SQRT( lambda );
+    /* Scale velocities at t+dt on the device */
+    bNVT_scale_velocities (system, lambda);
+    Cuda_Compute_Kinetic_Energy( system, data );
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "step%d: scaled velocities\n",
+            data->step );
diff --git a/PuReMD-GPU/src/cuda_integrate.h b/PuReMD-GPU/src/cuda_integrate.h
new file mode 100644
index 0000000000000000000000000000000000000000..959b6684fef618b86252a5606b6feb4a6d093f15
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_integrate.h
@@ -0,0 +1,46 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_INTEGRATE_H_
+#define __CUDA_INTEGRATE_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+void Cuda_Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
+        static_storage*, list**, output_controls* );
+void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
+        simulation_data*, static_storage*,
+        list**, output_controls* );
+void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* ,
+        simulation_data *, static_storage *,
+        list **, output_controls * );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_lin_alg.cu b/PuReMD-GPU/src/cuda_lin_alg.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5dc9eb3518ed8b46174838a498cfdba62d34e989
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_lin_alg.cu
@@ -0,0 +1,589 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_lin_alg.h"
+#include "list.h"
+#include "vector.h"
+#include "index_utils.h"
+#include "cuda_copy.h"
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
+#include "system_props.h"
+#include "cublas_v2.h"
+#include "cusparse_v2.h"
+/* Sparse matrix-vector product results = H * vec, one thread per row.
+ * Row i covers the entries H.entries[H.start[i] .. H.end[i]); each entry
+ * supplies a column index (j) and a value (val).
+ * Threads whose global index is >= rows do nothing. */
+GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows)
+    const int row = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( row < rows )
+    {
+        real acc = 0;
+        for (int e = H.start[row]; e < H.end[row]; e++)
+        {
+            acc += H.entries[e].val * vec[ H.entries[e].j ];
+        }
+        results[row] = acc;
+    }
+/* Sparse matrix-vector product results = H * vec, with 32 consecutive
+ * threads cooperating on each matrix row.
+ * The row is the global thread id divided by 32; the lane is the global
+ * thread id modulo 32.
+ * Requires one real of dynamic shared memory per thread of the block
+ * (vals is indexed by threadIdx.x).
+ * NOTE(review): the reduction reads vals[threadIdx.x + 16], so the block
+ * size is presumably a multiple of 32 -- confirm against the launch sites. */
+GLOBAL void Cuda_Matvec_csr (sparse_matrix H, real *vec, real *results, int num_rows)
+    extern __shared__ real vals [];
+    const int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const int row = gid / 32;
+    const int lane = gid & (32 - 1);
+    const bool in_range = (row < num_rows);
+    real partial = 0;
+    if (in_range)
+    {
+        /* each lane accumulates every 32nd entry of its row */
+        const int first = H.start[row];
+        const int last = H.end[row];
+        for (int e = first + lane; e < last; e += 32)
+        {
+            partial += H.entries[e].val * vec[ H.entries[e].j ];
+        }
+    }
+    /* rows past the end contribute 0 so every thread can take part below */
+    vals[threadIdx.x] = partial;
+    __syncthreads ();
+    /* parallel reduction in shared memory over the 32 lanes of a row;
+     * a block-wide barrier follows every halving step */
+    for (int span = 16; span > 0; span >>= 1)
+    {
+        if (lane < span)
+        {
+            vals[threadIdx.x] += vals[threadIdx.x + span];
+        }
+        __syncthreads ();
+    }
+    /* first lane of the row writes the result */
+    if (lane == 0 && in_range)
+    {
+        results[row] = vals[threadIdx.x];
+    }
+/* Element-wise product b_proc[i] = b[i] * Hdia_inv[i], one thread per entry.
+ * b_proc and b may be the same array (callers in this file use it in place).
+ * Threads whose global index is >= entries do nothing. */
+GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries)
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < entries)
+    {
+        b_proc[idx] = b[idx] * Hdia_inv[idx];
+    }
+/* Givens-rotation update of column j of the matrix h.
+ * Rotations 0..j are applied to column j in order; rotation j itself
+ * (hc[j], hs[j]) is computed from h(j,j) and h(j+1,j) just before it is
+ * applied. The rotated right-hand-side pair is written to output:
+ *   output[0] =  hc[j] * g_j
+ *   output[1] = -hs[j] * g_j
+ * The body is serial (no thread indexing); it is launched as <<<1, 1>>>. */
+GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output)
+    for( int i = 0; i <= j; i++ )
+    {
+        real *upper = &h[ index_wkspace_res (i,j) ];
+        real *lower = &h[ index_wkspace_res (i+1,j) ];
+        if( i == j )
+        {
+            /* new rotation that zeroes the sub-diagonal entry */
+            const real norm = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) );
+            hc[j] = h[ index_wkspace_res (j,j) ] / norm;
+            hs[j] = h[ index_wkspace_res (j+1,j) ] / norm;
+        }
+        /* both results are computed before either entry is overwritten */
+        const real rot_upper =  hc[i] * (*upper) + hs[i] * (*lower);
+        const real rot_lower = -hs[i] * (*upper) + hc[i] * (*lower);
+        *upper = rot_upper;
+        *lower = rot_lower;
+    }
+    /* apply Givens rotations to the rhs as well */
+    output[0] =  hc[j] * g_j;
+    output[1] = -hs[j] * g_j;
+/* Back-substitution for the leading j x j upper-triangular part of h:
+ * for rows j-1 down to 0, y[row] = (g[row] - sum_{col > row} h(row,col) * y[col])
+ * / h(row,row). The body is serial (no thread indexing); it is launched
+ * as <<<1, 1>>>. */
+GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y)
+    for( int row = j - 1; row >= 0; --row )
+    {
+        real acc = g[row];
+        int col = j - 1;
+        /* subtract already-solved unknowns, highest column first */
+        while( col > row )
+        {
+            acc -= h[ index_wkspace_res (row,col) ] * y[col];
+            --col;
+        }
+        y[row] = acc / h[ index_wkspace_res (row,row) ];
+    }
+/* Restarted GMRES with a diagonal preconditioner, driven from the host with
+ * every vector operation executed by a device kernel.
+ *   workspace : host struct whose H, b_prc, b_prm, v, h, hc, hs, g, y and
+ *               Hdia_inv members are handed to the kernels
+ *   b         : right-hand side (passed to kernels)
+ *   tol       : tolerance on fabs(g[j]) / bnorm
+ *   x         : initial guess on entry, updated in place (x = x_0 + Vy)
+ * Returns itr * (RESTART+1) + j + 1; prints a message first when MAX_ITR
+ * outer iterations were used up.
+ * The host-side array g is released on every return path below. */
+int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x )
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    real v_add_tmp;
+    sparse_matrix *H = &workspace->H;
+    real t_start, t_elapsed;
+    real *spad = (real *)scratch;
+    /* host copy of the rotated right-hand side, RESTART+1 entries */
+    real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
+    N = H->n;
+    cuda_memset(spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+        (b, spad, H->n, INITIAL);
+    cudaThreadSynchronize();
+    cudaCheckError();
+    Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+        (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
+    cudaThreadSynchronize();
+    cudaCheckError();
+    copy_host_device( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE,
+            cudaMemcpyDeviceToHost, __LINE__);
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Norm of the array is %e \n", bnorm );
+    /* apply the diagonal pre-conditioner to rhs */
+    GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+        (workspace->b_prc, b, workspace->Hdia_inv, N);
+    cudaThreadSynchronize();
+    cudaCheckError();
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        //Sparse_MatVec( H, x, workspace->b_prm );      
+            ( *H, x, workspace->b_prm, N );
+        cudaThreadSynchronize();
+        cudaCheckError();
+        GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
+            (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
+        cudaThreadSynchronize();
+        cudaCheckError();
+        Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
+            (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,
+             workspace->b_prc, -1., workspace->b_prm, N);
+        cudaThreadSynchronize();
+        cudaCheckError ();
+        //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N );
+        {
+            cuda_memset( spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+            Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+                (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
+            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+                (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
+            copy_host_device( g, workspace->g, REAL_SIZE,
+                    cudaMemcpyDeviceToHost, RES_STORAGE_G);
+        }
+        Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>>
+            ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0],
+              &workspace->v[index_wkspace_sys(0,0,N)], N );
+        cudaThreadSynchronize();
+        cudaCheckError();
+        /* GMRES inner-loop */
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr,
+                " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm,
+                tol, g[0] );
+        for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
+                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)],
+                  &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
+            cudaThreadSynchronize();
+            cudaCheckError();
+            GMRES_Diagonal_Preconditioner<<<BLOCKS, BLOCK_SIZE>>>
+                (&workspace->v[ index_wkspace_sys (j+1,0,N) ],
+                 &workspace->v[ index_wkspace_sys( j+1,0,N) ],
+                 workspace->Hdia_inv, N );
+            cudaThreadSynchronize();
+            cudaCheckError();
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i <= j; i++ )
+            {
+                Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+                    (&workspace->v[index_wkspace_sys(i,0,N)],
+                     &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
+                cudaThreadSynchronize();
+                cudaCheckError();
+                Cuda_reduction<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+                    (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
+                cudaThreadSynchronize();
+                cudaCheckError();
+                copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+                Cuda_Vector_Add<<< BLOCKS, BLOCK_SIZE >>>
+                    ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+                      -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+                cudaThreadSynchronize();
+                cudaCheckError();
+            }
+            //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+            cuda_memset(spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
+            Cuda_Norm<<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+                (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
+            Cuda_Norm<<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>>
+                (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
+            cudaThreadSynchronize();
+            cudaCheckError();
+            copy_host_device(&v_add_tmp,
+                    &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            Cuda_Vector_Scale<<< BLOCKS, BLOCK_SIZE >>>
+                ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+                  1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
+            cudaThreadSynchronize();
+            cudaCheckError();
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            GMRES_Givens_Rotation<<<1, 1>>>
+                (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
+            cudaThreadSynchronize();
+            cudaCheckError();
+            /* the kernel leaves the rotated pair in spad[0..1]: g[j], g[j+1] */
+            copy_host_device(&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+        }
+        copy_host_device(g, workspace->g, (RESTART+1)*REAL_SIZE,
+                cudaMemcpyHostToDevice, __LINE__);
+        /* solve Hy = g.
+           H is now upper-triangular, do back-substitution */
+        copy_host_device(g, spad, (RESTART+1) * REAL_SIZE,
+                cudaMemcpyHostToDevice, RES_STORAGE_G);
+        GMRES_BackSubstitution<<<1, 1>>>
+            (j, spad, workspace->h, workspace->y);
+        cudaThreadSynchronize();
+        cudaCheckError();
+        /* update x = x_0 + Vy */
+        for( i = 0; i < j; i++ )
+        {
+            copy_host_device(&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
+                ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+            cudaThreadSynchronize ();
+            cudaCheckError();
+        }
+        /* stopping condition */
+        if( fabs(g[j]) / bnorm <= tol )
+        {
+            break;
+        }
+    }
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        /* g is not needed to form the return value */
+        free( g );
+        return itr * (RESTART+1) + j + 1;
+    }
+    /* release the host-side array; it was previously leaked on every call */
+    free( g );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
+    return itr * (RESTART+1) + j + 1;
+int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real tol, real *x )
+    real CSR_ALPHA = 1, CSR_BETA = 0;
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    real v_add_tmp;
+    sparse_matrix *H = &workspace->H;
+    real t_start, t_elapsed;
+    real *spad = (real *)scratch;
+    real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
+    cublasHandle_t cublasHandle;
+    N = H->n;
+    cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+    /*
+       Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
+       Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
+       copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+     */
+    cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm ));
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Norm of the array is %e \n", bnorm );
+    /* apply the diagonal pre-conditioner to rhs */
+    GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+        (workspace->b_prc, b, workspace->Hdia_inv, N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        //Sparse_MatVec( H, x, workspace->b_prm );      
+        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
+            (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        /*
+           Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
+           (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
+           cudaThreadSynchronize ();
+           cudaCheckError ();
+         */
+        cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V);
+        double D_ONE = 1.;
+        double D_MINUS_ONE = -1.;
+        cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
+        cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
+        //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N );
+        {
+            /*
+               cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+               Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+               (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+               Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+               copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
+             */
+            cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g ));
+            copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
+        }
+        /*
+           Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+           ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
+           cudaThreadSynchronize ();
+           cudaCheckError ();
+         */
+        double D_SCALE = 1.0 / g[0];
+        cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
+        /* GMRES inner-loop */
+#ifdef __DEBUG_CUDA__
+        fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
+        for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            Cuda_Matvec_csr 
+                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+                (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i <= j; i++ ) {
+                /*
+                   Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+                   (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+                   Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+                   copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+                 */
+                cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
+                            &workspace->v[index_wkspace_sys(j+1,0,N)], 1, 
+                            &v_add_tmp));
+                copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+                /*
+                   Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
+                   ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+                   -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+                 */
+                double NEG_V_ADD_TMP = -v_add_tmp;
+                cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
+                            &workspace->v[index_wkspace_sys(j+1,0,N)], 1 ));
+            }
+            //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+            /*
+               cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
+               Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+               Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+               copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+             */
+            cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp ));
+            copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+            /*
+               Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+               ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+               1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+             */
+            double REC_V_ADD_TMP = 1. / v_add_tmp;
+            cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP,  &workspace->v[index_wkspace_sys(j+1,0,N)], 1));
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            GMRES_Givens_Rotation <<<1, 1>>>
+                (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+        }
+        copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+        /* solve Hy = g.
+           H is now upper-triangular, do back-substitution */
+        copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
+        GMRES_BackSubstitution <<<1, 1>>>
+            (j, spad, workspace->h, workspace->y);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        /* update x = x_0 + Vy */
+        for( i = 0; i < j; i++ )
+        {
+            /*
+               copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+               Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
+               ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+             */
+            copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
+                        x, 1));
+        }
+        /* stopping condition */
+        if( fabs(g[j]) / bnorm <= tol )
+            break;
+    }
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        return itr * (RESTART+1) + j + 1;
+    }
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
+    return itr * (RESTART+1) + j + 1;
diff --git a/PuReMD-GPU/src/cuda_lin_alg.h b/PuReMD-GPU/src/cuda_lin_alg.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b464152280f3590c1c70761c1b3ef206779cf5b
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_lin_alg.h
@@ -0,0 +1,43 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_LIN_ALG_H_
+#define __CUDA_LIN_ALG_H_
+/* Sign of x as an int: -1 when x < 0.0, otherwise 1 (zero maps to 1).
+ * Both the argument and the whole expansion are parenthesized and the
+ * expansion carries no trailing semicolon, so the macro can be used inside
+ * larger expressions (SIGN(x) * y) and with arguments that contain
+ * low-precedence operators (SIGN(c ? a : b)). x is evaluated once. */
+#define SIGN(x) ((x) < 0.0 ? -1 : 1)
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+/* Sparse matrix-vector product kernels taking (matrix, real *, real *, int).
+ * NOTE(review): GLOBAL presumably expands to __global__ -- confirm in
+ * mytypes.h. The solver in cuda_lin_alg.cu launches Cuda_Matvec_csr as
+ * (matrix, input vector, output vector, N) with
+ * REAL_SIZE * MATVEC_BLOCK_SIZE bytes of dynamic shared memory. */
+GLOBAL void Cuda_Matvec (sparse_matrix , real *, real *, int );
+GLOBAL void Cuda_Matvec_csr (sparse_matrix , real *, real *, int );
+/* GMRES solvers taking the right-hand side b, the tolerance tol and the
+ * solution vector x; both return an int.
+ * NOTE(review): the return value is presumably an iteration count --
+ * verify against the definitions. */
+int Cuda_GMRES( static_storage *, real *b, real tol, real *x );
+int Cublas_GMRES( reax_system *, static_storage *, real *b, real tol, real *x );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_list.cu b/PuReMD-GPU/src/cuda_list.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5375297c8da15000d673cc70ef5cad3a91dae541
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_list.cu
@@ -0,0 +1,114 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_list.h"
+#include "cuda_utils.h"
+/* Allocate the storage of an interaction list through cuda_malloc.
+ *
+ * n:         number of entries; sizes the index and end_index arrays
+ * num_intrs: capacity of the interaction array, in elements
+ * type:      one of TYP_FAR_NEIGHBOR, TYP_HBOND, TYP_BOND or TYP_THREE_BODY;
+ *            selects which member of l->select is allocated
+ * l:         list whose fields are filled in
+ *
+ * Returns 1. An unknown type prints a message to stderr and terminates the
+ * program with exit status 1.
+ * NOTE(review): allocation failures are not reflected in the return value;
+ * presumably cuda_malloc reports them itself -- confirm in cuda_utils. */
+char Cuda_Make_List(int n, int num_intrs, int type, list* l)
+{
+    char success=1;
+
+    l->n = n;
+    l->num_intrs = num_intrs;
+    /* Cuda_Delete_List dispatches on l->type, so the kind has to be stored
+       with the list; otherwise the interaction array cannot be released
+       reliably. */
+    l->type = type;
+
+    /* NOTE(review): the third cuda_malloc argument (1) is presumably a
+       "zero the memory" flag -- confirm in cuda_utils. */
+    cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX );
+    cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX );
+
+    switch(type)
+    {
+        case TYP_FAR_NEIGHBOR:
+            cuda_malloc ((void **) &l->select.far_nbr_list, 
+                    l->num_intrs*sizeof(far_neighbor_data), 
+                    1, LIST_FAR_NEIGHBOR_DATA);
+            /* disabled alternative, kept for reference: mapped host memory
+               cudaHostAlloc ((void **) &l->select.far_nbr_list, 
+               l->num_intrs*sizeof(far_neighbor_data),
+               cudaHostAllocMapped);
+               cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, 
+               (void *)l->select.far_nbr_list, 0);
+             */
+            break;
+        case TYP_HBOND:
+            cuda_malloc ((void **) &l->select.hbond_list,
+                    l->num_intrs * sizeof(hbond_data),
+                    1, LIST_HBOND_DATA );
+            break;            
+        case TYP_BOND:
+            cuda_malloc ((void **) &l->select.bond_list,
+                    l->num_intrs * sizeof(bond_data),
+                    1, LIST_BOND_DATA );
+            break;            
+        case TYP_THREE_BODY:
+            cuda_malloc ( (void **) &l->select.three_body_list, 
+                    l->num_intrs * sizeof(three_body_interaction_data), 
+                    1, LIST_THREE_BODY_DATA );
+            break;
+        default: 
+            fprintf (stderr, "Unknown list creation \n" );
+            exit (1);
+    }
+
+    return success;
+}
+/* Release every buffer that belongs to a list.
+ * The index and end_index arrays are released first; afterwards the member
+ * of l->select named by l->type is released. NULL pointers are skipped.
+ * An unrecognized l->type prints a message to stderr and exits with
+ * status 1. */
+void Cuda_Delete_List(list* l)
+{
+    /* per-entry offset arrays */
+    if (NULL != l->index)
+    {
+        cuda_free (l->index, LIST_INDEX );
+    }
+    if (NULL != l->end_index)
+    {
+        cuda_free (l->end_index, LIST_END_INDEX );
+    }
+
+    /* interaction array, chosen by the kind stored in the list */
+    if (TYP_FAR_NEIGHBOR == l->type)
+    {
+        if (NULL != l->select.far_nbr_list)
+        {
+            cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA);
+        }
+    }
+    else if (TYP_HBOND == l->type)
+    {
+        if (NULL != l->select.hbond_list)
+        {
+            cuda_free (l->select.hbond_list, LIST_HBOND_DATA );
+        }
+    }
+    else if (TYP_BOND == l->type)
+    {
+        if (NULL != l->select.bond_list)
+        {
+            cuda_free (l->select.bond_list, LIST_BOND_DATA );
+        }
+    }
+    else if (TYP_THREE_BODY == l->type)
+    {
+        if (NULL != l->select.three_body_list)
+        {
+            cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA );
+        }
+    }
+    else
+    {
+        fprintf (stderr, "Unknown list deletion \n" );
+        exit (1);
+    }
+}
diff --git a/PuReMD-GPU/src/helpers.h b/PuReMD-GPU/src/cuda_list.h
similarity index 82%
rename from PuReMD-GPU/src/helpers.h
rename to PuReMD-GPU/src/cuda_list.h
index aa1bb62080ee5cdd9b92718c21124a387e248bad..cafc85743b9eb6b31da5b35b063361ef97fddf54 100644
--- a/PuReMD-GPU/src/helpers.h
+++ b/PuReMD-GPU/src/cuda_list.h
@@ -18,12 +18,22 @@
-#ifndef __HELPERS_H__
-#define __HELPERS_H__
+#ifndef __CUDA_LIST_H_
+#define __CUDA_LIST_H_
 #include "mytypes.h"
-GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N,
-                               simulation_box *box, real d1, real d2, real d3);
+#ifdef __cplusplus
+extern "C"  {
+/* Allocates the index, end_index and type-specific interaction arrays of a
+ * list via cuda_malloc. Arguments are (n, num_intrs, type, list): n sizes
+ * the two index arrays, num_intrs sizes the interaction array and type is a
+ * TYP_* constant. Exits the program on an unknown type. */
+char Cuda_Make_List( int, int, int, list* );
+/* Frees the index, end_index and interaction arrays of a list through
+ * cuda_free, skipping NULL pointers. Exits the program when the list's
+ * type field holds an unknown value. */
+void Cuda_Delete_List( list* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/lookup.cu b/PuReMD-GPU/src/cuda_lookup.cu
similarity index 60%
rename from PuReMD-GPU/src/lookup.cu
rename to PuReMD-GPU/src/cuda_lookup.cu
index c6cc23cfcb431d3fbb2921068fa958816b5287fc..9804ad8b6afee5b0c7e04ea37f46d9c206c5ba68 100644
--- a/PuReMD-GPU/src/lookup.cu
+++ b/PuReMD-GPU/src/cuda_lookup.cu
@@ -18,37 +18,16 @@
-#include "lookup.h"
-#include "two_body_interactions.h"
+#include "cuda_lookup.h"
-#include "cuda_utils.h"
 #include "index_utils.h"
-void Make_Lookup_Table(real xmin, real xmax, int n,
-        lookup_function f, lookup_table* t)
-    int i;
-    t->xmin = xmin;
-    t->xmax = xmax;
-    t->n = n;
-    t->dx = (xmax - xmin)/(n-1);
-    t->inv_dx = 1.0 / t->dx;
-    t->a = (n-1)/(xmax-xmin);
-    t->y = (real*) malloc(n*sizeof(real));
-    for(i=0; i < n; i++)
-        t->y[i] = f(i*t->dx + t->xmin);
-    // //fprintf(stdout,"dx = %lf\n",t->dx);
-    // for(i=0; i < n; i++)
-    //   //fprintf( stdout,"%d %lf %lf %lf\n", 
-    //            i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) );
+#include "cuda_utils.h"
+#include "cuda_two_body_interactions.h"
 /* Fills solution into x. Warning: will modify c and d! */
-HOST_DEVICE void Tridiagonal_Solve( const real *a, const real *b, 
+DEVICE void Tridiagonal_Solve( const real *a, const real *b,
         real *c, real *d, real *x, unsigned int n){
     int i;
     real id;
@@ -68,6 +47,7 @@ HOST_DEVICE void Tridiagonal_Solve( const real *a, const real *b,
         x[i] = d[i] - c[i] * x[i + 1];
 GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b, 
         real *c, real *d, real *x, unsigned int n)
@@ -75,65 +55,6 @@ GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b,
-void Natural_Cubic_Spline( const real *h, const real *f, 
-        cubic_spline_coef *coef, unsigned int n )
-    int i;
-    real *a, *b, *c, *d, *v;
-    /* allocate space for the linear system */
-    a = (real*) malloc( n * sizeof(real) );
-    b = (real*) malloc( n * sizeof(real) );
-    c = (real*) malloc( n * sizeof(real) );
-    d = (real*) malloc( n * sizeof(real) );
-    v = (real*) malloc( n * sizeof(real) );
-    /* build the linear system */
-    a[0] = a[1] = a[n-1] = 0;
-    for( i = 2; i < n-1; ++i )
-        a[i] = h[i-1];
-    b[0] = b[n-1] = 0;
-    for( i = 1; i < n-1; ++i )
-        b[i] = 2 * (h[i-1] + h[i]); 
-    c[0] = c[n-2] = c[n-1] = 0;
-    for( i = 1; i < n-2; ++i )
-        c[i] = h[i];
-    d[0] = d[n-1] = 0;
-    for( i = 1; i < n-1; ++i )
-        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
-    /*//fprintf( stderr, "i  a        b        c        d\n" );
-      for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
-    v[0] = 0;
-    v[n-1] = 0;
-    Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-    for( i = 1; i < n; ++i ){
-        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-        coef[i-1].c = v[i]/2;
-        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-        coef[i-1].a = f[i];
-    }
-    /*//fprintf( stderr, "i  v  coef\n" );
-      for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 GLOBAL void cubic_spline_init_a ( real *a, const real *h, int n )
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -146,6 +67,7 @@ GLOBAL void cubic_spline_init_a ( real *a, const real *h, int n )
 GLOBAL void cubic_spline_init_b (real *b, const real *h, int n )
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -158,6 +80,7 @@ GLOBAL void cubic_spline_init_b (real *b, const real *h, int n )
 GLOBAL void cubic_spline_init_c (real *c, const real *h, int n )
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -170,6 +93,7 @@ GLOBAL void cubic_spline_init_c (real *c, const real *h, int n )
 GLOBAL void cubic_spline_init_d (real *d, const real *f, const real *h, int n )
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -182,6 +106,7 @@ GLOBAL void cubic_spline_init_d (real *d, const real *f, const real *h, int n )
 GLOBAL void calculate_cubic_spline_coef ( const real *f, real *v, const real *h, LR_lookup_table *data, int offset, int n )
     cubic_spline_coef *coef;
@@ -270,66 +195,6 @@ void Cuda_Natural_Cubic_Spline( const real *h, const real *f,
-void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
-        cubic_spline_coef *coef, unsigned int n )
-    int i;
-    real *a, *b, *c, *d, *v;
-    /* allocate space for the linear system */
-    a = (real*) malloc( n * sizeof(real) );
-    b = (real*) malloc( n * sizeof(real) );
-    c = (real*) malloc( n * sizeof(real) );
-    d = (real*) malloc( n * sizeof(real) );
-    v = (real*) malloc( n * sizeof(real) );
-    /* build the linear system */
-    a[0] = 0;
-    for( i = 1; i < n; ++i )
-        a[i] = h[i-1];
-    b[0] = 2*h[0];
-    for( i = 1; i < n; ++i )
-        b[i] = 2 * (h[i-1] + h[i]); 
-    c[n-1] = 0;
-    for( i = 0; i < n-1; ++i )
-        c[i] = h[i];
-    d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;   
-    d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]);
-    for( i = 1; i < n-1; ++i )
-        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
-    /*//fprintf( stderr, "i  a        b        c        d\n" );
-      for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
-    Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
-    // Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-    for( i = 1; i < n; ++i ){
-        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-        coef[i-1].c = v[i]/2;
-        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-        coef[i-1].a = f[i];
-    }
-    /*//fprintf( stderr, "i  v  coef\n" );
-      for( i = 0; i < n; ++i )
-    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 GLOBAL void complete_cubic_spline_init_a (real *a, const real *h, int n)
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -341,6 +206,7 @@ GLOBAL void complete_cubic_spline_init_a (real *a, const real *h, int n)
 GLOBAL void complete_cubic_spline_init_b (real *b, const real *h, int n)
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -364,6 +230,7 @@ GLOBAL void complete_cubic_spline_init_c (real *c, const real *h, int n )
 GLOBAL void complete_cubic_spline_init_d (real *d, const real *f, const real *h, int v0_r, int vlast_r, int n)
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -383,6 +250,7 @@ GLOBAL void complete_cubic_spline_init_d (real *d, const real *f, const real *h,
         d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
 GLOBAL void calculate_complete_cubic_spline_coef (LR_lookup_table *data, int offset, real *v, const real *h, const real *f, int n)
@@ -409,6 +277,7 @@ GLOBAL void calculate_complete_cubic_spline_coef (LR_lookup_table *data, int off
     coef[i-1].a = f[i];
 void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vlast_r,
         LR_lookup_table *data, int offset, unsigned int n )
@@ -471,206 +340,6 @@ void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vla
-void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
-    int i;
-    real base, dif;
-    i = (int)(r * t->inv_dx);
-    if( i == 0 )  ++i;
-    base = (real)(i+1) * t->dx;
-    dif = r - base;
-    ////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
-    y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + 
-        t->vdW[i].a;
-    y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + 
-            t->CEvd[i].b)*dif + t->CEvd[i].a;
-    //y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b;
-    y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + 
-        t->ele[i].a;
-    y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif +
-        t->CEclmb[i].a;
-    y->H = y->e_ele * EV_to_KCALpMOL / C_ele;
-    //y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a;
-void Make_LR_Lookup_Table( reax_system *system, control_params *control )
-    int i, j, r;
-    int num_atom_types;
-    int existing_types[MAX_ATOM_TYPES];
-    real dr;
-    real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
-    real v0_vdw, v0_ele, vlast_vdw, vlast_ele;
-    /* real rand_dist;
-       real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr;
-       real eele_abserr, eele_relerr, fele_abserr, fele_relerr;
-       real evdw_maxerr, eele_maxerr;
-       LR_data y, y_spline; */
-    /* initializations */
-    vlast_ele = 0;
-    vlast_vdw = 0;
-    v0_ele = 0;
-    v0_vdw = 0;
-    num_atom_types = system->reaxprm.num_atom_types;
-    dr = control->r_cut / control->tabulate;
-    h = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fh = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fele = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-    /* allocate Long-Range LookUp Table space based on 
-       number of atom types in the ffield file */
-    //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
-    //for( i = 0; i < num_atom_types; ++i )
-    // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
-    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
-    /* most atom types in ffield file will not exist in the current
-       simulation. to avoid unnecessary lookup table space, determine
-       the atom types that exist in the current simulation */
-    for( i = 0; i < MAX_ATOM_TYPES; ++i )
-        existing_types[i] = 0;
-    for( i = 0; i < system->N; ++i )
-        existing_types[ system->atoms[i].type ] = 1;
-    /* fill in the lookup table entries for existing atom types.
-       only lower half should be enough. */
-    for( i = 0; i < num_atom_types; ++i )
-        if( existing_types[i] )
-            for( j = i; j < num_atom_types; ++j )
-                if( existing_types[j] ) {
-                    LR[ index_lr (i,j,num_atom_types) ].xmin = 0;
-                    LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut;
-                    LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1;
-                    LR[ index_lr (i,j,num_atom_types) ].dx = dr;
-                    LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
-                    LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data));
-                    LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) 
-                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-                    for( r = 1; r <= control->tabulate; ++r ) {
-                        LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) );
-                        h[r] = LR[ index_lr (i,j,num_atom_types) ].dx;
-                        fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H;
-                        fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW;
-                        fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                        fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele;
-                        fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-                        if( r == 1 ){
-                            v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                            v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-                        }
-                        else if( r == control->tabulate ){
-                            vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-                            vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-                        }
-                    }
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
-                      for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
-                    Natural_Cubic_Spline( &h[1], &fh[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 );
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
-                      for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
-                    //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
-                     */
-                    Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, 
-                            &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
-                    Natural_Cubic_Spline( &h[1], &fCEvd[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
-                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
-                      for( r = 1; r <= control->tabulate; ++r )
-                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
-                    //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
-                     */
-                    Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, 
-                            &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
-                    Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
-                            &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
-                }
-    /***** //test LR-Lookup table
-      evdw_maxerr = 0;
-      eele_maxerr = 0;
-      for( i = 0; i < num_atom_types; ++i )
-      if( existing_types[i] )
-      for( j = i; j < num_atom_types; ++j )
-      if( existing_types[j] ) {
-      for( r = 1; r <= 100; ++r ) {
-      rand_dist = (real)rand()/RAND_MAX * control->r_cut;
-      LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
-      LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
-      evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
-      evdw_relerr = fabs(evdw_abserr / y.e_vdW);
-      fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
-      fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
-      eele_abserr = fabs(y.e_ele - y_spline.e_ele);
-      eele_relerr = fabs(eele_abserr / y.e_ele);
-      fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
-      fele_relerr = fabs(fele_abserr / y.CEclmb);
-      if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
-//fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
-//fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-y.H, y_spline.H, 
-fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );  
-//fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr ); 
-//fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr ); 
-//fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr ); 
-//fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
-y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr ); 
-if( evdw_relerr > evdw_maxerr )
-evdw_maxerr = evdw_relerr;
-if( eele_relerr > eele_maxerr )
-eele_maxerr = eele_relerr;
-//fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
-//fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
-     *******/
 void copy_LR_table_to_device (reax_system *system, control_params *control)
     int i, j, r;
@@ -728,30 +397,6 @@ void copy_LR_table_to_device (reax_system *system, control_params *control)
-// CUDA Functions for Lookup Table
 GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real *fvdw, real *fCEvd, real *fele, real *fCEclmb, 
         global_parameters g_params, two_body_parameters *tbp, 
         control_params *control, int i, 
@@ -760,17 +405,18 @@ GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real
     int r = blockIdx.x * blockDim.x + threadIdx.x;
     if ( r == 0 || r > count ) return;
-    LR_vdW_Coulomb ( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
+    d_LR_vdW_Coulomb( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
-    h[r] = d_LR[ index_lr (i, j, num_atom_types) ].dx;
-    fh[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].H;
-    fvdw[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_vdW;
-    fCEvd[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEvd;
-    fele[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_ele;
-    fCEclmb[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEclmb;
+    h[r] = d_LR[ index_lr(i, j, num_atom_types) ].dx;
+    fh[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].H;
+    fvdw[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].e_vdW;
+    fCEvd[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].CEvd;
+    fele[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].e_ele;
+    fCEclmb[r] = d_LR[ index_lr(i, j, num_atom_types) ].y[r].CEclmb;
-GLOBAL void init_LR_values ( LR_lookup_table *d_LR, control_params *control, real dr, int i, int j, int num_atom_types )
+GLOBAL void init_LR_values( LR_lookup_table *d_LR, control_params *control, real dr, int i, int j, int num_atom_types )
     d_LR[ index_lr (i, j, num_atom_types) ].xmin = 0;
     d_LR[ index_lr (i, j, num_atom_types) ].xmax = control->r_cut;
@@ -779,6 +425,7 @@ GLOBAL void init_LR_values ( LR_lookup_table *d_LR, control_params *control, rea
     d_LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
 void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control )
     int i, j, r;
@@ -907,54 +554,3 @@ void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control )
     cuda_free(fele, RES_LR_LOOKUP_ELE);
     cuda_free(fCEclmb, RES_LR_LOOKUP_CECLMB);
-// CUDA Functions for Lookup Table
-int Lookup_Index_Of( real x, lookup_table* t )
-    return (int)( t->a * ( x - t->xmin ) );
-real Lookup( real x, lookup_table* t )
-    real x1, x2;
-    real b;
-    int i;
-    /* if ( x < t->xmin) 
-       {
-    //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
-    exit(0);
-    }
-    if ( x > t->xmax) 
-    {
-    //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
-    exit(0);
-    } */
-    i = Lookup_Index_Of( x, t );
-    x1 = i * t->dx + t->xmin;
-    x2 = (i+1) * t->dx + t->xmin;
-    b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx;
-    // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
-    //          i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x));
-    return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b;
diff --git a/PuReMD-GPU/src/cuda_lookup.h b/PuReMD-GPU/src/cuda_lookup.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0e05e0c092f1634a1d21fd902f198f714ac2391
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_lookup.h
@@ -0,0 +1,40 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_LOOKUP_H_
+#define __CUDA_LOOKUP_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+void Cuda_Make_LR_Lookup_Table( reax_system*, control_params* );
+void copy_LR_table_to_device ( reax_system*, control_params* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_neighbors.cu b/PuReMD-GPU/src/cuda_neighbors.cu
new file mode 100644
index 0000000000000000000000000000000000000000..876b6b9913e4d825e0cc8be5a2fc1d092c56d9f8
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_neighbors.cu
@@ -0,0 +1,764 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_neighbors.h"
+#include "box.h"
+#include "grid.h"
+#include "list.h"
+#include "neighbors.h"
+#include "reset_utils.h"
+#include "system_props.h"
+#include "vector.h"
+#include "index_utils.h"
+#include "cuda_utils.h"
+#include "cuda_grid.h"
+/* Flat offset of the calling block's entry in an array that stores
+ * `blocksize` elements per block.  The block coordinates are linearised as
+ * (blockIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z)
+ * and then scaled by `blocksize`.
+ * NOTE(review): callers in this file use it with one block per grid cell
+ * (blocksize = 1, g.max_atoms, g.max_nbrs) -- confirm the launch geometry. */
+extern inline DEVICE int index_grid (int blocksize)
+    return blockIdx.x * gridDim.y * gridDim.z * blocksize +  
+        blockIdx.y * gridDim.z * blocksize +  
+        blockIdx.z * blocksize ;
+/* Tests whether two positions are within `cutoff` of each other, using the
+ * closest image along each axis.
+ *
+ * x1, x2  : the two positions (3 components each are read)
+ * box     : only box->box_norms[0..2] is read
+ * cutoff  : distance threshold (compared in squared form)
+ * data    : output; dvec[] and rel_box[] are always written, d is written
+ *           only when the pair is within the cutoff
+ *
+ * Returns 1 if the squared distance is <= cutoff^2, 0 otherwise. */
+DEVICE int d_Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
+        real cutoff, far_neighbor_data *data )
+    real norm_sqr, d, tmp;
+    int i;
+    norm_sqr = 0;
+    for( i = 0; i < 3; i++ ) { 
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+        /* component is at least half a box length: shift it by one box
+         * length towards zero and record the direction of the shift */
+        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
+            if( x2[i] > x1[i] ) { 
+                d -= box->box_norms[i];
+                data->rel_box[i] = -1; 
+            }   
+            else {
+                d += box->box_norms[i];
+                data->rel_box[i] = +1; 
+            }   
+            data->dvec[i] = d;
+            norm_sqr += SQR(d);
+        }   
+        else {
+            /* no shift needed along this axis */
+            data->dvec[i] = d;
+            norm_sqr += tmp;
+            data->rel_box[i] = 0;
+        }   
+    }
+    /* the square root is only taken for pairs that pass the cutoff test */
+    if( norm_sqr <= SQR(cutoff) ){
+        data->d = sqrt(norm_sqr);
+        return 1;
+    }
+    return 0;
+/* Counts, for one atom per thread, how many other atoms lie within
+ * control->vlist_cut, and stores that count scaled by SAFE_ZONE in
+ * indices[atom1].
+ *
+ * The thread picks its atom from the block's cell: slot threadIdx.x of
+ * atoms[] at offset index_grid(g.max_atoms); threads whose slot is past the
+ * cell's atom count (top[index_grid(1)]) exit immediately.
+ * NOTE(review): this implies one block per grid cell with at least
+ * g.max_atoms threads per block -- confirm against the launch site.
+ *
+ * sys_atoms : atom array (only .x is read)
+ * g         : grid; atoms, top, nbrs, nbrs_cp, max_nbrs, max_atoms are read
+ * box       : forwarded to d_Are_Far_Neighbors
+ * control   : only vlist_cut is read
+ * indices   : output, indexed by atom id */
+GLOBAL void k_Estimate_NumNeighbors( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params *control, int *indices )
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs; 
+    rvec *nbrs_cp = g.nbrs_cp;
+    int *nbr_atoms;
+    int atom1, atom2, l, iter, max, m, num_far;
+    far_neighbor_data nbr_data;
+    int x, y, z, i;
+    if (threadIdx.x >= *(top + index_grid(1))){
+        return;
+    } 
+    /* advance to this cell's slice of the neighbor-cell tables */
+    nbrs = nbrs + index_grid (g.max_nbrs);
+    nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
+    atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
+    num_far = 0;
+    iter = 0;
+    /* the neighbor-cell list is terminated by an entry with a negative x */
+    while (nbrs[iter][0] >= 0) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+        //condition check for cutoff here
+        if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
+                SQR (control->vlist_cut)) 
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+            for (m = 0; m < max; m++) {
+                atom2 = nbr_atoms[m];
+                //CHANGE ORIGINAL
+                /*
+                   if (atom1 > atom2) {
+                   if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                   control->vlist_cut, &nbr_data)){
+                   ++num_far;
+                   }
+                   }
+                 */
+                /* every pair with atom2 != atom1 is counted; the atom with
+                 * the larger id is always passed as the first position */
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                                control->vlist_cut, &nbr_data)){
+                        ++num_far;
+                    }
+                }
+                else if (atom1 < atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                                control->vlist_cut, &nbr_data)){
+                        ++num_far;
+                    }
+                }
+                //CHANGE ORIGINAL
+            }
+        }
+        ++iter;
+    }
+    //indices[ atom1 ] = num_far;// * SAFE_ZONE;
+    /* the raw count is scaled by SAFE_ZONE before being stored */
+    indices[ atom1 ] = num_far * SAFE_ZONE;
+/* One thread per atom implementation.
+ *
+ * For the atom whose id equals the global thread index, counts the other
+ * atoms lying within control->vlist_cut and stores that count scaled by
+ * SAFE_ZONE in indices[atom].
+ *
+ * sys_atoms : atom array (only .x is read); entries 0..N-1 are accessed
+ * g         : grid; atoms, top, nbrs, nbrs_cp, inv_len (and ncell under
+ *             __BNVT_FIX__) are read
+ * box       : forwarded to d_Are_Far_Neighbors
+ * control   : only vlist_cut is read
+ * N         : number of atoms; threads with index >= N do nothing
+ * indices   : output, entries 0..N-1 are written */
+GLOBAL void k_New_Estimate_NumNeighbors( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control, int N, int *indices )
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs; 
+    rvec *nbrs_cp = g.nbrs_cp;
+    int     *nbr_atoms;
+    int   atom1, atom2, iter, max, m, num_far;
+    int     x, y, z, i;
+    int atom_x, atom_y, atom_z;
+    far_neighbor_data temp;
+    rvec atom1_x;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    /* valid atom ids are 0..N-1: the thread with index == N must exit too,
+     * otherwise it reads sys_atoms[N] and writes indices[N] */
+    if (index >= N) return;
+    /* cell coordinates of the atom: position scaled by the inverse cell length */
+    atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
+    atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
+    atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
+#ifdef __BNVT_FIX__
+    /* clamp to the last cell along each axis */
+    if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+    if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+    if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+    /* advance to the neighbor-cell tables of the atom's own cell */
+    nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    atom1 = index;
+    /* keep a local copy of the position; it is reused for every pair test */
+    rvec_Copy (atom1_x, sys_atoms [atom1].x );
+    num_far = 0;
+    iter = 0;
+    /* the neighbor-cell list is terminated by an entry with a negative x */
+    while (nbrs[iter][0] >= 0) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+        if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
+                SQR (control->vlist_cut)) 
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+            for (m = 0; m < max; m++) 
+            {
+                atom2 = nbr_atoms[m];
+                /* every pair with atom2 != atom1 is counted; the atom with
+                 * the larger id is always passed as the first position */
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
+                                control->vlist_cut, &temp)){
+                        num_far++;
+                    }
+                }
+                else if (atom1 < atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
+                                control->vlist_cut, &temp)){
+                        num_far ++;
+                    }
+                }
+            }
+        }
+        ++iter;
+    }
+    /* the raw count is scaled by SAFE_ZONE before being stored */
+    indices [atom1] = num_far * SAFE_ZONE;
+/*One thread per entry in the gcell implementation */
+/* Fills the far-neighbor list of one atom per thread.
+ *
+ * The thread picks its atom from the block's cell exactly as
+ * k_Estimate_NumNeighbors does (slot threadIdx.x of the cell addressed by
+ * index_grid); threads past the cell's atom count exit immediately.
+ * Records are written to far_nbrs.select.far_nbr_list starting at
+ * Start_Index(atom1), and the position after the last record is stored with
+ * Set_End_Index.
+ * NOTE(review): no check against the capacity reserved for this atom is
+ * visible in this kernel -- presumably the host validates afterwards.
+ *
+ * sys_atoms : atom array (only .x is read)
+ * g         : grid; atoms, top, nbrs, nbrs_cp, max_nbrs, max_atoms are read
+ * box       : forwarded to d_Are_Far_Neighbors
+ * control   : only vlist_cut is read
+ * far_nbrs  : list receiving the neighbor records and the end index */
+GLOBAL void k_Generate_Neighbor_Lists ( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control, 
+        list far_nbrs )
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs; 
+    rvec *nbrs_cp = g.nbrs_cp;
+    int     *nbr_atoms;
+    int   atom1, atom2, l, iter, max, m, num_far;
+    int     x, y, z, i;
+    far_neighbor_data *nbr_data;
+    far_neighbor_data temp;
+    if (threadIdx.x >= *(top + index_grid(1))){
+        return;
+    } 
+    /* advance to this cell's slice of the neighbor-cell tables */
+    nbrs = nbrs + index_grid (g.max_nbrs);
+    nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
+    atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
+    /* num_far is the absolute write position inside far_nbr_list */
+    num_far = Start_Index (atom1, &far_nbrs);
+    //Set_Start_Index (atom1, 0, &far_nbrs);
+    //num_far =  0;
+    iter = 0;
+    /* the neighbor-cell list is terminated by an entry with a negative x */
+    while (nbrs[iter][0] >= 0) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+        //condition check for cutoff here
+        if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
+                SQR (control->vlist_cut)) 
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+            for (m = 0; m < max; m++) {
+                atom2 = nbr_atoms[m];
+                //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] );
+                //CHANGE ORIGINAL
+                /*
+                   if (atom1 > atom2) {
+                   if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                   control->vlist_cut, &temp)){
+                   nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                   nbr_data->nbr = atom2;
+                   nbr_data->rel_box[0] = temp.rel_box[0];
+                   nbr_data->rel_box[1] = temp.rel_box[1];
+                   nbr_data->rel_box[2] = temp.rel_box[2];
+                   nbr_data->d = temp.d;
+                   nbr_data->dvec[0] = temp.dvec[0];
+                   nbr_data->dvec[1] = temp.dvec[1];
+                   nbr_data->dvec[2] = temp.dvec[2];
+                   ++num_far;
+                   }
+                   }
+                 */
+                /* both orderings are stored, so each pair ends up in the
+                 * lists of both atoms; the atom with the larger id is always
+                 * passed as the first position to d_Are_Far_Neighbors */
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                                control->vlist_cut, &temp)){
+                        /* copy the scratch result into the next free slot */
+                        nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        ++num_far;
+                    }
+                }
+                else if (atom1 < atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                                control->vlist_cut, &temp)){
+                        nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        ++num_far;
+                    }
+                }
+                //CHANGE ORIGINAL
+            }
+        }
+        ++iter;
+    }
+    //end the far_neighbor list here
+    Set_End_Index (atom1, num_far, &far_nbrs);
+/* One thread per atom implementation.
+ *
+ * Fills the far-neighbor list of the atom whose id equals the global thread
+ * index.  Records are written to far_nbrs.select.far_nbr_list starting at
+ * Start_Index(atom), and the position after the last record is stored with
+ * Set_End_Index.
+ * NOTE(review): no check against the capacity reserved for this atom is
+ * visible in this kernel -- presumably the host validates afterwards.
+ *
+ * sys_atoms : atom array (only .x is read); entries 0..N-1 are accessed
+ * g         : grid; atoms, top, nbrs, nbrs_cp, inv_len (and ncell under
+ *             __BNVT_FIX__) are read
+ * box       : forwarded to d_Are_Far_Neighbors
+ * control   : only vlist_cut is read
+ * far_nbrs  : list receiving the neighbor records and the end index
+ * N         : number of atoms; threads with index >= N do nothing */
+GLOBAL void k_New_Generate_Neighbor_Lists( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control, 
+        list far_nbrs, int N )
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs; 
+    rvec *nbrs_cp = g.nbrs_cp;
+    int     *nbr_atoms;
+    int   atom1, atom2, l, iter, max, m, num_far;
+    int     x, y, z, i;
+    far_neighbor_data *nbr_data, *my_start;
+    far_neighbor_data temp;
+    int atom_x, atom_y, atom_z;
+    rvec atom1_x;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    /* valid atom ids are 0..N-1: the thread with index == N must exit too,
+     * otherwise it reads sys_atoms[N] and touches the list indices of a
+     * non-existent atom */
+    if (index >= N) return;
+    /* cell coordinates of the atom: position scaled by the inverse cell length */
+    atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
+    atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
+    atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
+#ifdef __BNVT_FIX__
+    /* clamp to the last cell along each axis */
+    if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+    if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+    if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+    /* advance to the neighbor-cell tables of the atom's own cell */
+    nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    atom1 = index;
+    /* keep a local copy of the position; it is reused for every pair test */
+    rvec_Copy (atom1_x, sys_atoms [atom1].x );
+    /* num_far is the absolute position in far_nbr_list; my_start is the
+     * matching write cursor and is advanced together with it */
+    num_far = Start_Index (atom1, &far_nbrs);
+    my_start = & (far_nbrs.select.far_nbr_list [num_far] );
+    //Set_Start_Index (atom1, 0, &far_nbrs);
+    //num_far =  0;
+    iter = 0;
+    /* the neighbor-cell list is terminated by an entry with a negative x */
+    while (nbrs[iter][0] >= 0) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+        //condition check for cutoff here
+        //if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
+        if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
+                SQR (control->vlist_cut)) 
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+            for (m = 0; m < max; m++) 
+            {
+                atom2 = nbr_atoms[m];
+                /* both orderings are stored, so each pair ends up in the
+                 * lists of both atoms; the atom with the larger id is always
+                 * passed as the first position to d_Are_Far_Neighbors */
+                if (atom1 > atom2) {
+                    if (d_Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
+                                control->vlist_cut, &temp)){
+                        //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        /* copy the scratch result into the next free slot */
+                        nbr_data = my_start;
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        num_far++;
+                        my_start ++;
+                    }
+                }
+                else if (atom1 < atom2) {
+                    if (d_Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
+                                control->vlist_cut, &temp)){
+                        //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        nbr_data = my_start;
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+                        num_far ++;
+                        my_start ++;
+                    }
+                }
+                //CHANGE ORIGINAL
+            }
+        }
+        ++iter;
+    }
+    //end the far_neighbor list here
+    Set_End_Index (atom1, num_far, &far_nbrs);
+/*Multiple threads per atom Implementation */
+/* Fills the far-neighbor list of one atom with a group of
+ * __THREADS_PER_ATOM__ consecutive threads.
+ *
+ * Thread layout: atom id = global thread id / __THREADS_PER_ATOM__,
+ * lane = global thread id & (__THREADS_PER_ATOM__ - 1), and my_bucket is the
+ * group's index inside the block.
+ * NOTE(review): the mask form of lane_id assumes __THREADS_PER_ATOM__ is a
+ * power of two -- confirm.
+ *
+ * Dynamic shared memory (ints): tnbr[] is indexed by threadIdx.x and
+ * nbrssofar[] starts at offset blockDim.x and is indexed by my_bucket, so the
+ * launch must supply at least
+ * blockDim.x + ceil(blockDim.x / __THREADS_PER_ATOM__) ints.
+ *
+ * sys_atoms : atom array (only .x is read)
+ * g         : grid; atoms, top, nbrs, nbrs_cp, inv_len (and ncell under
+ *             __BNVT_FIX__) are read
+ * box       : forwarded to d_Are_Far_Neighbors
+ * control   : only vlist_cut is read
+ * far_nbrs  : list receiving the neighbor records and the end index
+ * N         : number of atoms; groups with atom id >= N exit */
+GLOBAL void Test_Generate_Neighbor_Lists( reax_atom *sys_atoms,
+        grid g, simulation_box *box, control_params* control, 
+        list far_nbrs, int N )
+    extern __shared__ int __nbr[];
+    /* __sofar is declared but not referenced in the visible body */
+    extern __shared__ int __sofar [];
+    int nbrgen;
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id = thread_id / __THREADS_PER_ATOM__;
+    int lane_id = thread_id & (__THREADS_PER_ATOM__ -1);
+    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
+    /* NOTE(review): threads leaving here never reach the __syncthreads()
+     * calls below -- confirm this is safe for a partially filled last block */
+    if (warp_id >= N ) return;
+    int *tnbr = __nbr;
+    //int *nbrssofar = __nbr + __THREADS_PER_ATOM__;
+    int *nbrssofar = __nbr + blockDim.x;
+    int *atoms = g.atoms;
+    int *top = g.top;
+    ivec *nbrs = g.nbrs; 
+    rvec *nbrs_cp = g.nbrs_cp;
+    int     *nbr_atoms;
+    int   atom1, atom2, l, iter, max, m, num_far;
+    int leader = -10;
+    int     x, y, z, i;
+    far_neighbor_data *nbr_data, *my_start;
+    far_neighbor_data temp;
+    int atom_x, atom_y, atom_z;
+    atom1 = warp_id;
+    /* cell coordinates of the atom: position scaled by the inverse cell length */
+    atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]);
+    atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]);
+    atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]);
+#ifdef __BNVT_FIX__
+    if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+    if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+    if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+    nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+    /* num_far stays at the atom's start index for the whole kernel; the
+     * running count of stored records lives in nbrssofar[my_bucket] */
+    num_far = Start_Index (atom1, &far_nbrs);
+    my_start = & (far_nbrs.select.far_nbr_list [num_far] );
+    iter = 0;
+    tnbr[threadIdx.x] = 0;
+    if (lane_id == 0) {
+        //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0;
+        nbrssofar [my_bucket] = 0;
+    }
+    __syncthreads ();
+    /* the neighbor-cell list is terminated by an entry with a negative x */
+    while ((nbrs[iter][0] >= 0)) {
+        x = nbrs[iter][0];
+        y = nbrs[iter][1];
+        z = nbrs[iter][2];
+        tnbr[threadIdx.x] = 0;
+        nbrgen = FALSE;
+        if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= 
+                SQR (control->vlist_cut)) 
+        {
+            nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+            max = top [index_grid_3d(x, y, z, &g)];
+            tnbr[threadIdx.x] = 0;
+            nbrgen = FALSE;
+            m = lane_id ; //0-31
+            /* ceil(max / __THREADS_PER_ATOM__): every lane runs the same
+             * number of rounds even when it has no candidate (m >= max) */
+            int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1);
+            int iterations = 0;
+            //while (m < max)
+            while (iterations < loopcount)
+            {
+                tnbr [threadIdx.x] = 0;
+                nbrgen = FALSE;
+                if (m < max) {
+                    atom2 = nbr_atoms[m];
+                    /* the atom with the larger id is always passed first */
+                    if (atom1 > atom2) {
+                        if (d_Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                                    control->vlist_cut, &temp))
+                        {
+                            tnbr [threadIdx.x] = 1;
+                            nbrgen = TRUE;
+                        }
+                    }
+                    else if (atom1 < atom2) {
+                        if (d_Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                                    control->vlist_cut, &temp)){
+                            tnbr [threadIdx.x] = 1;
+                            nbrgen = TRUE;
+                        }
+                    }
+                }
+                /* NOTE(review): the steps below read tnbr[] entries written
+                 * by other lanes with no barrier in between; this presumably
+                 * relies on the lanes of a group running in lockstep --
+                 * confirm for the target architecture */
+                if (nbrgen)
+                {
+                    //do leader selection here
+                    /* leader = first lane of the group that found a neighbor */
+                    leader = -1;
+                    //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
+                    for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
+                        if (tnbr[l]){
+                            leader = l;
+                            break;
+                        }
+                    //do the reduction;
+                    /* the leader turns the group's 0/1 flags into an
+                     * inclusive running sum, in place */
+                    if (threadIdx.x == leader) 
+                        for (l = 1; l < __THREADS_PER_ATOM__; l++)
+                            //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)];    
+                            tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)];    
+                }
+                //__syncthreads ();
+                //MYATOMICADD( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1);
+                //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ;
+                if (nbrgen)
+                {
+                    //got the indices
+                    //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1;
+                    /* slot = records stored in earlier rounds + this lane's
+                     * running-sum value - 1 */
+                    nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1;
+                    nbr_data->nbr = atom2;
+                    nbr_data->rel_box[0] = temp.rel_box[0];
+                    nbr_data->rel_box[1] = temp.rel_box[1];
+                    nbr_data->rel_box[2] = temp.rel_box[2];
+                    nbr_data->d = temp.d;
+                    nbr_data->dvec[0] = temp.dvec[0];
+                    nbr_data->dvec[1] = temp.dvec[1];
+                    nbr_data->dvec[2] = temp.dvec[2];
+                    /* the leader adds this round's total (last entry of the
+                     * running sum) to the group's stored-record count */
+                    if (threadIdx.x == leader)
+                        //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
+                        nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
+                }
+                m += __THREADS_PER_ATOM__;
+                iterations ++;
+                //cleanup
+                nbrgen = FALSE;
+                tnbr [threadIdx.x] = 0;
+            }
+        }
+        ++iter;
+    }
+    __syncthreads ();
+    //end the far_neighbor list here
+    /* one lane per group publishes start index + number of stored records */
+    if (lane_id == 0)
+        Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs);
+    //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs);
+/* Host driver for far-neighbor list generation on the device.
+ *
+ * Steps visible here:
+ *   1. bin the atoms into grid cells (Cuda_Bin_Atoms / Cuda_Bin_Atoms_Sync);
+ *   2. if dev_workspace->realloc.estimate_nbrs > -1, re-estimate the
+ *      per-atom neighbor counts and rebuild the start/end indices of the
+ *      far-neighbor list from their prefix sums;
+ *   3. launch Test_Generate_Neighbor_Lists to fill the list;
+ *   4. copy the start/end indices back and check how full each atom's
+ *      range is, requesting a reallocation or aborting when needed.
+ *
+ * The elapsed time is added to d_timing.nbrs.
+ * The `estimate` argument is not referenced in the visible body.
+ * NOTE(review): the #endif lines matching the #ifdef directives below are
+ * not visible in this patch text, so which statements are conditional
+ * cannot be determined from here. */
+void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, int estimate)
+    real t_start, t_elapsed;
+    real t_1, t_2;
+    list *far_nbrs = dev_lists + FAR_NBRS;
+    /* the per-atom estimates are written into the shared scratch buffer */
+    int *d_indices = (int *) scratch;
+    int *nbrs_start, *nbrs_end;
+    int i, max_nbrs = 0;
+    int nbs;
+    t_start = Get_Time (); 
+    Cuda_Bin_Atoms (system, workspace);
+    Cuda_Bin_Atoms_Sync ( system );
+    if (dev_workspace->realloc.estimate_nbrs > -1) {
+        /*reset the re-neighbor condition */
+        dev_workspace->realloc.estimate_nbrs = -1;
+        //#ifdef __DEBUG_CUDA__
+        fprintf (stderr, "Recomputing the neighbors estimate.... \n");
+        //#endif
+        cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
+        /*
+           dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
+           dim3 threadsperblock (system->g.max_atoms);
+           k_Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
+           (system->d_atoms, system->d_g, system->d_box, 
+           (control_params *)control->d_control, d_indices);
+           cudaThreadSynchronize ();
+           cudaCheckError ();
+         */
+        /* ceil(N / NBRS_BLOCK_SIZE) blocks: one thread per atom */
+        nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+        k_New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> 
+            (     system->d_atoms, system->d_g,
+                system->d_box, (control_params *)control->d_control,
+                system->N, d_indices);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        int *nbrs_indices = NULL;
+        nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
+        if (nbrs_indices == NULL) 
+        {
+            fprintf (stderr, "Malloc failed for nbrs indices .... \n");
+            exit (1);
+        }
+        memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); 
+        /* estimates land in slots 1..N, slot 0 stays 0, so the running sum
+         * below leaves in slot i the total of the estimates of atoms 0..i-1 */
+        copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
+        for (int i = 1; i <= system->N; i++) 
+            nbrs_indices [i] += nbrs_indices [i-1];
+        /* the first N sums become both the start and the end index of every
+         * atom, i.e. every list starts out empty */
+        copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ );
+        copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ );
+        free (nbrs_indices);
+    }
+    /*
+       One thread per atom Implementation
+       Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
+       (system->d_atoms, system->d_g, system->d_box, 
+       (control_params *)control->d_control, *far_nbrs);
+     */
+    /* ceil(N * NBRS_THREADS_PER_ATOM / NBRS_BLOCK_SIZE) blocks.
+     * NOTE(review): the kernel itself divides by __THREADS_PER_ATOM__ --
+     * confirm both macros have the same value. */
+    nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + 
+        (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+    /* Multiple threads per atom Implementation */
+    /* NOTE(review): the third launch parameter (dynamic shared memory size)
+     * and the closing of the launch configuration are not visible in this
+     * patch text -- verify against the actual file. */
+    Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, 
+                         (system->d_atoms, system->d_g, system->d_box, 
+                          (control_params *)control->d_control, *far_nbrs, system->N );
+    cudaThreadSynchronize (); 
+    cudaCheckError (); 
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.nbrs += t_elapsed;
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed);
+    /*validate neighbors list*/
+    nbrs_start = (int *) calloc (system->N, INT_SIZE);
+    nbrs_end = (int *) calloc (system->N, INT_SIZE);
+    copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
+    /* device_nbrs: total stored neighbors; max_nbrs: longest single list */
+    int device_nbrs = 0;
+    for(i = 0; i < system->N; i++)
+    {
+        if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs)
+            max_nbrs = nbrs_end[i] - nbrs_start[i];
+        device_nbrs += nbrs_end[i] - nbrs_start[i]; 
+    }
+#ifdef __CUDA_TEST__
+    //fprintf (stderr, " New Device count is : %d \n", device_nbrs);
+    //dev_workspace->realloc.num_far = device_nbrs;
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs );
+    fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs);
+    //validate check here
+    //get the num_far from the list here
+    /* atoms 0..N-2: compare each list length with the room up to the next
+     * atom's start index */
+    for (i = 0; i < system->N-1; i++)
+    {
+        /* used more than the DANGER_ZONE fraction of the reserved room:
+         * record the current total so a reallocation can be triggered */
+        if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE )
+        {
+            dev_workspace->realloc.num_far = device_nbrs;
+            //#ifdef __CUDA_MEM__
+            //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
+            //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", 
+            //                            i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]);
+            //#endif
+        }
+        /* the list ran into the next atom's range: abort */
+        if (nbrs_end[i] > nbrs_start[i+1]) {
+            fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d",
+                    nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]);
+            exit( INSUFFICIENT_SPACE );
+        }
+    }
+    /* after the loop i is the index of the last atom (for N >= 1); its room
+     * is bounded by the list capacity far_nbrs->num_intrs instead */
+    if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) {
+        dev_workspace->realloc.num_far = device_nbrs;
+        //#ifdef __CUDA_MEM__
+        //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
+        //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n"
+        //                    , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs);
+        //#endif
+    }
+    if (nbrs_end[i] > far_nbrs->num_intrs) {
+        fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d",
+                nbrs_end[i], far_nbrs->num_intrs );
+        exit( INSUFFICIENT_SPACE );
+    }
+    free (nbrs_start);
+    free (nbrs_end);
diff --git a/PuReMD-GPU/src/cuda_neighbors.h b/PuReMD-GPU/src/cuda_neighbors.h
new file mode 100644
index 0000000000000000000000000000000000000000..13656f62f12b53509fad82f535e974e45cc45805
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_neighbors.h
@@ -0,0 +1,44 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_NEIGHBORS_H_
+#define __CUDA_NEIGHBORS_H_
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void k_Estimate_NumNeighbors( reax_atom *, grid, simulation_box *,
+        control_params *, int * );
+void Cuda_Generate_Neighbor_Lists (reax_system *system,
+        static_storage *workspace, control_params *control, int);
+DEVICE int d_Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_post_evolve.cu b/PuReMD-GPU/src/cuda_post_evolve.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f5dbad825f2ac9f6e120e6cf7147258c89868c71
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_post_evolve.cu
@@ -0,0 +1,148 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_post_evolve.h"
+#include "vector.h"
+#include "cuda_utils.h"
+#include "cuda_copy.h"
+#include "cuda_system_props.h"
+void Cuda_Setup_Evolve( reax_system* system, control_params* control, // Copies data->step from the host into the step field of the simulation_data referenced by data->d_simulation_data.
+        simulation_data* data, static_storage* workspace, // only data is read in this body; the remaining parameters are unused here
+        list** lists, output_controls *out_control )
+    //fprintf (stderr, "Begin ... \n");
+    // Sync only the step counter to the device; the whole-struct sync on the next line is disabled.
+    //Sync_Host_Device_Data( &data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice );
+    copy_host_device( &data->step, &((simulation_data *)data->d_simulation_data)->step, // destination is the address of step inside the d_simulation_data struct; taking the address does not read through that pointer on the host
+            INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA ); // INT_SIZE bytes, host -> device
+void Cuda_Setup_Output( reax_system* system, simulation_data* data ) // Thin wrapper that forwards both arguments to Prep_Device_For_Output.
+    // Sync the simulation data here because it has been changed.
+    Prep_Device_For_Output( system, data ); // NOTE(review): which fields are synced, and in which direction, is decided inside this helper -- not visible here
+void Cuda_Sync_Temp( control_params* control ) // Requests a host-to-device sync of the control parameters into the copy referenced by control->d_control.
+    Sync_Host_Device_Params( control, (control_params*)control->d_control, cudaMemcpyHostToDevice ); // direction is host -> device; going by the name, presumably called after temperature-related control fields change -- TODO confirm
+GLOBAL void Update_Atoms_Post_Evolve (reax_atom *atoms, simulation_data *data, int N) // Per-atom velocity correction: each thread with i < N updates atoms[i].v using data->vcm, data->xcm and data->avcm. Both pointers are dereferenced inside this function.
+    rvec diff, cross;
+    int i = blockIdx.x * blockDim.x + threadIdx.x; // flat 1D global thread index, used as the atom index
+    if (i >= N) return; // threads past the last atom do nothing
+    //for( i = 0; i < system->N; i++ ) {   (loop form, superseded by the per-thread index above)
+    // remove translational velocity: presumably v += -1 * vcm -- confirm rvec_ScaledAdd semantics
+    rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); 
+    // remove rotational velocity: presumably diff = x - xcm, cross = avcm x diff, then v += -1 * cross -- confirm helper semantics
+    rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm );
+    rvec_Cross( cross, data->avcm, diff );
+    rvec_ScaledAdd( atoms[i].v, -1., cross );
+    //}
+void Cuda_Post_Evolve( reax_system* system, control_params* control, // Calls Cuda_Compute_Kinetic_Energy, then (when the condition below holds) recomputes the center of mass and launches Update_Atoms_Post_Evolve on the device atoms.
+        simulation_data* data, static_storage* workspace, // workspace and lists are not used in this body
+        list** lists, output_controls *out_control ) // out_control is only read for out_control->prs
+    int i; // NOTE(review): i, diff and cross are not used anywhere in this body
+    rvec diff, cross;
+    /* compute kinetic energy of the system */
+    /*
+       real *results = (real *) scratch;
+       cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
+       Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+       (system->reaxprm.d_sbp, system->d_atoms, system->N, 
+       (simulation_data *)data->d_simulation_data, (real *) results);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
+     */
+    //fprintf (stderr, "Cuda_Post_Evolve: Begin\n");
+    Cuda_Compute_Kinetic_Energy( system, data );
+    //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n");
+    /* remove rotational and translational velocity of the center of mass */
+    if( control->ensemble != NVE && 
+            control->remove_CoM_vel && // the non-zero check also keeps the modulo below from dividing by zero
+            data->step && data->step % control->remove_CoM_vel == 0 ) { // skips step 0; true on every step that is a multiple of remove_CoM_vel
+        /*
+           rvec t_xcm, t_vcm, t_avcm;
+           rvec_MakeZero (t_xcm);
+           rvec_MakeZero (t_vcm);
+           rvec_MakeZero (t_avcm);
+           rvec_Copy (t_xcm, data->xcm);
+           rvec_Copy (t_vcm, data->vcm);
+           rvec_Copy (t_avcm, data->avcm);
+         */
+        /* compute velocity of the center of mass */
+        Cuda_Compute_Center_of_Mass( system, data, out_control->prs );
+        //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... \n");
+        /*
+           fprintf (stderr, "center of mass done on the device \n");
+           fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm );
+           fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm );
+           fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm );
+           if (check_zero (t_xcm, data->xcm) || 
+           check_zero (t_vcm, data->vcm) ||
+           check_zero (t_avcm, data->avcm)){
+           fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n");
+           exit (0);
+           }
+         */
+        // copy the host vcm, xcm and avcm (RVEC_SIZE bytes each) into the d_simulation_data struct that is passed to the kernel below
+        copy_host_device( data->vcm,
+            ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+        copy_host_device( data->xcm,
+            ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+        copy_host_device( data->avcm,
+            ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+        //fprintf (stderr, "data copied.... \n");
+        Update_Atoms_Post_Evolve<<< BLOCKS, BLOCK_SIZE >>>
+            (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N); // NOTE(review): BLOCKS is defined elsewhere; confirm BLOCKS * BLOCK_SIZE >= system->N so that every atom gets a thread
+        cudaThreadSynchronize( ); // NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime API; cudaDeviceSynchronize is its replacement
+        cudaCheckError( );
+        //fprintf (stderr, " Cuda_Post_Evolve:End \n");
+    }
diff --git a/PuReMD-GPU/src/cuda_post_evolve.h b/PuReMD-GPU/src/cuda_post_evolve.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d8fdc270edd28e0f387d53d9b9837c3a4879542
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_post_evolve.h
@@ -0,0 +1,48 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_POST_EVOLVE_H__ /* include guard; NOTE(review): identifiers containing a double underscore are reserved for the implementation */
+#define __CUDA_POST_EVOLVE_H__
+#include "mytypes.h"
+#ifdef __cplusplus /* give the declarations below C linkage when compiled as C++ */
+extern "C"  {
+void Cuda_Setup_Evolve( reax_system *, control_params *, /* syncs the host step counter to the device simulation data */
+        simulation_data *, static_storage *, 
+        list **, output_controls * );
+void Cuda_Setup_Output( reax_system *, simulation_data * ); /* forwards to Prep_Device_For_Output */
+void Cuda_Sync_Temp( control_params * ); /* host -> device sync of the control parameters */
+void Cuda_Post_Evolve( reax_system *, control_params *, /* kinetic energy plus optional removal of center-of-mass motion */
+        simulation_data *, static_storage *, 
+        list **, output_controls * );
+#ifdef __cplusplus /* presumably guards the closing brace of the C-linkage block, which is not visible in this hunk */
diff --git a/PuReMD-GPU/src/reduction.cu b/PuReMD-GPU/src/cuda_reduction.cu
similarity index 97%
rename from PuReMD-GPU/src/reduction.cu
rename to PuReMD-GPU/src/cuda_reduction.cu
index 48fb5efc473adda2540a3dac4ffb2664ad9ca2a9..e22f9ad8474830ebdbe11089cdfcc986ff8dd8f3 100644
--- a/PuReMD-GPU/src/reduction.cu
+++ b/PuReMD-GPU/src/cuda_reduction.cu
@@ -18,9 +18,9 @@
-#include "reduction.h"
+#include "cuda_reduction.h"
 #include "vector.h"
-#include "mytypes.h"
 GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const size_t n)
@@ -52,6 +52,7 @@ GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const siz
 GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass)
     extern __shared__ real sdata[];
@@ -87,6 +88,7 @@ GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t
 GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n )
     extern __shared__ real sdata[];
@@ -116,6 +118,7 @@ GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, con
 GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results, const size_t n)
     extern __shared__ real sdata[];
@@ -146,11 +149,7 @@ GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results
-GLOBAL void Cuda_reduction(const int *input, int *per_block_results, const size_t n)
+GLOBAL void Cuda_reduction_int(const int *input, int *per_block_results, const size_t n)
     extern __shared__ int sh_input[];
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -213,10 +212,10 @@ GLOBAL void Cuda_reduction_rvec (rvec *input, rvec *results, size_t n)
 //vector functions
 GLOBAL void Cuda_Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) 
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -225,6 +224,7 @@ GLOBAL void Cuda_Vector_Sum( real* dest, real c, real* v, real d, real* y, int k
     dest[i] = c * v[i] + d * y[i];
 GLOBAL void Cuda_Vector_Scale( real* dest, real c, real* v, int k ) 
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -233,6 +233,7 @@ GLOBAL void Cuda_Vector_Scale( real* dest, real c, real* v, int k )
     dest[i] = c * v[i];
 GLOBAL void Cuda_Vector_Add( real* dest, real c, real* v, int k )
     int i = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/PuReMD-GPU/src/reduction.h b/PuReMD-GPU/src/cuda_reduction.h
similarity index 66%
rename from PuReMD-GPU/src/reduction.h
rename to PuReMD-GPU/src/cuda_reduction.h
index fefbe4c2de5e7674c2d53c73c2aba698b7c93f2a..5b9baf1df69f2a4ab2bdab0d3bff26e390634951 100644
--- a/PuReMD-GPU/src/reduction.h
+++ b/PuReMD-GPU/src/cuda_reduction.h
@@ -18,22 +18,32 @@
-#ifndef __REDUCTION_H__
-#define __REDUCTION_H__
+#ifndef __CUDA_REDUCTION_H__
+#define __CUDA_REDUCTION_H__
 #include "mytypes.h"
 #define INITIAL 0
 #define FINAL       1
-GLOBAL void Cuda_reduction (const real *input, real *per_block_results, const size_t n);
-GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass);
-GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n);
-GLOBAL void Cuda_reduction (const int *input, int *per_block_results, const size_t n);
-GLOBAL void Cuda_reduction_rvec (rvec *, rvec *, size_t n);
+#ifdef __cplusplus /* give the declarations below C linkage when compiled as C++ */
+extern "C"  {
+GLOBAL void Cuda_reduction( const real *input, real *per_block_results, const size_t n ); /* real-valued reduction writing per-block results */
+GLOBAL void Cuda_Norm( const real *input, real *per_block_results, const size_t n, int pass ); /* pass is presumably INITIAL or FINAL as defined in this header -- confirm against the definition */
+GLOBAL void Cuda_Dot( const real *a, const real *b, real *per_block_results, const size_t n );
+GLOBAL void Cuda_reduction_int( const int *input, int *per_block_results, const size_t n ); /* replaces the former int overload of Cuda_reduction; functions with C linkage cannot be overloaded, so the int variant needs its own name */
+GLOBAL void Cuda_reduction_rvec( rvec *, rvec *, size_t n );
 GLOBAL void Cuda_Vector_Sum( real* , real , real* , real , real* , int ) ;
 GLOBAL void Cuda_Vector_Scale( real* , real , real* , int ) ;
 GLOBAL void Cuda_Vector_Add( real* , real , real* , int );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/reset_utils.cu b/PuReMD-GPU/src/cuda_reset_utils.cu
similarity index 55%
rename from PuReMD-GPU/src/reset_utils.cu
rename to PuReMD-GPU/src/cuda_reset_utils.cu
index 0c6f852bdf295ec9ad48af63f9195737adc387c0..d18d9f749a9a57a269f731aecae7754bdb8925a4 100644
--- a/PuReMD-GPU/src/reset_utils.cu
+++ b/PuReMD-GPU/src/cuda_reset_utils.cu
@@ -18,13 +18,16 @@
-#include "reset_utils.h"
+#include "cuda_reset_utils.h"
 #include "list.h"
+#include "reset_utils.h"
 #include "vector.h"
 #include "cuda_utils.h"
 #include "cuda_copy.h"
 GLOBAL void Reset_Atoms (reax_atom *atoms, int N)
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -35,6 +38,7 @@ GLOBAL void Reset_Atoms (reax_atom *atoms, int N)
     atoms[i].f[2] = 0.0;
 void Cuda_Reset_Atoms (reax_system *system )
     Reset_Atoms <<<BLOCKS, BLOCK_SIZE>>>
@@ -43,42 +47,6 @@ void Cuda_Reset_Atoms (reax_system *system )
     cudaCheckError ();
-void Reset_Atoms( reax_system* system )
-    int i;
-    for( i = 0; i < system->N; ++i )
-        memset( system->atoms[i].f, 0.0, RVEC_SIZE );
-void Reset_Pressures( simulation_data *data )
-    rtensor_MakeZero( data->flex_bar.P );  
-    data->iso_bar.P = 0;
-    rvec_MakeZero( data->int_press );
-    rvec_MakeZero( data->ext_press );
-    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", 
-       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
-void Reset_Simulation_Data( simulation_data* data )
-    data->E_BE = 0;
-    data->E_Ov = 0;
-    data->E_Un = 0;
-    data->E_Lp = 0;
-    data->E_Ang = 0;
-    data->E_Pen = 0;
-    data->E_Coa = 0;
-    data->E_HB = 0;
-    data->E_Tor = 0;
-    data->E_Con = 0;
-    data->E_vdW = 0;
-    data->E_Ele = 0;
-    data->E_Kin = 0;
 void Cuda_Sync_Simulation_Data (simulation_data *data)
@@ -93,40 +61,6 @@ void Cuda_Sync_Simulation_Data (simulation_data *data)
-void Reset_Test_Forces( reax_system *system, static_storage *workspace )
-    memset( workspace->f_ele, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_vdw, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_bo, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_be, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_lp, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_ov, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_un, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_ang, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_coa, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_pen, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_hb, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_tor, 0, system->N * sizeof(rvec) );
-    memset( workspace->f_con, 0, system->N * sizeof(rvec) );
-void Reset_Workspace( reax_system *system, static_storage *workspace )
-    memset( workspace->total_bond_order, 0, system->N * sizeof( real ) );
-    memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) );
-    memset( workspace->CdDelta, 0, system->N * sizeof( real ) );
-    //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) );
-    memset( workspace->dDelta, 0, sizeof(rvec) * system->N );
-    Reset_Test_Forces( system, workspace );
 void Cuda_Reset_Workspace( reax_system *system, static_storage *workspace )
     cuda_memset( workspace->total_bond_order, 0, system->N * REAL_SIZE, RES_STORAGE_TOTAL_BOND_ORDER );
@@ -157,6 +91,7 @@ GLOBAL void Reset_Neighbor_Lists (single_body_parameters *sbp, reax_atom *atoms,
 void Cuda_Reset_Neighbor_Lists (reax_system *system, control_params *control, 
         static_storage *workspace, list **lists ) 
@@ -172,6 +107,7 @@ void Cuda_Reset_Neighbor_Lists (reax_system *system, control_params *control,
     cuda_memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs, LIST_BOND_DATA );
 GLOBAL void Reset_Far_Neighbors_List (list far_nbrs, int N)
     int tmp;
@@ -183,6 +119,7 @@ GLOBAL void Reset_Far_Neighbors_List (list far_nbrs, int N)
     Set_End_Index (index, tmp, &far_nbrs);
 void Cuda_Reset_Far_Neighbors_List ( reax_system *system )
     Reset_Far_Neighbors_List <<<BLOCKS, BLOCK_SIZE>>>
@@ -191,52 +128,6 @@ void Cuda_Reset_Far_Neighbors_List ( reax_system *system )
     cudaCheckError ();
-void Reset_Neighbor_Lists( reax_system *system, control_params *control, 
-        static_storage *workspace, list **lists )
-    int i, tmp;
-    list *bonds = (*lists) + BONDS;
-    list *hbonds = (*lists) + HBONDS;
-    for( i = 0; i < system->N; ++i ) {
-        tmp = Start_Index( i, bonds );
-        Set_End_Index( i, tmp, bonds );
-    }
-    //TODO check if this is needed
-    memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
-    if( control->hb_cut > 0 )
-        for( i = 0; i < system->N; ++i )
-            if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) {
-                tmp = Start_Index( workspace->hbond_index[i], hbonds );
-                Set_End_Index( workspace->hbond_index[i], tmp, hbonds );
-                /* fprintf( stderr, "i:%d, hbond: %d-%d\n", 
-                   i, Start_Index( workspace->hbond_index[i], hbonds ), 
-                   End_Index( workspace->hbond_index[i], hbonds ) );*/
-            }
-void Reset( reax_system *system, control_params *control,  
-        simulation_data *data, static_storage *workspace, list **lists  )
-    Reset_Atoms( system );
-    Reset_Simulation_Data( data );
-    if( control->ensemble == NPT || control->ensemble == sNPT || 
-            control->ensemble == iNPT )
-        Reset_Pressures( data );
-    Reset_Workspace( system, workspace );  
-    Reset_Neighbor_Lists( system, control, workspace, lists );
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "reset - ");
 void Cuda_Reset_Sparse_Matrix (reax_system *system, static_storage *workspace)
@@ -244,6 +135,7 @@ void Cuda_Reset_Sparse_Matrix (reax_system *system, static_storage *workspace)
     cuda_memset (workspace->H.val, 0, (system->N * system->max_sparse_matrix_entries) * INT_SIZE, RES_SPARSE_MATRIX_INDEX );
 void Cuda_Reset( reax_system *system, control_params *control,  
         simulation_data *data, static_storage *workspace, list **lists  )
@@ -251,7 +143,7 @@ void Cuda_Reset( reax_system *system, control_params *control,
     //Reset_Simulation_Data( data );
     Cuda_Sync_Simulation_Data ( data );
-    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+    //Sync_Host_Device_Data( data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice );
     if( control->ensemble == NPT || control->ensemble == sNPT || 
             control->ensemble == iNPT )
@@ -268,23 +160,7 @@ void Cuda_Reset( reax_system *system, control_params *control,
-void Reset_Grid( grid *g )
-    memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]);
 void Cuda_Reset_Grid (grid *g)
     cuda_memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2], RES_GRID_TOP);
-void Reset_Marks( grid *g, ivec *grid_stack, int grid_top )
-    int i;
-    for( i = 0; i < grid_top; ++i )
-        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + 
-            grid_stack[i][1] * g->ncell[2] + 
-            grid_stack[i][2]] = 0;
diff --git a/PuReMD-GPU/src/cuda_reset_utils.h b/PuReMD-GPU/src/cuda_reset_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf730b935cef4857eb8e0aa97d614ef122bfca0e
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_reset_utils.h
@@ -0,0 +1,45 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_RESET_UTILS_H_ /* include guard; NOTE(review): identifiers containing a double underscore are reserved for the implementation */
+#define __CUDA_RESET_UTILS_H_
+#include "mytypes.h"
+#ifdef __cplusplus /* give the declarations below C linkage when compiled as C++ */
+extern "C"  {
+void Cuda_Reset_Grid( grid* ); /* clears g->top with cuda_memset */
+void Cuda_Reset_Workspace (reax_system *, static_storage *); /* clears workspace arrays with cuda_memset */
+void Cuda_Reset( reax_system*, control_params*, simulation_data*, /* top-level reset entry point on the CUDA side */
+        static_storage*, list** );
+void Cuda_Reset_Atoms (reax_system *); /* launches the Reset_Atoms kernel */
+#ifdef __cplusplus /* presumably guards the closing brace of the C-linkage block, which is not visible in this hunk */
diff --git a/PuReMD-GPU/src/single_body_interactions.cu b/PuReMD-GPU/src/cuda_single_body_interactions.cu
similarity index 63%
rename from PuReMD-GPU/src/single_body_interactions.cu
rename to PuReMD-GPU/src/cuda_single_body_interactions.cu
index 3c6c08822fb056c18c78bf4db5f7aa334496ea34..530e63d7071135b143d0b432bf1350070d46667c 100644
--- a/PuReMD-GPU/src/single_body_interactions.cu
+++ b/PuReMD-GPU/src/cuda_single_body_interactions.cu
@@ -18,7 +18,8 @@
-#include "single_body_interactions.h"
+#include "cuda_single_body_interactions.h"
 #include "bond_orders.h"
 #include "list.h"
 #include "lookup.h"
@@ -28,301 +29,6 @@
 #include "cuda_helpers.h"
-void LonePair_OverUnder_Coordination_Energy( reax_system *system, 
-        control_params *control, 
-        simulation_data *data,
-        static_storage *workspace, 
-        list **lists, 
-        output_controls *out_control )
-    int i, j, pj, type_i, type_j;
-    real Delta_lpcorr, dfvl;
-    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-    real e_ov, CEover1, CEover2, CEover3, CEover4;
-    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
-    real exp_ovun2n, exp_ovun6, exp_ovun8;
-    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
-    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-    real p_lp1, p_lp2, p_lp3;
-    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    bond_data *pbond;
-    bond_order_data *bo_ij; 
-    list *bonds = (*lists) + BONDS;
-    /* Initialize parameters */
-    p_lp1 = system->reaxprm.gp.l[15];
-    p_lp3 = system->reaxprm.gp.l[5];
-    p_ovun3 = system->reaxprm.gp.l[32];
-    p_ovun4 = system->reaxprm.gp.l[31];
-    p_ovun6 = system->reaxprm.gp.l[6];
-    p_ovun7 = system->reaxprm.gp.l[8];
-    p_ovun8 = system->reaxprm.gp.l[9];
-    for( i = 0; i < system->N; ++i ) {
-        /* set the parameter pointer */
-        type_i = system->atoms[i].type;
-        sbp_i = &(system->reaxprm.sbp[ type_i ]);
-        /* lone-pair Energy */
-        p_lp2 = sbp_i->p_lp2;      
-        expvd2 = EXP( -75 * workspace->Delta_lp[i] );
-        inv_expvd2 = 1. / (1. + expvd2 );
-        /* calculate the energy */
-        data->E_Lp += e_lp = 
-            p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-        dElp = p_lp2 * inv_expvd2 + 
-            75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
-        CElp = dElp * workspace->dDelta_lp[i];
-        workspace->CdDelta[i] += CElp;      // lp - 1st term
-        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
-                p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
-        fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
-                workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
-        Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
-        /* correction for C2 */
-        if( system->reaxprm.gp.l[5] > 0.001 && 
-                !strcmp( system->reaxprm.sbp[type_i].name, "C" ) )
-            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-                if( i < bonds->select.bond_list[pj].nbr ) {
-                    j = bonds->select.bond_list[pj].nbr;
-                    type_j = system->atoms[j].type;
-                    if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) {
-                        twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-                        bo_ij = &( bonds->select.bond_list[pj].bo_data );
-                        Di = workspace->Delta[i];
-                        vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
-                        if( vov3 > 3. ) {
-                            data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0);
-                            //estrain(i) += e_lph;
-                            deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-                            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
-                            bo_ij->Cdbo += deahu2dbo;
-                            workspace->CdDelta[i] += deahu2dsbo;
-                            fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
-                                    // workspace->orig_id[i], workspace->orig_id[j],
-                                    i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
-                            Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
-                            Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
-                        }
-                    }
-                }
-    }
-    for( i = 0; i < system->N; ++i ) {
-        type_i = system->atoms[i].type;
-        sbp_i = &(system->reaxprm.sbp[ type_i ]);
-        /* over-coordination energy */
-        if( sbp_i->mass > 21.0 ) 
-            dfvl = 0.0;
-        else dfvl = 1.0; // only for 1st-row elements
-        p_ovun2 = sbp_i->p_ovun2;
-        sum_ovun1 = 0;
-        sum_ovun2 = 0;
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
-            j = bonds->select.bond_list[pj].nbr;
-            type_j = system->atoms[j].type;      
-            bo_ij = &(bonds->select.bond_list[pj].bo_data);
-            sbp_j = &(system->reaxprm.sbp[ type_j ]);
-            twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-            sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-            sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-                ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
-              i+1, j+1, 
-              dfvl * workspace->Delta_lp_temp[j],
-              sbp_j->nlp_opt,
-              workspace->nlp_temp[j] );*/
-        }
-        exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
-        inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-        Delta_lpcorr  = workspace->Delta[i] - 
-            (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
-        exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
-        inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
-        DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
-        CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
-        data->E_Ov += e_ov = sum_ovun1 * CEover1;
-        CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-            ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
-        CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
-        CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
-            p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
-        /* under-coordination potential */
-        p_ovun2 = sbp_i->p_ovun2;
-        p_ovun5 = sbp_i->p_ovun5;
-        exp_ovun2n = 1.0 / exp_ovun2;
-        exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
-        exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
-        inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
-        inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
-        data->E_Un += e_un =
-            -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
-        CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
-                p_ovun2 * e_un * exp_ovun2n);
-        CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-        CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-        CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
-            p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
-        //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
-        //       i+1, sum_ovun2, e_ov, e_un );
-        /* forces */
-        workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
-        workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
-        Add_dDelta( system, lists, i, CEover3, workspace->f_ov );  // OvCoor - 2nd
-        Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
-        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
-            pbond = &(bonds->select.bond_list[pj]);
-            j = pbond->nbr;
-            type_j = system->atoms[j].type;
-            bo_ij = &(pbond->bo_data);
-            twbp  = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
-            workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-                (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
-            bo_ij->Cdbopi += CEover4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-            bo_ij->Cdbopi2 += CEover4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-            workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) *
-                (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
-            bo_ij->Cdbopi += CEunder4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-            bo_ij->Cdbopi2 += CEunder4 * 
-                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-            /* fprintf( out_control->eov, "%6d%23.15e%23.15e"
-               workspace->orig_id[j]+1,
-            //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
-            CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
-            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-              workspace->orig_id[j]+1, 
-              CEover4,
-              CEover4*
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
-              (1.0 - dfvl*workspace->dDelta_lp[j]),
-              CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
-            /* fprintf( out_control->eun, "%6d%23.15e\n",
-               workspace->orig_id[j]+1, CEunder3 ); */
-            /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
-              workspace->orig_id[j]+1,
-              CEunder4,
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEunder4*
-              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-              CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
-            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
-                    workspace->f_ov ); // OvCoor - 1st term
-            Add_dDelta( system, lists, j,
-                    CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-                    (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
-            Add_dBOpinpi2( system, lists, i, pj, 
-                    CEover4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    CEover4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
-            Add_dDelta( system, lists, j,
-                    CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-                    (bo_ij->BO_pi + bo_ij->BO_pi2),
-                    workspace->f_un ); // UnCoor - 2a
-            Add_dBOpinpi2( system, lists, i, pj, 
-                    CEunder4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    CEunder4 * (workspace->Delta[j] - 
-                        dfvl * workspace->Delta_lp_temp[j]),
-                    workspace->f_un, workspace->f_un ); // UnCoor - 2b
-        }
-#ifdef TEST_ENERGY      
-        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", 
-                i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); 
-        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-                i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
-        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-                i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
-    }
-//CUDA Functions
 GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
         static_storage p_workspace, simulation_data *data,
@@ -374,7 +80,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-        atomicAdd (&data->E_Lp, e_lp);
+        MYATOMICADD(&data->E_Lp, e_lp);
         dElp = p_lp2 * inv_expvd2 + 
             75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
@@ -382,7 +88,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         //workspace->CdDelta[i] += CElp;      // lp - 1st term
-        atomicAdd (&workspace->CdDelta[i], CElp);
+        MYATOMICADD(&workspace->CdDelta[i], CElp);
@@ -407,7 +113,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
                     type_j = atoms[j].type;
                     if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
-                        twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+                        twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
                         bo_ij = &( bonds->select.bond_list[pj].bo_data );
                         Di = workspace->Delta[i];
                         vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
@@ -416,7 +122,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
                             //PERFORMANCE IMPACT
                             e_lph = p_lp3 * SQR(vov3-3.0);
-                            atomicAdd (&data->E_Lp, e_lph );
+                            MYATOMICADD(&data->E_Lp, e_lph );
                             //estrain(i) += e_lph;
                             deahu2dbo = 2.*p_lp3*(vov3 - 3.);
@@ -426,7 +132,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
                             //PERFORMANCE IMPACT
-                            atomicAdd (&workspace->CdDelta[i], deahu2dsbo);
+                            MYATOMICADD(&workspace->CdDelta[i], deahu2dsbo);
@@ -469,7 +175,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         type_j = atoms[j].type;      
         bo_ij = &(bonds->select.bond_list[pj].bo_data);
         sbp_j = &(sbp[ type_j ]);
-        twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+        twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
         sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
         sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
@@ -500,7 +206,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     //data->E_Ov += e_ov = sum_ovun1 * CEover1;
     e_ov = sum_ovun1 * CEover1;
-    atomicAdd (&data->E_Ov, e_ov ); 
+    MYATOMICADD(&data->E_Ov, e_ov ); 
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
         ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
@@ -523,7 +229,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
-    atomicAdd (&data->E_Un, e_un );
+    MYATOMICADD(&data->E_Un, e_un );
     CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
             p_ovun2 * e_un * exp_ovun2n);
@@ -537,8 +243,8 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     // forces 
-    atomicAdd (&workspace->CdDelta[i] , CEover3);   // OvCoor - 2nd term
-    atomicAdd (&workspace->CdDelta[i], CEunder3);  // UnCoor - 1st term
+    MYATOMICADD(&workspace->CdDelta[i] , CEover3);   // OvCoor - 2nd term
+    MYATOMICADD(&workspace->CdDelta[i], CEunder3);  // UnCoor - 1st term
@@ -553,13 +259,13 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         j = pbond->nbr;
         type_j = atoms[j].type;
         bo_ij = &(pbond->bo_data);
-        twbp  = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+        twbp  = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
         bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
-        atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a
+        MYATOMICADD(&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a
         bo_ij->Cdbopi += CEover4 * 
             (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
@@ -568,7 +274,7 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
-        atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) );   // UnCoor - 2a
+        MYATOMICADD(&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) );   // UnCoor - 2a
         bo_ij->Cdbopi += CEunder4 * 
             (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
@@ -647,14 +353,11 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 //TEST ONLY CODE -- See if this is working.
-//CUDA Functions
 GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
         static_storage p_workspace, simulation_data *data,
@@ -705,7 +408,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     // calculate the energy 
     e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-    //atomicAdd (&data->E_Lp, e_lp );
+    //MYATOMICADD(&data->E_Lp, e_lp );
     E_Lp [ i ] = e_lp;
     dElp = p_lp2 * inv_expvd2 + 
@@ -723,7 +426,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     type_j = atoms[j].type;
     if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
-    twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+    twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
     bo_ij = &( bonds->select.bond_list[pj].bo_data );
     Di = workspace->Delta[i];
     vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
@@ -732,7 +435,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     e_lph = p_lp3 * SQR(vov3-3.0);
     E_Lp [i] += e_lph;
-    //atomicAdd (&data->E_Lp, e_lph );
+    //MYATOMICADD(&data->E_Lp, e_lph );
     //estrain(i) += e_lph;
     deahu2dbo = 2.*p_lp3*(vov3 - 3.);
@@ -769,7 +472,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         type_j = atoms[j].type;      
         bo_ij = &(bonds->select.bond_list[pj].bo_data);
         sbp_j = &(sbp[ type_j ]);
-        twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+        twbp = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
         sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
         sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
@@ -790,7 +493,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     e_ov = sum_ovun1 * CEover1;
     E_Ov [ i ] = e_ov;
-    //atomicAdd ( &data->E_Ov, e_ov );
+    //MYATOMICADD( &data->E_Ov, e_ov );
     CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
         ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
@@ -813,7 +516,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
     E_Un [i] = e_un;
-    //atomicAdd ( &data->E_Un, e_un );
+    //MYATOMICADD( &data->E_Un, e_un );
     CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
             p_ovun2 * e_un * exp_ovun2n);
@@ -831,7 +534,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
         j = pbond->nbr;
         type_j = atoms[j].type;
         bo_ij = &(pbond->bo_data);
-        twbp  = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+        twbp  = &(tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
         bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
@@ -854,7 +557,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, global_parameters g_params, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
         static_storage p_workspace, simulation_data *data,
@@ -903,7 +606,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g
     // calculate the energy 
     e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-    //atomicAdd (&data->E_Lp, e_lp );
+    //MYATOMICADD(&data->E_Lp, e_lp );
     E_Lp [ i ] = e_lp;
     dElp = p_lp2 * inv_expvd2 + 
@@ -921,7 +624,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g
                 type_j = atoms[j].type;
                 if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
-                    twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+                    twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ]);
                     bo_ij = &( bonds->select.bond_list[pj].bo_data );
                     Di = workspace->Delta[i];
                     vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
@@ -930,7 +633,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g
                         e_lph = p_lp3 * SQR(vov3-3.0);
                         E_Lp [i] += e_lph;
-                        //atomicAdd (&data->E_Lp, e_lph );
+                        //MYATOMICADD(&data->E_Lp, e_lph );
                         //estrain(i) += e_lph;
                         deahu2dbo = 2.*p_lp3*(vov3 - 3.);
@@ -943,7 +646,7 @@ GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, g
 GLOBAL void test_LonePair_Postprocess ( reax_atom *atoms, global_parameters g_params, 
         single_body_parameters *sbp, two_body_parameters *tbp, 
diff --git a/PuReMD-GPU/src/cuda_single_body_interactions.h b/PuReMD-GPU/src/cuda_single_body_interactions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ecd4b9a68b174624ac1dcf56aae4cec360750be
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_single_body_interactions.h
@@ -0,0 +1,59 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
+        single_body_parameters *, two_body_parameters *,
+        static_storage , simulation_data *,
+        list , int , int );
+GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *, global_parameters ,
+        single_body_parameters *, two_body_parameters *,
+        static_storage , simulation_data *,
+        list , int , int,
+        real *, real *, real *);
+GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
+        single_body_parameters *, two_body_parameters *,
+        static_storage , simulation_data *,
+        list , int , int,
+        real *, real *, real *);
+GLOBAL void test_LonePair_Postprocess ( reax_atom *, global_parameters ,
+        single_body_parameters *, two_body_parameters *,
+        static_storage , simulation_data *,
+        list , int , int );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/cuda_system_props.cu
similarity index 52%
rename from PuReMD-GPU/src/system_props.cu
rename to PuReMD-GPU/src/cuda_system_props.cu
index 3ec39134a3fdd647f8bb81b292d3184e51a6a2b0..7ff5c11fe6d644e6878cd5e25e67b82de39ad0f5 100644
--- a/PuReMD-GPU/src/system_props.cu
+++ b/PuReMD-GPU/src/cuda_system_props.cu
@@ -18,80 +18,29 @@
-#include "system_props.h"
+#include "cuda_system_props.h"
 #include "box.h"
 #include "vector.h"
-#include "cuda_utils.h"
+#include "cuda_center_mass.h"
 #include "cuda_copy.h"
-#include "reduction.h"
-#include "center_mass.h"
-#include "validation.h"
-real Get_Time( )
-    struct timeval tim;
-    gettimeofday(&tim, NULL );
-    return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
-real Get_Timing_Info( real t_start )
-    struct timeval tim;
-    real t_end;
+#include "cuda_utils.h"
+#include "cuda_reduction.h"
-    gettimeofday(&tim, NULL );
-    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
-    return (t_end - t_start);
+GLOBAL void k_Compute_Total_Mass(single_body_parameters *, reax_atom *, real *, size_t );
+GLOBAL void k_Compute_Kinetic_Energy(single_body_parameters *, reax_atom *, unsigned int , real *);
+GLOBAL void k_Kinetic_Energy_Reduction(simulation_data *, real *, int);
-void Temperature_Control( control_params *control, simulation_data *data, 
-        output_controls *out_control )
-    real tmp;
-    if( control->T_mode == 1 ) { // step-wise temperature control
-        if( (data->step - data->prev_steps) % 
-                ((int)(control->T_freq / control->dt)) == 0 ) {
-            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
-                control->T += control->T_rate;
-            else control->T = control->T_final;     
-        }
-    }
-    else if( control->T_mode == 2 ) { // constant slope control
-        tmp = control->T_rate * control->dt / control->T_freq;
-        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
-            control->T += tmp;       
-    }
 void prep_dev_system (reax_system *system) 
     //copy the system atoms to the device
-    Sync_Host_Device ( system, cudaMemcpyHostToDevice );
+    Sync_Host_Device_Sys( system, cudaMemcpyHostToDevice );
-void Compute_Total_Mass( reax_system *system, simulation_data *data )
-    int i;
-    int blocks;
-    int block_size;
-    real    *partial_sums = 0;
-    data->M = 0;
-    for( i = 0; i < system->N; i++ ) 
-        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
-    data->inv_M = 1. / data->M;    
 void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
     real    *partial_sums = (real *) scratch;
@@ -100,7 +49,7 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
     //cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0);
     cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH );
-    Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+    k_Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
         (system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N);
     cudaThreadSynchronize ();
     cudaCheckError ();
@@ -133,158 +82,6 @@ void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
-GLOBAL void Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
-    extern __shared__ real sdata[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0; 
-    if(i < n) 
-        x = sbp [ atoms[ i ].type ].mass;
-    sdata[threadIdx.x] = x; 
-    __syncthreads();
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) 
-    {  
-        if(threadIdx.x < offset)
-        {  
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }  
-        __syncthreads();
-    }  
-    if(threadIdx.x == 0) 
-    {  
-        per_block_results[blockIdx.x] = sdata[0];
-    }
-void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
-        FILE *fout )
-    int i;
-    real m, xx, xy, xz, yy, yz, zz, det;
-    rvec tvec, diff;
-    rtensor mat, inv;
-    int blocks;
-    int block_size;
-    rvec *l_xcm, *l_vcm, *l_amcm;
-    real t_start, t_end;
-    rvec_MakeZero( data->xcm );  // position of CoM
-    rvec_MakeZero( data->vcm );  // velocity of CoM
-    rvec_MakeZero( data->amcm ); // angular momentum of CoM
-    rvec_MakeZero( data->avcm ); // angular velocity of CoM
-    /* Compute the position, velocity and angular momentum about the CoM */
-    for( i = 0; i < system->N; ++i ) {
-        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
-        rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
-        rvec_ScaledAdd( data->vcm, m, system->atoms[i].v );
-        rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
-        rvec_ScaledAdd( data->amcm, m, tvec );
-        /*fprintf( fout,"%3d  %g %g %g\n",
-          i+1, 
-          system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
-          fprintf( fout, "vcm:  %g %g %g\n", 
-          data->vcm[0], data->vcm[1], data->vcm[2] );  
-         */
-    }
-    rvec_Scale( data->xcm, data->inv_M, data->xcm );
-    rvec_Scale( data->vcm, data->inv_M, data->vcm );
-    rvec_Cross( tvec, data->xcm, data->vcm );
-    rvec_ScaledAdd( data->amcm, -data->M, tvec );
-    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-    /* Calculate and then invert the inertial tensor */
-    xx = xy = xz = yy = yz = zz = 0;
-    for( i = 0; i < system->N; ++i ) {
-        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
-        rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
-        xx += diff[0] * diff[0] * m;
-        xy += diff[0] * diff[1] * m;
-        xz += diff[0] * diff[2] * m;
-        yy += diff[1] * diff[1] * m;
-        yz += diff[1] * diff[2] * m;
-        zz += diff[2] * diff[2] * m;      
-    }
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " xx: %f \n", xx);
-    fprintf (stderr, " xy: %f \n", xy);
-    fprintf (stderr, " xz: %f \n", xz);
-    fprintf (stderr, " yy: %f \n", yy);
-    fprintf (stderr, " yz: %f \n", yz);
-    fprintf (stderr, " zz: %f \n", zz);
-    mat[0][0] = yy + zz;     
-    mat[0][1] = mat[1][0] = -xy;
-    mat[0][2] = mat[2][0] = -xz;
-    mat[1][1] = xx + zz;
-    mat[2][1] = mat[1][2] = -yz;
-    mat[2][2] = xx + yy;
-    /* invert the inertial tensor */
-    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
-            mat[0][1] * mat[1][2] * mat[2][0] + 
-            mat[0][2] * mat[1][0] * mat[2][1] ) -
-        ( mat[0][0] * mat[1][2] * mat[2][1] + 
-          mat[0][1] * mat[1][0] * mat[2][2] + 
-          mat[0][2] * mat[1][1] * mat[2][0] );
-    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-    if( fabs(det) > ALMOST_ZERO )
-        rtensor_Scale( inv, 1./det, inv );
-    else 
-        rtensor_MakeZero( inv );
-    /* Compute the angular velocity about the centre of mass */
-    rtensor_MatVec( data->avcm, inv, data->amcm );  
-    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-#if defined(DEBUG)
-    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
-            data->xcm[0], data->xcm[1], data->xcm[2] );
-    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
-            data->vcm[0], data->vcm[1], data->vcm[2] );
-    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
-            data->amcm[0], data->amcm[1], data->amcm[2] );
-    /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-       mat[0][0], mat[0][1], mat[0][2], 
-       mat[1][0], mat[1][1], mat[1][2], 
-       mat[2][0], mat[2][1], mat[2][2] );
-       fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-       inv[0][0], inv[0][1], inv[0][2], 
-       inv[1][0], inv[1][1], inv[1][2], 
-       inv[2][0], inv[2][1], inv[2][2] );
-       fflush( fout ); */
-    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
-            data->avcm[0], data->avcm[1], data->avcm[2] );
 void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
         FILE *fout )
@@ -316,12 +113,12 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     l_vcm = r_scratch + (BLOCKS_POW_2 + 1);
     l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1);
-    center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> 
+    k_center_of_mass_blocks<<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> 
         (system->reaxprm.d_sbp, system->d_atoms, l_xcm, l_vcm, l_amcm, system->N);
     cudaThreadSynchronize ();
     cudaCheckError ();
-    center_of_mass <<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> 
+    k_center_of_mass<<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> 
         (l_xcm, l_vcm, l_amcm, 
          l_xcm + BLOCKS_POW_2, 
          l_vcm + BLOCKS_POW_2, 
@@ -391,18 +188,18 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH );
     local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1));
-    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> 
+    k_compute_center_mass_sbp<<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> 
         (system->reaxprm.d_sbp, system->d_atoms, partial_results, 
          data->xcm[0], data->xcm[1], data->xcm[2], system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
-    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> 
+    k_compute_center_mass<<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> 
         (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
+    cudaThreadSynchronize( );
+    cudaCheckError( );
-    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device( local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__ );
 #ifdef __BUILD_DEBUG__
     if (check_zero (local_results[0],xx) ||
@@ -456,16 +253,19 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
     inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
     if( fabs(det) > ALMOST_ZERO )
+    {
         rtensor_Scale( inv, 1./det, inv );
+    }
+    {
         rtensor_MakeZero( inv );
+    }
     /* Compute the angular velocity about the centre of mass */
     rtensor_MatVec( data->avcm, inv, data->amcm );  
     data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-    //free the resources
-    free (local_results);
+    free( local_results );
 #if defined(DEBUG)
     fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
@@ -489,34 +289,51 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
-void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
+void Cuda_Compute_Kinetic_Energy( reax_system *system, simulation_data *data )
-    int i;
-    rvec p;
-    real m;
+    real *results = (real *) scratch;
+    cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
+    k_Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+        (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results);
+    cudaThreadSynchronize (); 
+    cudaCheckError ();
-    data->E_Kin = 0.0;
+    k_Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+        ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2);
+    cudaThreadSynchronize (); 
+    cudaCheckError ();
-    for (i=0; i < system->N; i++) {
-        m = system->reaxprm.sbp[system->atoms[i].type].mass;
-        rvec_Scale( p, m, system->atoms[i].v );
-        data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v );
+GLOBAL void k_Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0; 
-        /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n",
-           i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-           system->reaxprm.sbp[system->atoms[i].type].mass); */
-    }
+    if(i < n) 
+        x = sbp [ atoms[ i ].type ].mass;
-    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+    sdata[threadIdx.x] = x; 
+    __syncthreads();
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) 
+    {  
+        if(threadIdx.x < offset)
+        {  
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }  
+        __syncthreads();
+    }  
-    if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
-        data->therm.T = ALMOST_ZERO;
+    if(threadIdx.x == 0) 
+    {  
+        per_block_results[blockIdx.x] = sdata[0];
+    }
-GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
+GLOBAL void k_Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
         unsigned int N, real *output)
     extern __shared__ real sh_ekin[];
@@ -547,8 +364,8 @@ GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atom
-GLOBAL void Kinetic_Energy_Reduction (simulation_data *data,
-        real *input, int n)
+GLOBAL void k_Kinetic_Energy_Reduction (simulation_data *data, real *input, int n)
     extern __shared__ real sdata[];
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -582,20 +399,6 @@ GLOBAL void Kinetic_Energy_Reduction (simulation_data *data,
-void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data)
-    real *results = (real *) scratch;
-    cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
-    Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-        (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results);
-    cudaThreadSynchronize (); 
-    cudaCheckError ();
-    Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
-        ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2);
-    cudaThreadSynchronize (); 
-    cudaCheckError ();
    GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
@@ -658,119 +461,3 @@ data->therm.T = ALMOST_ZERO;
-/* IMPORTANT: This function assumes that current kinetic energy and 
- *  the center of mass of the system is already computed before. 
- *
- * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
- *  to be added when there are long-range interactions or long-range 
- *  corrections to short-range interactions present.
- *  We may want to add that for more accuracy. 
- */
-void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
-        simulation_data* data, 
-        output_controls *out_control )
-    int i;
-    reax_atom *p_atom;
-    rvec tx;
-    rvec tmp;
-    simulation_box *box = &(system->box);
-    /* Calculate internal pressure */
-    rvec_MakeZero( data->int_press );
-    // 0: both int and ext, 1: ext only, 2: int only
-    if( control->press_mode == 0 || control->press_mode == 2 ) {
-        for( i = 0; i < system->N; ++i ) {
-            p_atom = &( system->atoms[i] );
-            /* transform x into unitbox coordinates */
-            Transform_to_UnitBox( p_atom->x, box, 1, tx );
-            /* this atom's contribution to internal pressure */
-            rvec_Multiply( tmp, p_atom->f, tx );
-            rvec_Add( data->int_press, tmp );
-            if( out_control->debug_level > 0 ) {
-                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
-                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
-                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
-                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
-                        data->int_press[0],data->int_press[1],data->int_press[2]);
-            }
-        }
-    }
-    /* kinetic contribution */
-    data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
-    /* Calculate total pressure in each direction */  
-    data->tot_press[0] = data->kin_press - 
-        ((data->int_press[0] + data->ext_press[0]) /
-         (box->box_norms[1] * box->box_norms[2] * P_CONV));
-    data->tot_press[1] = data->kin_press - 
-        ((data->int_press[1] + data->ext_press[1])/
-         (box->box_norms[0] * box->box_norms[2] * P_CONV));
-    data->tot_press[2] = data->kin_press - 
-        ((data->int_press[2] + data->ext_press[2])/
-         (box->box_norms[0] * box->box_norms[1] * P_CONV));
-    /* Average pressure for the whole box */
-    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
-void Compute_Pressure_Isotropic_Klein( reax_system* system, 
-        simulation_data* data )
-    int i;
-    reax_atom *p_atom;
-    rvec dx;
-    // IMPORTANT: This function assumes that current kinetic energy and 
-    // the center of mass of the system is already computed before.
-    data->iso_bar.P = 2.0 * data->E_Kin;
-    for( i = 0; i < system->N; ++i )
-    {
-        p_atom = &( system->atoms[i] );
-        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
-        data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
-    }
-    data->iso_bar.P /= (3.0 * system->box.volume);
-    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
-    // to be added when there are long-range interactions or long-range 
-    // corrections to short-range interactions present.
-    // We may want to add that for more accuracy.
-void Compute_Pressure( reax_system* system, simulation_data* data, 
-        static_storage *workspace )
-    int i;
-    reax_atom *p_atom;
-    rtensor temp;
-    rtensor_MakeZero( data->flex_bar.P );
-    for( i = 0; i < system->N; ++i ) {
-        p_atom = &( system->atoms[i] );
-        // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
-        rvec_OuterProduct( temp, p_atom->v, p_atom->v );
-        rtensor_ScaledAdd( data->flex_bar.P, 
-                system->reaxprm.sbp[ p_atom->type ].mass, temp );
-        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
-        rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
-    }
-    rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P );
-    data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0;
diff --git a/PuReMD-GPU/src/cuda_system_props.h b/PuReMD-GPU/src/cuda_system_props.h
new file mode 100644
index 0000000000000000000000000000000000000000..026999e9e7bbc6ecc70b0945e7a30bd853a6cbe0
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_system_props.h
@@ -0,0 +1,42 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#ifndef __CUDA_SYSTEM_PROP_H_ /* include guard for this header */
+#define __CUDA_SYSTEM_PROP_H_
+#include "mytypes.h"
+#ifdef __cplusplus /* when compiled as C++, give the prototypes below C linkage */
+extern "C"  {
+void prep_dev_system (reax_system *system); /* declaration only; the definition is not part of this header */
+void Cuda_Compute_Total_Mass( reax_system*, simulation_data* );
+void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* ); /* NOTE(review): FILE is used but <stdio.h> is not included here; presumably it arrives through mytypes.h - confirm */
+void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.cu b/PuReMD-GPU/src/cuda_three_body_interactions.cu
new file mode 100644
index 0000000000000000000000000000000000000000..038b88402b49e516ad4594de520a09464b4c7c8e
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_three_body_interactions.cu
@@ -0,0 +1,1636 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_three_body_interactions.h"
+#include "bond_orders.h"
+#include "list.h"
+#include "lookup.h"
+#include "vector.h"
+#include "index_utils.h"
+#include "cuda_helpers.h"
+/* Calculates the theta angle between i-j-k.
+ *
+ * dvec_ji, dvec_jk : the two vectors whose enclosed angle is wanted
+ * d_ji, d_jk       : divisors applied to the dot product; the call site in
+ *                    this file passes each bond's ->d next to its ->dvec,
+ *                    presumably the vector lengths - TODO confirm
+ * theta            : out, ACOS of the clamped cosine
+ * cos_theta        : out, Dot(dvec_ji, dvec_jk, 3) / (d_ji * d_jk), clamped
+ *                    to [-1, 1]
+ *
+ * The product d_ji * d_jk is not checked against zero. */
+DEVICE void d_Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+        real *theta, real *cos_theta )
+    (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
+    if( *cos_theta > 1. ) *cos_theta  = 1.0; /* clamp from above before ACOS */
+    if( *cos_theta < -1. ) *cos_theta  = -1.0; /* clamp from below before ACOS */
+    (*theta) = ACOS( *cos_theta );
+/* Calculates the derivative of the cosine of the angle between i-j-k.
+ *
+ * With c = Dot(dvec_ji, dvec_jk, 3) / (d_ji * d_jk), the expressions in the
+ * loop reduce algebraically, for each component t, to
+ *   (*dcos_theta_di)[t] = dvec_jk[t] / (d_ji * d_jk) - c * dvec_ji[t] / d_ji^2
+ *   (*dcos_theta_dk)[t] = dvec_ji[t] / (d_ji * d_jk) - c * dvec_jk[t] / d_jk^2
+ *   (*dcos_theta_dj)[t] = -( (*dcos_theta_di)[t] + (*dcos_theta_dk)[t] )
+ *
+ * dvec_ji, dvec_jk : the two vectors meeting at j
+ * d_ji, d_jk       : scalars paired with the vectors, presumably their
+ *                    lengths - TODO confirm against the caller
+ * dcos_theta_di/dj/dk : out, one rvec each, all three components written
+ *
+ * The product d_ji * d_jk is not checked against zero. */
+DEVICE void d_Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, 
+        rvec* dcos_theta_dk )
+    int  t;
+    real sqr_d_ji   = SQR(d_ji);
+    real sqr_d_jk   = SQR(d_jk);
+    real inv_dists  = 1.0 / (d_ji * d_jk);
+    real inv_dists3 = POW( inv_dists, 3 ); /* 1 / (d_ji * d_jk)^3 */
+    real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
+    real Cdot_inv3  = dot_dvecs * inv_dists3; /* factor shared by all three outputs */
+    for( t = 0; t < 3; ++t ) {
+        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
+            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
+            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
+            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+    }
+    /*fprintf( stderr, 
+      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+      dvec_jk[t] * inv_dists*/
+/* this is a 3-body interaction in which the main role is 
+   played by j which sits in the middle of the other two.
+
+   Work split: each thread handles one central atom
+   j = blockIdx.x * blockDim.x + threadIdx.x and returns at once when j >= N.
+
+   For every bond pi of j the thread starts writing three-body entries at
+   Start_Index(pi, thb_intrs) and closes the range with Set_End_Index.
+   When the bond's BO minus control->thb_cut is positive:
+     - for pk < pi, an entry is copied from the one already stored under
+       bond pk (written by this same thread in an earlier pi iteration),
+       with dcos_di and dcos_dk swapped; no energy is added for copies;
+     - for pk > pi, theta and the dcos_* vectors are computed and stored,
+       and, if the bond-order cutoff test passes, the angle, penalty and
+       coalition energy terms and their force contributions are added.
+
+   Outputs:
+     - E_Ang[j], E_Pen[j], E_Coa[j] and aux_ext_press[j] are accumulated
+       with += and are not zeroed here, so the caller has to initialise
+       them.
+     - Contributions that belong to the neighbours i and k are stored on
+       the bond records (pbond->f, pbond->CdDelta_ij) instead of atoms[i],
+       atoms[k] or workspace->CdDelta[i/k]; the atomic variants are left
+       commented out. k_Three_Body_Interactions_results reads those bond
+       fields.
+     - aux_ext_press[j] is only touched when control->ensemble is none of
+       NVE, NVT, bNVT.
+
+   The data parameter is referenced only from commented-out code. */
+GLOBAL void k_Three_Body_Interactions( reax_atom *atoms,
+        single_body_parameters *sbp,
+        three_body_header *d_thbp,
+        global_parameters g_params,
+        control_params *control,
+        simulation_data *data,
+        static_storage p_workspace, 
+        list p_bonds, list p_thb_intrs,
+        int N, int num_atom_types,
+        real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press )
+    int  i, j, pi, k, pk, t;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j, start_pk, end_pk;
+    int  flag, cnt, num_thb_intrs;
+    real temp, temp_bo_jt, pBOjt7;
+    real p_val1, p_val2, p_val3, p_val4, p_val5;
+    real p_val6, p_val7, p_val8, p_val9, p_val10;
+    real p_pen1, p_pen2, p_pen3, p_pen4;
+    real p_coa1, p_coa2, p_coa3, p_coa4;
+    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
+    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    real CEpen1, CEpen2, CEpen3;
+    real e_ang, e_coa, e_pen;
+    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    real Cf7ij, Cf7jk, Cf8j, Cf9j;
+    real f7_ij, f7_jk, f8_Dj, f9_Dj;
+    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    real r_ij, r_jk;
+    real BOA_ij, BOA_jk;
+    real vlpadj;
+    rvec force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    real *total_bo;
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk, *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+    list *bonds, *thb_intrs;
+    bond_data *bond_list;
+    three_body_interaction_data *thb_list;
+    static_storage *workspace = &p_workspace;
+    j = blockIdx.x * blockDim.x + threadIdx.x; // one thread per central atom j
+    if (j >= N) return;
+    total_bo = workspace->total_bond_order;
+    bonds = &p_bonds;
+    bond_list = bonds->select.bond_list;
+    thb_intrs = &p_thb_intrs;
+    thb_list = thb_intrs->select.three_body_list;
+    /* global parameters used in these calculations */
+    p_val6 = g_params.l[14];
+    p_val8 = g_params.l[33];
+    p_val9 = g_params.l[16];
+    p_val10 = g_params.l[17];
+    //TODO check this, initially this was zero, 
+    // I am changing it to the starting index for this atom.
+    //num_thb_intrs = j * MAX_TH_BODY;
+    //for( j = 0; j < system->N; ++j ) {
+    // fprintf( out_control->eval, "j: %d\n", j );
+    type_j = atoms[j].type;
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+    p_val3 = sbp[ type_j ].p_val3;
+    p_val5 = sbp[ type_j ].p_val5;
+    SBOp = 0, prod_SBO = 1;
+    for( t = start_j; t < end_j; ++t ) {
+        bo_jt = &(bond_list[t].bo_data);
+        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+        temp = SQR( bo_jt->BO );
+        temp *= temp; 
+        temp *= temp; // temp now holds BO^8
+        prod_SBO *= EXP( -temp );
+    }
+    /* modifications to match Adri's code - 09/01/09 */
+    if( workspace->vlpex[j] >= 0 ){
+        vlpadj = 0;
+        dSBO2 = prod_SBO - 1;
+    }
+    else{
+        vlpadj = workspace->nlp[j];
+        dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+    }
+    SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+    dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+    if( SBO <= 0 )
+        SBO2 = 0, CSBO2 = 0;
+    else if( SBO > 0 && SBO <= 1 ) {
+        SBO2 = POW( SBO, p_val9 );
+        CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
+    }
+    else if( SBO > 1 && SBO < 2 ) {
+        SBO2 = 2 - POW( 2-SBO, p_val9 );
+        CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
+    }
+    else 
+        SBO2 = 2, CSBO2 = 0;  
+    expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
+    /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
+       restrictions here. such a restriction would prevent us from producing 
+       all 4-body intrs correctly */
+    for( pi = start_j; pi < end_j; ++pi ) {
+        //TODO
+        //num_thb_intrs = pi * MAX_THREE_BODIES;
+        //TODO
+        //Set_Start_Index( pi, num_thb_intrs, thb_intrs );
+        num_thb_intrs = Start_Index (pi, thb_intrs);
+        pbond_ij = &(bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+        if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
+            i = pbond_ij->nbr;
+            r_ij = pbond_ij->d;     
+            type_i = atoms[i].type;
+            // fprintf( out_control->eval, "i: %d\n", i );
+            /* first copy 3-body intrs from previously computed ones where i>k.
+IMPORTANT: if it is less costly to compute theta and its 
+derivative, we should definitely re-compute them, 
+instead of copying!
+in the second for-loop below, we compute only new 3-body intrs 
+where i < k */
+            for( pk = start_j; pk < pi; ++pk ) {
+                // fprintf( out_control->eval, "pk: %d\n", pk );
+                start_pk = Start_Index( pk, thb_intrs );
+                end_pk = End_Index( pk, thb_intrs );
+                for( t = start_pk; t < end_pk; ++t )
+                    if( thb_list[t].thb == i ) {
+                        p_ijk = &(thb_list[num_thb_intrs]);
+                        p_kji = &(thb_list[t]);
+                        p_ijk->thb = bond_list[pk].nbr;
+                        p_ijk->pthb  = pk;
+                        p_ijk->theta = p_kji->theta;              
+                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk ); // di and dk trade places in the mirrored entry
+                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+                        ++num_thb_intrs;
+                        break;
+                    }
+            }
+            /* and this is the second for loop mentioned above */
+            for( pk = pi+1; pk < end_j; ++pk ) {
+                pbond_jk = &(bond_list[pk]);
+                bo_jk    = &(pbond_jk->bo_data);
+                BOA_jk   = bo_jk->BO - control->thb_cut;
+                k        = pbond_jk->nbr;
+                type_k   = atoms[k].type;
+                p_ijk    = &( thb_list[num_thb_intrs] );
+                //CHANGE ORIGINAL
+                if (BOA_jk <= 0) continue;
+                //CHANGE ORIGINAL
+                d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
+                        pbond_jk->dvec, pbond_jk->d,
+                        &theta, &cos_theta );
+                d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
+                        pbond_jk->dvec, pbond_jk->d, 
+                        &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
+                        &(p_ijk->dcos_dk) );
+                p_ijk->thb = k;
+                p_ijk->pthb = pk;
+                p_ijk->theta = theta;
+                sin_theta = SIN( theta );
+                if( sin_theta < 1.0e-5 )
+                    sin_theta = 1.0e-5; // floor keeps the later division by sin_theta finite
+                ++num_thb_intrs; // the entry stays in the list even when the cutoff test below fails
+                if( BOA_jk > 0.0 && // BOA_jk > 0.0 already holds here because of the continue above
+                        (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
+                    r_jk = pbond_jk->d;              
+                    thbh = &( d_thbp[ index_thbp(type_i,type_j,type_k,num_atom_types) ] );
+                    flag = 0;
+                    /* if( workspace->orig_id[i] < workspace->orig_id[k] )
+                       fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                       workspace->orig_id[i], workspace->orig_id[j],
+                       workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
+                       else 
+                       fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                       workspace->orig_id[k], workspace->orig_id[j],
+                       workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+                    //TODO:
+                    //pbond_jk->scratch = thbh->cnt;
+                    for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
+                        // fprintf( out_control->eval, 
+                        // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
+                        if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                            thbp = &( thbh->prm[cnt] );
+                            /* ANGLE ENERGY */
+                            p_val1 = thbp->p_val1;
+                            p_val2 = thbp->p_val2;
+                            p_val4 = thbp->p_val4;
+                            p_val7 = thbp->p_val7;
+                            theta_00 = thbp->theta_00;
+                            exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                            f7_ij = 1.0 - exp3ij;
+                            Cf7ij = p_val3 * p_val4 * 
+                                POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+                            exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                            f7_jk = 1.0 - exp3jk;
+                            Cf7jk = p_val3 * p_val4 * 
+                                POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+                            expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                            trm8 = 1.0 + expval6 + expval7;
+                            f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                            Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                (p_val6 * expval6 * trm8 - 
+                                 (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+                            theta_0 = 180.0 - 
+                                theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                            theta_0 = DEG2RAD( theta_0 );              
+                            expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
+                            if( p_val1 >= 0 )
+                                expval12theta = p_val1 * (1.0 - expval2theta);
+                            else // To avoid linear Me-H-Me angles (6/6/06)
+                                expval12theta = p_val1 * -expval2theta;
+                            CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                            CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                            CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                            CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
+                                expval2theta * (theta_0 - theta);
+                            Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
+                                exp( -p_val10 * (2.0 - SBO2) ); // NOTE(review): lowercase exp here, EXP everywhere else in this kernel - confirm this is intended
+                            CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                            CEval6 = CEval5 * dSBO1;
+                            CEval7 = CEval5 * dSBO2;
+                            CEval8 = -CEval4 / sin_theta;
+                            e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                            //PERFORMANCE IMPACT
+                            //MYATOMICADD(&data->E_Ang, e_ang);
+                            E_Ang [j] += e_ang;
+                            /* END ANGLE ENERGY*/
+                            /* PENALTY ENERGY */
+                            p_pen1 = thbp->p_pen1;
+                            p_pen2 = g_params.l[19];
+                            p_pen3 = g_params.l[20];
+                            p_pen4 = g_params.l[21];
+                            exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                            exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                            exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                            exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                            trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                            f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                            Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
+                                    (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
+                                        p_pen4 * exp_pen4 )) /
+                                SQR( trm_pen34 );
+                            e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                            //PERFORMANCE IMPACT
+                            //MYATOMICADD(&data->E_Pen, e_pen);
+                            E_Pen [j] += e_pen;
+                            CEpen1 = e_pen * Cf9j / f9_Dj;
+                            temp   = -2.0 * p_pen2 * e_pen;
+                            CEpen2 = temp * (BOA_ij - 2.0);
+                            CEpen3 = temp * (BOA_jk - 2.0);
+                            /* END PENALTY ENERGY */
+                            /* COALITION ENERGY */
+                            p_coa1 = thbp->p_coa1;
+                            p_coa2 = g_params.l[2];
+                            p_coa3 = g_params.l[38];
+                            p_coa4 = g_params.l[30];
+                            exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+                            e_coa = 
+                                p_coa1 / (1. + exp_coa2) *
+                                EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
+                                EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
+                                EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
+                                EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                            //PERFORMANCE IMPACT
+                            //MYATOMICADD(&data->E_Coa, e_coa);
+                            E_Coa [j] += e_coa;
+                            CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                            CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                            CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
+                            CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
+                            CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
+                            /* END COALITION ENERGY */
+                            /* FORCES */
+                            /*
+                               MYATOMICADD(&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) );
+                               MYATOMICADD(&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) );
+                               MYATOMICADD(&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) );
+                               MYATOMICADD(&workspace->CdDelta[i], CEcoa4 );
+                               MYATOMICADD(&workspace->CdDelta[k], CEcoa5 );              
+                             */
+                            bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ;
+                            bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ;
+                            workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ;
+                            //MYATOMICADD(&workspace->CdDelta[i], CEcoa4 );
+                            pbond_ij->CdDelta_ij += CEcoa4 ;
+                            //MYATOMICADD(&workspace->CdDelta[k], CEcoa5 );              
+                            pbond_jk->CdDelta_ij += CEcoa5;
+                            for( t = start_j; t < end_j; ++t ) {
+                                pbond_jt = &( bond_list[t] );
+                                bo_jt = &(pbond_jt->bo_data);
+                                temp_bo_jt = bo_jt->BO;
+                                temp = CUBE( temp_bo_jt );
+                                pBOjt7 = temp * temp * temp_bo_jt; 
+                                // fprintf( out_control->eval, "%6d%12.8f\n", 
+                                // workspace->orig_id[ bond_list[t].nbr ], 
+                                //    (CEval6 * pBOjt7) );
+                                /*
+                                   MYATOMICADD(&bo_jt->Cdbo, (CEval6 * pBOjt7) );
+                                   MYATOMICADD(&bo_jt->Cdbopi, CEval5 );
+                                   MYATOMICADD(&bo_jt->Cdbopi2, CEval5 );
+                                 */
+                                bo_jt->Cdbo        += (CEval6 * pBOjt7) ;
+                                bo_jt->Cdbopi    += CEval5 ;
+                                bo_jt->Cdbopi2    += CEval5 ;
+                            }              
+                            if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                /*
+                                   atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di );
+                                   atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                   atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk );
+                                 */
+                                rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di );
+                                rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk );
+                            }
+                            else {
+                                /* terms not related to bond order derivatives
+                                   are added directly into 
+                                   forces and pressure vector/tensor */
+                                rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                                //atomic_rvecAdd( atoms[i].f, force );
+                                rvec_Add( pbond_ij->f, force );
+                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                //atomic_rvecAdd( data->ext_press, ext_press );
+                                rvec_Add( aux_ext_press [j], ext_press );
+                                //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                                //atomic_rvecAdd( atoms[k].f, force );
+                                rvec_Add( pbond_jk->f, force );
+                                rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                //atomic_rvecAdd( data->ext_press, ext_press );
+                                rvec_Add( aux_ext_press [j], ext_press );
+                                /* This part is for a fully-flexible box */
+                                /* rvec_OuterProduct( temp_rtensor, 
+                                   p_ijk->dcos_di, system->atoms[i].x );
+                                   rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
+                                   rvec_OuterProduct( temp_rtensor, 
+                                   p_ijk->dcos_dj, system->atoms[j].x );
+                                   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+                                   rvec_OuterProduct( temp_rtensor, 
+                                   p_ijk->dcos_dk, system->atoms[k].x );
+                                   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+                                   if( pbond_ij->imaginary || pbond_jk->imaginary )
+                                   rtensor_ScaledAdd( data->flex_bar.P, 
+                                   -1.0, total_rtensor );
+                                   else
+                                   rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                            }
+                            //TODO -- check this
+                            //        fprintf( out_control->eval, 
+                            //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                            //             "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                            //             i+1, j+1, k+1,
+                            //workspace->orig_id[i]+1,  
+                            //workspace->orig_id[j]+1,
+                            //workspace->orig_id[k]+1,
+                            //workspace->Delta_boc[j], 
+                            //             RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                            //             e_ang, data->E_Ang );
+                            /*fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e",
+                              p_val3, p_val4, BOA_ij, BOA_jk );
+                              fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e",
+                              f7_ij, f7_jk, f8_Dj, expval12theta );
+                              fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                              CEval1, CEval2, CEval3, CEval4, CEval5
+                            //CEval6, CEval7, CEval8  );*/
+                            /*fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                              -p_ijk->dcos_di[0]/sin_theta, 
+                              -p_ijk->dcos_di[1]/sin_theta, 
+                              -p_ijk->dcos_di[2]/sin_theta, 
+                              -p_ijk->dcos_dj[0]/sin_theta, 
+                              -p_ijk->dcos_dj[1]/sin_theta, 
+                              -p_ijk->dcos_dj[2]/sin_theta, 
+                              -p_ijk->dcos_dk[0]/sin_theta, 
+                              -p_ijk->dcos_dk[1]/sin_theta, 
+                              -p_ijk->dcos_dk[2]/sin_theta );*/
+                            /* fprintf( out_control->epen, 
+                               "%23.15e%23.15e%23.15e\n", 
+                               CEpen1, CEpen2, CEpen3 );
+                               fprintf( out_control->epen, 
+                               "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                               workspace->orig_id[i],  workspace->orig_id[j],
+                               workspace->orig_id[k], RAD2DEG(theta), 
+                               BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
+                            //        fprintf( out_control->ecoa, 
+                            //             "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                            //             workspace->orig_id[i], 
+                            //             workspace->orig_id[j],
+                            //             workspace->orig_id[k], 
+                            //             RAD2DEG(theta), BOA_ij, BOA_jk, 
+                            //             e_coa, data->E_Coa );
+#ifdef TEST_FORCES            /* angle forces */
+                            //TODO -- check this
+                            /*
+                               Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                               Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                               Add_dDelta( system, lists, 
+                               j, CEval3 + CEval7, workspace->f_ang );
+                               for( t = start_j; t < end_j; ++t ) {
+                               pbond_jt = &( bond_list[t] );
+                               bo_jt = &(pbond_jt->bo_data);
+                               temp_bo_jt = bo_jt->BO;
+                               temp = CUBE( temp_bo_jt );
+                               pBOjt7 = temp * temp * temp_bo_jt; 
+                               Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
+                               workspace->f_ang );
+                               Add_dBOpinpi2( system, lists, j, t, 
+                               CEval5, CEval5, 
+                               workspace->f_ang, workspace->f_ang );
+                               }
+                               rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                               rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                               rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                            // end angle forces 
+                            // penalty forces 
+                            Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                            Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                            Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                            // end penalty forces 
+                            // coalition forces 
+                            Add_dBO( system, lists, 
+                            j, pi, CEcoa1-CEcoa4, workspace->f_coa );
+                            Add_dBO( system, lists, 
+                            j, pk, CEcoa2-CEcoa5, workspace->f_coa );
+                            Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                            Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                            Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                            // end coalition forces 
+                             */
+                        }
+                    }
+                }
+            }
+        }
+        Set_End_Index(pi, num_thb_intrs, thb_intrs ); // runs for every bond pi, including those below the cutoff
+    }
+    //  } // end of the main for loop here
+    //TODO - to be done on the CPU
+    /*
+       if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+       workspace->realloc.num_3body = num_thb_intrs;
+       if( num_thb_intrs > thb_intrs->num_intrs ) {
+       fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
+       data->step, num_thb_intrs, thb_intrs->num_intrs );
+       exit( INSUFFICIENT_SPACE );
+       }
+       }
+     */
+    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
+    // data->step, num_thb_intrs );
+    /*
+       fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
+       fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
+       data->E_Ang, data->E_Pen, data->E_Coa );
+       fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+     */
+GLOBAL void k_Three_Body_Interactions_results (     reax_atom *atoms, control_params *control,
+        static_storage p_workspace, 
+        list p_bonds, int N ) /* Per-atom gather step.
+       Thread i = blockIdx.x * blockDim.x + threadIdx.x (returns when
+       i >= N) walks the bonds Start_Index(i)..End_Index(i) and, for each
+       bond pj, looks up the bond stored at index pbond->sym_index, then
+       adds that bond's CdDelta_ij to workspace->CdDelta[i] and its f
+       vector to atoms[i].f.
+       sym_index presumably names the same bond as listed under the
+       neighbouring atom - TODO confirm.
+       The control parameter is not referenced in this kernel. */
+    int i, pj;
+    bond_data *pbond;
+    bond_data *sym_index_bond;
+    list *bonds = &p_bonds;
+    static_storage *workspace = &p_workspace;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N) return;
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+        pbond = &(bonds->select.bond_list[pj]);
+        sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+        workspace->CdDelta [i] += sym_index_bond->CdDelta_ij; // only atom i's own slots are written by this thread
+        rvec_Add (atoms[i].f, sym_index_bond->f );
+    }
+/* Counting pass for the 3-body interactions in which j sits in the
+   middle of the other two.
+
+   Each thread handles one atom j = blockIdx.x * blockDim.x + threadIdx.x
+   and returns when j >= N. For every bond pi of j:
+     - if BO(pi) - control->thb_cut is not positive, count[pi] is 0;
+     - otherwise count[pi] is the number of other bonds pk of j
+       (pk != pi) with BO(pk) - control->thb_cut > 0.
+   No three-body list is read or written here; only count[] is stored.
+
+   Several locals (for example type_i, type_j, r_ij) are assigned but do
+   not influence the value written to count[]. */
+GLOBAL void k_Three_Body_Estimate ( reax_atom *atoms, 
+        control_params *control,
+        list p_bonds, int N, 
+        int *count)
+    int  i, j, pi, k, pk, t;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j ;
+    int  flag, cnt, num_thb_intrs;
+    real r_ij, r_jk;
+    real BOA_ij, BOA_jk;
+    list *bonds;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+    bond_data *bond_list;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+    bonds = &p_bonds;
+    bond_list = bonds->select.bond_list;
+    type_j = atoms[j].type;
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+    for( pi = start_j; pi < end_j; ++pi ) {
+        num_thb_intrs = 0; // the counter restarts for every bond pi
+        count [pi] = 0; // overwritten with num_thb_intrs at the end of this iteration
+        pbond_ij = &(bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+        if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
+            i = pbond_ij->nbr;
+            r_ij = pbond_ij->d;     
+            type_i = atoms[i].type;
+            /*
+               for( pk = start_j; pk < pi; ++pk ) {
+               start_pk = Start_Index( pk, thb_intrs );
+               end_pk = End_Index( pk, thb_intrs );
+               for( t = start_pk; t < end_pk; ++t )
+               if( thb_list[t].thb == i ) {
+               ++num_thb_intrs;
+               break;
+               }
+               }
+             */
+            /* and this is the second for loop mentioned above */
+            for( pk = start_j; pk < end_j; ++pk ) {
+                if (pk == pi) continue; // a bond is never paired with itself
+                pbond_jk = &(bond_list[pk]);
+                bo_jk    = &(pbond_jk->bo_data);
+                BOA_jk   = bo_jk->BO - control->thb_cut;
+                if (BOA_jk <= 0) continue;
+                ++num_thb_intrs;
+            }
+        }
+        count [pi] = num_thb_intrs;
+    }
+/* Hydrogen-bond energy and forces, one thread per atom j.
+ *
+ * Thread j = blockIdx.x * blockDim.x + threadIdx.x does work only when
+ * sbp[atoms[j].type].p_hbond == 1 (j is the hydrogen).  It first collects
+ * the bonds (j,i) whose partner type has p_hbond == 2 and whose bond order
+ * is at least HB_THRESHOLD, then combines each of them with every entry k
+ * of j's hydrogen-bond list (i == k is skipped).  In the handout notation
+ * i -> X, j -> H, k -> Z.
+ *
+ * Results written:
+ *   E_HB[j]          += sum of e_hb over the (i,j,k) triplets of j
+ *   atoms[j].f       += force terms acting on j
+ *   bond (j,i)        : bo_data.Cdbo += CEhb1, h_f += force term on i
+ *   hbond (j,k) h_f   : zeroed here, then the force term on k is added
+ *
+ * Dynamic shared memory: blockDim.x reals followed by blockDim.x rvecs.
+ * Each thread only touches its own slot, so no barrier is required.
+ *
+ * data, aux_ext_press and atoms_f are not referenced by this kernel.
+ */
+GLOBAL void k_Hydrogen_Bonds(reax_atom *atoms,
+        single_body_parameters *sbp,
+        hbond_parameters *d_hbp,
+        control_params *control,
+        simulation_data *data,
+        static_storage p_workspace, 
+        list p_bonds, list p_hbonds,
+        int N, int num_atom_types, 
+        real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
+{
+    extern __shared__ real t_hb[];
+    real *sh_hb = t_hb;                             // per-thread energy accumulator
+    rvec *sh_atomf = (rvec *)(t_hb + blockDim.x);   // per-thread force on atom j
+
+    int i, j, k, pi, pk, itr, top;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk, force, ext_press;
+    ivec rel_jk;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list, *hbond_jk;
+    static_storage *workspace = &p_workspace;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+
+    bonds = &p_bonds;
+    bond_list = bonds->select.bond_list;
+    hbonds = &p_hbonds;
+    hbond_list = hbonds->select.hbond_list;
+
+    sh_hb [threadIdx.x] = 0;
+    rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+
+    if( sbp[atoms[j].type].p_hbond==1) {// j must be H
+        //set j's variables 
+        type_j  = atoms[j].type;
+        start_j = Start_Index(j, bonds);
+        end_j   = End_Index(j, bonds);
+        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+
+        // collect the bond slots of j that can act as the X-H side
+        top = 0;
+        for( pi = start_j; pi < end_j; ++pi ) {
+            pbond_ij = &( bond_list[pi] );
+            i = pbond_ij->nbr;
+            bo_ij = &(pbond_ij->bo_data);
+            type_i = atoms[i].type;
+
+            // top < MAX_BONDS keeps the write inside hblist; candidates
+            // beyond the capacity of the local array are ignored
+            if( sbp[type_i].p_hbond == 2 && 
+                    bo_ij->BO >= HB_THRESHOLD &&
+                    top < MAX_BONDS )
+                hblist[top++] = pi;
+        }
+
+        for( pk = hb_start_j; pk < hb_end_j; ++pk )
+        {
+            // set k's variables 
+            hbond_jk = &( hbond_list[pk] );
+            k = hbond_list[pk].nbr;
+            type_k = atoms[k].type;
+            nbr_jk = hbond_list[pk].ptr;
+            r_jk = nbr_jk->d;
+            rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+
+            //TODO Double check this Hydrogen Bonds fix
+            // the force on k is stored per hbond entry and starts from zero
+            rvec_MakeZero ( hbond_jk->h_f );
+
+            for( itr=0; itr < top; ++itr ) {
+                pi = hblist[itr];
+                pbond_ij = &( bond_list[pi] );
+                i = pbond_ij->nbr;
+
+                if( i != k ) {
+                    bo_ij = &(pbond_ij->bo_data);
+                    type_i = atoms[i].type;
+                    hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
+
+                    d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &theta, &cos_theta );
+                    // the derivative of cos(theta)
+                    d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &dcos_theta_di, &dcos_theta_dj, 
+                            &dcos_theta_dk );
+
+                    // hydrogen bond energy
+                    sin_theta2 = SIN( theta/2.0 );
+                    sin_xhz4 = SQR(sin_theta2);
+                    sin_xhz4 *= sin_xhz4;
+                    cos_xhz1 = ( 1.0 - cos_theta );
+                    exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                    exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                r_jk / hbp->r0_hb - 2.0 ) );
+
+                    e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                    sh_hb [threadIdx.x] += e_hb;
+
+                    CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
+                    CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                    CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
+                            1.0 / hbp->r0_hb);
+
+                    // hydrogen bond forces
+                    bo_ij->Cdbo += CEhb1;   // dbo term
+
+                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                        // dcos terms: i goes to the bond entry, j to the
+                        // thread-private accumulator, k to the hbond entry
+                        rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di );
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
+                        //TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
+                        rvec_ScaledAdd( hbond_jk->h_f, 
+                                +CEhb2, dcos_theta_dk );
+
+                        //dr terms
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
+                    }
+                    else
+                    {
+                        // for pressure coupling, terms that are not related 
+                        // to bond order derivatives are added directly into 
+                        // pressure vector/tensor 
+                        // NOTE(review): ext_press is computed below but never
+                        // accumulated (the aux_ext_press updates were
+                        // commented out in the original) -- confirm whether
+                        // the pressure contribution is meant to be dropped.
+                        rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                        rvec_Add( pbond_ij->h_f, force );
+                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+
+                        rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
+
+                        ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                        rvec_Scale( force, +CEhb2, dcos_theta_dk );
+                        rvec_Add( hbond_jk->h_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+
+                        //dr terms
+                        rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+
+                        rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                        rvec_Add( hbond_jk->h_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                    }
+                } // i != k if statement
+            } //itr for statement
+        }  // pk for statement
+    } // main if statement
+
+    // flush the thread-private accumulators to global memory
+    E_HB [j]  += sh_hb [threadIdx.x];
+    rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+}
+/* Folds sdata[0..31] into sdata[0] with a halving ladder (16, 8, 4, 2, 1).
+ * At each step the threads with tid below the step size add the element
+ * that lies one step size above their own.
+ * NOTE(review): there is no barrier between steps; the routine relies on the
+ * volatile accesses and on the participating threads advancing together --
+ * confirm this holds on the target architecture.
+ */
+DEVICE void warpReduce(volatile real* sdata, int tid) 
+{
+    int step;
+
+    for ( step = 16; step > 0; step >>= 1 ) {
+        if ( tid < step ) {
+            sdata[tid] += sdata[tid + step];
+        }
+    }
+}
+/* Hydrogen-bond energy and forces with a group of __THREADS_PER_ATOM__
+ * threads cooperating on each atom j.
+ *
+ * Global thread id t maps to atom j = t / __THREADS_PER_ATOM__ and lane
+ * t & (__THREADS_PER_ATOM__ - 1), so __THREADS_PER_ATOM__ must be a power of
+ * two.  Work is done only when sbp[atoms[j].type].p_hbond == 1.  For every
+ * qualifying bond (j,i) the lanes stride over j's hydrogen-bond list, keep
+ * partial sums in shared memory and lane 0 flushes the reduced values.
+ *
+ * Results written:
+ *   E_HB[j]          += sum of e_hb over the (i,j,k) triplets of j
+ *   atoms[j].f       += force terms acting on j
+ *   bond (j,i)        : bo_data.Cdbo += sum of CEhb1, h_f += force on i
+ *   hbond (j,k) h_f  += force term on k
+ *
+ * Dynamic shared memory, in this order: blockDim.x reals (energy),
+ * blockDim.x reals (Cdbo), blockDim.x rvecs (force on j), blockDim.x rvecs
+ * (force on i).
+ *
+ * NOTE(review): hbond h_f is only accumulated here, never reset -- presumably
+ * it is zeroed before launch; confirm against the caller.
+ * NOTE(review): the reductions use no barrier and non-volatile shared
+ * memory; they rely on the lanes of one atom advancing together and on
+ * blockDim.x being a multiple of __THREADS_PER_ATOM__ -- confirm for the
+ * target architecture and launch configuration.
+ *
+ * data, aux_ext_press and atoms_f are not referenced by this kernel.
+ */
+GLOBAL void k_Hydrogen_Bonds_HB(reax_atom *atoms,
+        single_body_parameters *sbp,
+        hbond_parameters *d_hbp,
+        control_params *control,
+        simulation_data *data,
+        static_storage p_workspace, 
+        list p_bonds, list p_hbonds,
+        int N, int num_atom_types, 
+        real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
+{
+    extern __shared__ real t_hb[];
+    real *sh_hb = t_hb;                                 // energy partials
+    real *sh_cdbo = t_hb + blockDim.x;                  // Cdbo partials
+    rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);    // force on j partials
+    rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);     // force on i partials
+
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id = thread_id / __THREADS_PER_ATOM__;
+    int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
+
+    if (warp_id >= N ) return;
+
+    int i, j, k, pi, pk, itr, top;
+    int offset, count, loopcount;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    real r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list, *hbond_jk;
+    static_storage *workspace = &p_workspace;
+
+    j = warp_id;
+
+    bonds = &p_bonds;
+    bond_list = bonds->select.bond_list;
+    hbonds = &p_hbonds;
+    hbond_list = hbonds->select.hbond_list;
+
+    // loops below discover the Hydrogen bonds between i-j-k triplets.
+    // here j is H atom and there has to be some bond between i and j.
+    // Hydrogen bond is between j and k.
+    // so in this function i->X, j->H, k->Z when we map 
+    // variables onto the ones in the handout.
+    sh_hb [threadIdx.x] = 0;
+    rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+
+    if( sbp[atoms[j].type].p_hbond==1) {// j must be H
+        //set j's variables 
+        type_j  = atoms[j].type;
+        start_j = Start_Index(j, bonds);
+        end_j   = End_Index(j, bonds);
+        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+
+        // every lane builds the same candidate list for atom j
+        top = 0;
+        for( pi = start_j; pi < end_j; ++pi ) {
+            pbond_ij = &( bond_list[pi] );
+            i = pbond_ij->nbr;
+            bo_ij = &(pbond_ij->bo_data);
+            type_i = atoms[i].type;
+
+            // top < MAX_BONDS keeps the write inside hblist; candidates
+            // beyond the capacity of the local array are ignored
+            if( sbp[type_i].p_hbond == 2 && 
+                    bo_ij->BO >= HB_THRESHOLD &&
+                    top < MAX_BONDS ) {
+                hblist[top++] = pi;
+            }
+        }
+
+        // number of strided passes over j's hbond list; every lane runs the
+        // same number of passes even when its own pk is already past the end
+        // NOTE(review): the pass count uses HBONDS_THREADS_PER_ATOM while the
+        // stride uses __THREADS_PER_ATOM__ -- confirm both are the same value.
+        loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 0 : 1);
+
+        for( itr=0; itr < top; ++itr ) {
+            pi = hblist[itr];
+            pbond_ij = &( bond_list[pi] );
+            // bind bo_ij to the current bond up front: lane 0 dereferences it
+            // when flushing the reduced Cdbo even if it skipped every k itself
+            bo_ij = &(pbond_ij->bo_data);
+            i = pbond_ij->nbr;
+            type_i = atoms[i].type;
+
+            rvec_MakeZero (sh_hf [threadIdx.x]);
+            sh_cdbo [threadIdx.x] = 0;
+
+            pk = hb_start_j + lane_id;
+            for( count = 0; count < loopcount; ++count )
+            {
+                if (pk < hb_end_j)
+                {
+                    // set k's variables 
+                    hbond_jk = &( hbond_list[pk] );
+                    k = hbond_list[pk].nbr;
+                    type_k = atoms[k].type;
+                    nbr_jk = hbond_list[pk].ptr;
+                    r_jk = nbr_jk->d;
+                    rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+                }
+                else k = -1;    // this lane has no entry in this pass
+
+                if(( i != k ) && (k != -1)) {
+                    hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
+
+                    d_Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &theta, &cos_theta );
+                    // the derivative of cos(theta)
+                    d_Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &dcos_theta_di, &dcos_theta_dj, 
+                            &dcos_theta_dk );
+
+                    // hydrogen bond energy
+                    sin_theta2 = SIN( theta/2.0 );
+                    sin_xhz4 = SQR(sin_theta2);
+                    sin_xhz4 *= sin_xhz4;
+                    cos_xhz1 = ( 1.0 - cos_theta );
+                    exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                    exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                r_jk / hbp->r0_hb - 2.0 ) );
+
+                    e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                    sh_hb [threadIdx.x] += e_hb;
+
+                    CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
+                    CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                    CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
+                            1.0 / hbp->r0_hb);
+
+                    // hydrogen bond forces
+                    sh_cdbo[threadIdx.x] += CEhb1;  // dbo term, reduced below
+
+                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
+                        // dcos terms
+                        rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di );
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
+                        //TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
+                        rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk );
+
+                        //dr terms
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
+                    }
+                    // NOTE(review): for any other ensemble (pressure coupling)
+                    // no force terms are applied in this kernel; the
+                    // corresponding code was commented out in the original.
+                    // k_Hydrogen_Bonds contains the per-thread version of
+                    // that path.
+                } // i != k if statement
+
+                pk += __THREADS_PER_ATOM__;
+            }  // pk for statement
+
+            // at this point done with one bond: fold the lanes' partials
+            // into lane 0 and let it update the bond entry
+            for( offset = (__THREADS_PER_ATOM__) / 2; offset > 0; offset >>= 1 ) {
+                if (lane_id < offset) {
+                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + offset];
+                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + offset]);
+                }
+            }
+            if (lane_id == 0) {
+                bo_ij->Cdbo += sh_cdbo [threadIdx.x];
+                rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
+            }
+        } //itr for statement
+    } // main if statement
+
+    // reduce the energy and the force on j over the lanes of this atom
+    for( offset = (__THREADS_PER_ATOM__) / 2; offset > 0; offset >>= 1 ) {
+        if (lane_id < offset) {
+            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + offset];
+            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + offset] );
+        }
+    }
+    if (lane_id == 0) {
+        E_HB [j] += sh_hb [threadIdx.x];
+        rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+    }
+}
+/* Adds the hydrogen-bond force stored on bond entries to the atoms; one
+ * thread per atom.
+ *
+ * Thread i = blockIdx.x * blockDim.x + threadIdx.x walks the bond slots of
+ * atom i and, for each slot, adds the h_f vector of the slot referenced by
+ * sym_index to atoms[i].f.  Threads with i >= N do nothing.
+ *
+ * sbp, p_workspace, p_hbonds, p_far_nbrs and e_hb are not referenced by this
+ * kernel.
+ */
+GLOBAL void k_Hydrogen_Bonds_Postprocess(reax_atom *atoms, 
+        single_body_parameters *sbp,
+        static_storage p_workspace,
+        list p_bonds, list p_hbonds, list p_far_nbrs, int N, 
+        real *e_hb)
+{
+    int i, pj;
+    int start, end;
+    bond_data *pbond;
+    bond_data *sym_index_bond;
+    list *bonds = &p_bonds;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N) return;
+
+    // For processing ij information
+    start = Start_Index(i, bonds);
+    end = End_Index(i, bonds); 
+
+    for( pj = start; pj < end; ++pj ){
+        pbond = &(bonds->select.bond_list[pj]);
+        sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+
+        rvec_Add (atoms[i].f, sym_index_bond->h_f );
+    }
+}
+    /*
+       for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++)
+       {
+    // check if the neighbor is of h_type
+    nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+    j = nbr_pj->nbr;
+    sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
+    rvec_Add (atoms[i].f, sym_index_nbr->h_f );
+    }
+     */
+    //    if (workspace->hbond_index [j] != -1)
+    //    {
+    //        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+    //        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+    //        for ( hj = hb_start_j; hj < hb_end_j; hj ++ )
+    //        {
+    //            h_bond_data = &( hbonds->select.hbond_list [hj] );
+    //             nbr = h_bond_data->nbr;
+    //            if (nbr == i) {
+    //                     rvec_Add (atoms[i].f, h_bond_data->h_f );
+    //            }
+    //        }
+    //    }
+/* Per-atom block reduction over the far-neighbour list; one block per list
+ * row i = blockIdx.x, threads stride over the row.
+ *
+ * NOTE(review): the accumulation inside the loop is disabled (it was
+ * commented out in the original), so every partial sum stays zero and the
+ * kernel currently adds a zero vector to atoms[i].f.
+ *
+ * Launch requirements: blockDim.x must be a power of two and the dynamic
+ * shared memory must hold blockDim.x rvecs.  There is no check of
+ * blockIdx.x against N; the grid is presumably sized by the caller to the
+ * number of rows -- confirm.
+ *
+ * sbp, p_workspace, p_bonds, p_hbonds and N are not referenced.
+ */
+GLOBAL void k_Hydrogen_Bonds_Far_Nbrs(reax_atom *atoms, 
+        single_body_parameters *sbp,
+        static_storage p_workspace,
+        list p_bonds, list p_hbonds, list p_far_nbrs, int N )
+{
+    extern __shared__ rvec __f[];
+    int i, pj;
+    int start, end;
+    int offset;
+    list *far_nbrs = &p_far_nbrs;
+
+    i = blockIdx.x;
+
+    start = Start_Index (i, far_nbrs);
+    end = End_Index (i, far_nbrs);
+    pj = start + threadIdx.x;
+
+    rvec_MakeZero (__f[threadIdx.x]);
+
+    while (pj < end)
+    {
+        // disabled accumulation, kept for reference:
+        //nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+        //sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
+        //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
+        pj += blockDim.x;
+    }
+
+    // make every thread's partial visible before other threads read it
+    __syncthreads ();
+
+    // tree reduction over the whole block; the barrier sits outside the
+    // conditional so that all threads reach it at every step
+    for (offset = blockDim.x >> 1; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+            rvec_Add (__f[threadIdx.x], __f[threadIdx.x + offset]);
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0)
+        rvec_Add (atoms[i].f, __f[0]);
+}
+/* Adds the hydrogen-bond force stored on hbond entries to the atoms; one
+ * block per hbond-list row i = blockIdx.x, threads stride over the row.
+ *
+ * For each entry of row i the thread adds the h_f vector of the entry
+ * referenced by sym_index to its partial sum.  The partial sums are then
+ * reduced across the block and thread 0 adds the total to atoms[i].f.
+ *
+ * Launch requirements: blockDim.x must be a power of two and the dynamic
+ * shared memory must hold blockDim.x rvecs.  There is no check of
+ * blockIdx.x against N, and the row index is used directly as the atom
+ * index; the grid is presumably sized by the caller accordingly -- confirm.
+ *
+ * sbp, p_workspace, p_bonds, p_far_nbrs and N are not referenced.
+ */
+GLOBAL void k_Hydrogen_Bonds_HNbrs(reax_atom *atoms, 
+        single_body_parameters *sbp,
+        static_storage p_workspace,
+        list p_bonds, list p_hbonds, list p_far_nbrs, int N )
+{
+    extern __shared__ rvec __f[];
+    int i, pj;
+    int start, end;
+    int offset;
+    hbond_data *nbr_pj, *sym_index_nbr;
+    list *hbonds = &p_hbonds;
+
+    i = blockIdx.x;
+
+    start = Start_Index (i, hbonds);
+    end = End_Index (i, hbonds);
+    pj = start + threadIdx.x;
+
+    rvec_MakeZero (__f[threadIdx.x]);
+
+    while (pj < end)
+    {
+        nbr_pj = &( hbonds->select.hbond_list[pj] );
+        sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
+
+        rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
+
+        pj += blockDim.x;
+    }
+
+    // make every thread's partial visible before other threads read it
+    __syncthreads ();
+
+    // tree reduction over the whole block; the barrier sits outside the
+    // conditional so that all threads reach it at every step
+    for (offset = blockDim.x >> 1; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset)
+            rvec_Add (__f[threadIdx.x], __f[threadIdx.x + offset]);
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0)
+        rvec_Add (atoms[i].f, __f[0]);
+}
diff --git a/PuReMD-GPU/src/cuda_three_body_interactions.h b/PuReMD-GPU/src/cuda_three_body_interactions.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a87fcfe42852c50ac58a1ef52e8353cf24971a2
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_three_body_interactions.h
@@ -0,0 +1,71 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
+DEVICE void d_Calculate_Theta( rvec, real, rvec, real, real*, real* );
+DEVICE void d_Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
+GLOBAL void k_Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *,
+        global_parameters , control_params *, simulation_data *, static_storage ,
+        list , list , int , int , real *, real *, real *, rvec *);
+GLOBAL void k_Three_Body_Interactions_results( reax_atom *,
+        control_params *, static_storage , list , int );
+GLOBAL void k_Three_Body_Estimate( reax_atom *atoms,
+        control_params *control, list p_bonds, int N, int *count);
+GLOBAL void k_Hydrogen_Bonds( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        control_params *, simulation_data *, static_storage ,
+        list , list , int , int, real *, rvec *, rvec *);
+GLOBAL void k_Hydrogen_Bonds_HB( reax_atom *,
+        single_body_parameters *, hbond_parameters *,
+        control_params *, simulation_data *, static_storage ,
+        list , list , int , int, real *, rvec *, rvec *);
+GLOBAL void k_Hydrogen_Bonds_Postprocess(  reax_atom *,
+        single_body_parameters *,
+        static_storage , list,
+        list , list , int, real * );
+GLOBAL void k_Hydrogen_Bonds_Far_Nbrs(  reax_atom *,
+        single_body_parameters *, static_storage , list, list , list , int );
+GLOBAL void k_Hydrogen_Bonds_HNbrs( reax_atom *, single_body_parameters *,
+        static_storage , list, list , list , int );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.cu b/PuReMD-GPU/src/cuda_two_body_interactions.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d5f6abd64b16cc4582a404566560768eca49b866
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_two_body_interactions.cu
@@ -0,0 +1,1047 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "cuda_two_body_interactions.h"
+#include "bond_orders.h"
+#include "index_utils.h"
+#include "list.h"
+#include "lookup.h"
+#include "vector.h"
+#include "index_utils.h"
+#include "cuda_helpers.h"
+GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, // kernel: one thread per atom i; bond energy is accumulated into E_BE[i]
+        single_body_parameters *sbp, two_body_parameters *tbp, // sbp is indexed by atom type, tbp by index_tbp(type_i, type_j, num_atom_types)
+        simulation_data *data, // only mentioned in commented-out code below
+        static_storage p_workspace, list p_bonds, // by-value copies, accessed through the local pointers workspace / bonds
+        int N, int num_atom_types, real *E_BE) // threads with i >= N return immediately
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    real ebond, pow_BOs_be2, exp_be12, CEbo;
+    real gp3, gp4, gp7, gp10, gp37;
+    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+    real decobdbo, decobdboua, decobdboub;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij;
+    list *bonds;
+    static_storage *workspace;
+    i = blockIdx.x * blockDim.x + threadIdx.x; // flat global thread index, used as the atom index
+    if ( i >= N ) return;
+    bonds = &p_bonds;
+    workspace = &p_workspace;
+    gp3 = g_params.l[3];
+    gp4 = g_params.l[4];
+    gp7 = g_params.l[7];
+    gp10 = g_params.l[10];
+    gp37 = (int) g_params.l[37]; // truncated to an integer value, then stored back in a real
+    //the thread index replaces the serial loop: for( i=0; i < system->N; ++i )
+    start_i = Start_Index(i, bonds);
+    end_i = End_Index(i, bonds);
+    //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
+    for( pj = start_i; pj < end_i; ++pj )
+    {
+        //TODO: a bond entry is processed only when i is smaller than the neighbor index
+        //if( i < bonds->select.bond_list[pj].nbr ) 
+        if( i < bonds->select.bond_list[pj].nbr ) 
+        {
+            //TODO
+            /* set the pointers */
+            j = bonds->select.bond_list[pj].nbr;
+            type_i = atoms[i].type;
+            type_j = atoms[j].type;
+            sbp_i = &( sbp[type_i] );
+            sbp_j = &( sbp[type_j] );
+            twbp = &( tbp[ index_tbp(type_i,type_j,num_atom_types) ] );
+            bo_ij = &( bonds->select.bond_list[pj].bo_data );
+            /* calculate the constants */
+            pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
+            exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+            CEbo = -twbp->De_s * exp_be12 * 
+                ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+            /* calculate the Bond Energy */
+            ebond = 
+                -twbp->De_s * bo_ij->BO_s * exp_be12 
+                -twbp->De_p * bo_ij->BO_pi 
+                -twbp->De_pp * bo_ij->BO_pi2;
+            //MYATOMICADD(&data->E_BE, ebond);
+            //TODO
+            //E_BE [ i ] += ebond/2.0;
+            E_BE [ i ] += ebond; // the full bond energy is credited to atom i (per-atom slot, no atomics)
+            //data->E_BE += ebond;
+            /* calculate derivatives of Bond Orders */
+            bo_ij->Cdbo += CEbo;
+            bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+            bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+            //TODO
+            //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
+            //     workspace->orig_id[i], workspace->orig_id[j], 
+            // i+1, j+1, 
+            //     bo_ij->BO, ebond/*, data->E_BE*/ );
+            /*
+               fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
+               workspace->orig_id[i], workspace->orig_id[j], 
+               CEbo, -twbp->De_p, -twbp->De_pp );*/
+            //TODO
+            /*
+               Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
+               Add_dBOpinpi2( system, lists, i, pj, 
+               -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
+               workspace->f_be, workspace->f_be );
+             */
+            //TODO
+            /* Stabilisation terminal triple bond */
+            if( bo_ij->BO >= 1.00 ) {
+                if( gp37 == 2 ||
+                        (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || // NOTE(review): exact == on masses; 12.0 / 15.999 presumably select a C-O pair -- confirm
+                        (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                    // ba = SQR(bo_ij->BO - 2.50);
+                    exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                    //oboa=abo(j1)-boa;
+                    //obob=abo(j2)-boa;
+                    exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
+                    exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                    //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
+                    exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                    hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+                    estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                    //estrain(j1) = estrain(j1) + 0.50*estriph;
+                    //estrain(j2) = estrain(j2) + 0.50*estriph;
+                    //PERFORMANCE IMPACT
+                    //MYATOMICADD(&data->E_BE, estriph);
+                    E_BE [ i] += estriph; // stabilisation energy goes into the same per-atom slot as ebond
+                    //data->E_BE += estriph;
+                    decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
+                        ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+                    decobdboua = -gp10 * exphu * hulpov * 
+                        (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                    decobdboub = -gp10 * exphu * hulpov * 
+                        (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                    bo_ij->Cdbo += decobdbo;
+                    //PERFORMANCE IMPACT
+                    workspace->CdDelta[i] += decobdboua; // only atom i is updated; decobdboub is computed but its j-side update is commented out below
+                    //MYATOMICADD(&workspace->CdDelta[j], decobdboub);
+                    //CdDelta [ i * N + i ] += decobdboua;
+                    //CdDelta [ i * N + j ] += decobdboua;
+                    //workspace->CdDelta [i] += decobdboua;
+                    //workspace->CdDelta [j] += decobdboub;
+                    /*
+                       fprintf( out_control->ebond, 
+                       "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                       workspace->orig_id[i], workspace->orig_id[j],
+                    //i+1, j+1, 
+                    estriph, decobdbo, decobdboua, decobdboub );
+                     */
+                    /*
+                       Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                       Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                       Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                     */
+                }
+            }
+        }
+    } //TODO commented out the if statement for processing i < j. NOTE(review): looks stale -- the i < nbr test above is active
+    // we process all the bonds and add only half the energy. NOTE(review): looks stale -- the full ebond is added to E_BE[i]
+   GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
+   two_body_parameters *tbp,
+   global_parameters g_p,
+   control_params *control, 
+   simulation_data *data,  
+   list p_far_nbrs, 
+   real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+   int num_atom_types, int N )
+   {
+   int  i, j, pj;
+   int  start_i, end_i;
+   real self_coef;
+   real p_vdW1, p_vdW1i;
+   real powr_vdW1, powgi_vdW1;
+   real tmp, r_ij, fn13, exp1, exp2;
+   real Tap, dTap, dfn13, CEvd, CEclmb;
+   real dr3gamij_1, dr3gamij_3;
+   real e_ele, e_vdW, e_core, de_core;
+   rvec temp, ext_press;
+// rtensor temp_rtensor, total_rtensor;
+two_body_parameters *twbp;
+far_neighbor_data *nbr_pj;
+list *far_nbrs = &p_far_nbrs;
+i = blockIdx.x * blockDim.x + threadIdx.x;
+if ( i >= N ) return;
+p_vdW1 = g_p.l[28];
+p_vdW1i = 1.0 / p_vdW1;
+e_ele = 0;
+e_vdW = 0;
+e_core = 0;
+de_core = 0;
+//for( i = 0; i < system->N; ++i ) {
+start_i = Start_Index(i, far_nbrs);
+end_i   = End_Index(i, far_nbrs);
+// fprintf( stderr, "i: %d, start: %d, end: %d\n",
+//     i, start_i, end_i );
+for( pj = start_i; pj < end_i; ++pj )
+if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+j = nbr_pj->nbr;
+r_ij = nbr_pj->d;
+twbp = &(tbp[ index_tbp(atoms[i].type, atoms[j].type, num_atom_types) ]);
+self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+//if (i <= j) continue;
+// Calculate Taper and its derivative 
+// Tap = nbr_pj->Tap;   -- precomputed during compte_H
+Tap = control->Tap7 * r_ij + control->Tap6;
+Tap = Tap * r_ij + control->Tap5;
+Tap = Tap * r_ij + control->Tap4;
+Tap = Tap * r_ij + control->Tap3;
+Tap = Tap * r_ij + control->Tap2;
+Tap = Tap * r_ij + control->Tap1;
+Tap = Tap * r_ij + control->Tap0;
+dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+dTap = dTap * r_ij + 5*control->Tap5;
+dTap = dTap * r_ij + 4*control->Tap4;
+dTap = dTap * r_ij + 3*control->Tap3;
+dTap = dTap * r_ij + 2*control->Tap2;
+dTap += control->Tap1/r_ij;
+//vdWaals Calculations
+if(g_p.vdw_type==1 || g_p.vdw_type==3) {
+    // shielding
+    powr_vdW1 = POW(r_ij, p_vdW1);
+    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    E_vdW [i] += e_vdW / 2.0;
+    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+        POW(r_ij, p_vdW1 - 2.0);
+    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+            (exp1 - exp2) * dfn13 );
+else{ // no shielding
+    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    E_vdW [i] += e_vdW / 2.0;
+    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+            (exp1 - exp2) );
+if(g_p.vdw_type==2 || g_p.vdw_type==3) {
+    // innner wall
+    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+    e_vdW = self_coef * Tap * e_core;
+    //TODO check this
+    E_vdW [i] += e_vdW / 2.0;
+    //TODO check this
+    de_core = -(twbp->acore/twbp->rcore) * e_core;
+    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+//Coulomb Calculations
+dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+tmp = Tap / dr3gamij_3;
+//tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
+e_ele = 
+self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
+E_Ele [i] += e_ele / 2.0;
+CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
+( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+//CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
+// ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
+if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+    if (i >= j)
+        rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
+    else
+        rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
+else { // NPT, iNPT or sNPT
+    // for pressure coupling, terms not related to bond order 
+    //  derivatives are added directly into pressure vector/tensor 
+    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+    if ( i >= j)
+        rvec_ScaledAdd( atoms[i].f, -1., temp );
+    else
+        rvec_Add( atoms[i].f, temp );
+    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+    //rvec_Add( data->ext_press, ext_press );
+    rvec_Copy (aux_ext_press[i], ext_press);
+    //TODO CHECK THIS calculation here, it should be divided by two somehow.
+/* Analytic (non-tabulated) van der Waals + Coulomb kernel.
+ *
+ * Work layout: VDW_THREADS_PER_ATOM consecutive threads ("lanes") cooperate
+ * on one atom i.  Lane l visits far-neighbor entries start_i + l,
+ * start_i + l + VDW_THREADS_PER_ATOM, ...  Each thread keeps partial energies
+ * and a partial force in dynamic shared memory; a tree reduction over the
+ * lanes of an atom follows, and lane 0 adds the totals to E_vdW[i], E_Ele[i]
+ * and atoms[i].f.
+ *
+ * Dynamic shared memory: blockDim.x * (2 * sizeof(real) + sizeof(rvec))
+ * bytes, laid out as [vdW partials | Coulomb partials | force partials].
+ *
+ * Preconditions: VDW_THREADS_PER_ATOM is a power of two (laneid is taken
+ * with a bit mask) and blockDim.x is a multiple of it, so the lanes of one
+ * atom never straddle two blocks.
+ *
+ * Every pair energy is halved before it is accumulated.
+ * NOTE(review): presumably because each pair shows up in both atoms'
+ * neighbor lists -- confirm against the neighbor-list construction.
+ *
+ * The parameter `data` is not read by this kernel. */
+GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
+        two_body_parameters *tbp,
+        global_parameters g_p,
+        control_params *control, 
+        simulation_data *data,  
+        list p_far_nbrs, 
+        real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+        int num_atom_types, int N )
+    // one dynamic shared buffer; it is carved into three arrays below
+    extern __shared__ real _vdw[];
+    real *sh_vdw;
+    real *sh_ele;
+    rvec *sh_force;
+    int  i, j, pj, offset;
+    int  start_i, end_i;
+    real self_coef;
+    real p_vdW1, p_vdW1i;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, r_ij, fn13, exp1, exp2;
+    real Tap, dTap, dfn13, CEvd, CEclmb;
+    real dr3gamij_1, dr3gamij_3;
+    real e_ele, e_vdW, e_core, de_core;
+    rvec temp, ext_press;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs = &p_far_nbrs;
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warpid = thread_id / VDW_THREADS_PER_ATOM;       // atom handled by this thread
+    int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);  // position within the atom's lane group
+    i = warpid;
+    sh_vdw = _vdw;
+    sh_ele = _vdw + blockDim.x;
+    sh_force = (rvec *)( _vdw + 2*blockDim.x);
+    // every thread clears its slots, including lanes mapped to i >= N,
+    // because the reduction below reads the slots of all lanes
+    sh_vdw[threadIdx.x] = 0.0; 
+    sh_ele[threadIdx.x] = 0.0; 
+    rvec_MakeZero ( sh_force [threadIdx.x] );
+    if (i < N)
+    {
+        p_vdW1 = g_p.l[28];
+        p_vdW1i = 1.0 / p_vdW1;
+        e_ele = 0;
+        e_vdW = 0;
+        e_core = 0;
+        de_core = 0;
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        pj = start_i + laneid;
+        while (pj < end_i)
+        {
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
+                r_ij = nbr_pj->d;
+                twbp = &(tbp[ index_tbp(atoms[i].type, atoms[j].type, num_atom_types) ]);
+                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+                // Taper polynomial of degree 7 in r_ij (Horner form) ...
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;
+                // ... and its derivative divided by r_ij
+                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+                dTap = dTap * r_ij + 5*control->Tap5;
+                dTap = dTap * r_ij + 4*control->Tap4;
+                dTap = dTap * r_ij + 3*control->Tap3;
+                dTap = dTap * r_ij + 2*control->Tap2;
+                dTap += control->Tap1/r_ij;
+                //vdWaals Calculations
+                if(g_p.vdw_type==1 || g_p.vdw_type==3) {
+                    // shielding
+                    powr_vdW1 = POW(r_ij, p_vdW1);
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+                        POW(r_ij, p_vdW1 - 2.0);
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) * dfn13 );
+                }
+                else{ // no shielding
+                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) );
+                }
+                if(g_p.vdw_type==2 || g_p.vdw_type==3) {
+                    // inner wall
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_vdW = self_coef * Tap * e_core;
+                    //TODO check this
+                    sh_vdw [threadIdx.x] += e_vdW / 2.0;
+                    //TODO check this
+                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+                }
+                //Coulomb Calculations
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+                tmp = Tap / dr3gamij_3;
+                e_ele = 
+                    self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
+                sh_ele [threadIdx.x] += e_ele / 2.0;
+                CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
+                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                // forces go to the per-thread shared slot; the sign depends
+                // on the order of i and j
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    if (i >= j){
+                        rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec );
+                    }
+                    else
+                    {
+                        rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec );
+                    }
+                }
+                else { // NPT, iNPT or sNPT
+                    // for pressure coupling, terms not related to bond order 
+                    //  derivatives are added directly into pressure vector/tensor 
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    if ( i >= j)
+                    {
+                        rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp );
+                    }
+                    else
+                    {
+                        rvec_Add( sh_force[threadIdx.x], temp );
+                    }
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    // NOTE(review): every lane and every neighbor overwrites the
+                    // same aux_ext_press[i] slot, so only one contribution
+                    // survives.  Left unchanged here.
+                    rvec_Copy (aux_ext_press[i], ext_press);
+                    //TODO CHECK THIS calculation here, it should be divided by two somehow.
+                }
+            } // if condition for far neighbors
+            pj += VDW_THREADS_PER_ATOM;
+        } // end of while loop for pj < end_i condition
+    } // if (i < N ) condition
+    // Tree reduction over the lanes of each atom.  The barriers sit outside
+    // the lane test, so every thread of the block reaches them.
+    __syncthreads ();
+    for (offset = VDW_THREADS_PER_ATOM / 2; offset > 0; offset >>= 1)
+    {
+        if (laneid < offset) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + offset];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + offset];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + offset] );
+        }
+        __syncthreads ();
+    }
+    // lanes mapped to i >= N hold only zeros and must not touch the
+    // per-atom output arrays (out-of-bounds otherwise)
+    if (laneid == 0 && i < N) {
+        E_vdW [i] += sh_vdw[threadIdx.x];
+        E_Ele [i] += sh_ele[threadIdx.x];
+        rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
+    }
+/* Tabulated van der Waals + Coulomb kernel: energies and force coefficients
+ * are evaluated from the cubic polynomials stored in the lookup tables d_LR.
+ *
+ * Work layout: VDW_THREADS_PER_ATOM consecutive threads ("lanes") cooperate
+ * on one atom i.  Lane l visits far-neighbor entries start_i + l,
+ * start_i + l + VDW_THREADS_PER_ATOM, ...  Each thread keeps partial energies
+ * and a partial force in dynamic shared memory; a tree reduction over the
+ * lanes of an atom follows, and lane 0 adds the totals to E_vdW[i], E_Ele[i]
+ * and atoms[i].f.
+ *
+ * Dynamic shared memory: blockDim.x * (2 * sizeof(real) + sizeof(rvec))
+ * bytes, laid out as [vdW partials | Coulomb partials | force partials].
+ *
+ * Preconditions: VDW_THREADS_PER_ATOM is a power of two (laneid is taken
+ * with a bit mask) and blockDim.x is a multiple of it.
+ *
+ * Energies are accumulated only when energy_update_freq > 0 and
+ * (data->step - data->prev_steps) is a multiple of it; forces are always
+ * accumulated.  Every pair energy is halved before accumulation. */
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy(reax_atom *atoms, 
+        control_params *control,
+        simulation_data *data, 
+        list p_far_nbrs, 
+        real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+        LR_lookup_table *d_LR,
+        int num_atom_types,
+        int energy_update_freq,
+        int N  )
+    // one dynamic shared buffer; it is carved into three arrays below
+    extern __shared__ real _vdw[];
+    real *sh_vdw;
+    real *sh_ele;
+    rvec *sh_force;
+    int i, j, pj, r, steps, update_freq, update_energies, offset;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real e_vdW, e_ele;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    list *far_nbrs = &p_far_nbrs;
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warpid = thread_id / VDW_THREADS_PER_ATOM;       // atom handled by this thread
+    int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);  // position within the atom's lane group
+    i = warpid;
+    sh_vdw = _vdw;
+    sh_ele = _vdw + blockDim.x;
+    sh_force = (rvec *)( _vdw + 2*blockDim.x);
+    // every thread clears its slots, including lanes mapped to i >= N,
+    // because the reduction below reads the slots of all lanes
+    sh_vdw[threadIdx.x] = 0.0; 
+    sh_ele[threadIdx.x] = 0.0; 
+    rvec_MakeZero ( sh_force [threadIdx.x] );
+    if ( i < N ) 
+    {
+        // local copy of the two fields of atom i that the loop reads
+        reax_atom local_atom ;
+        local_atom.q =  atoms[i].q;
+        local_atom.type = atoms[i].type;
+        steps = data->step - data->prev_steps;
+        update_freq = energy_update_freq;
+        update_energies = update_freq > 0 && steps % update_freq == 0;
+        type_i  = local_atom.type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i   = End_Index(i,far_nbrs);
+        pj = start_i + laneid;
+        while (pj < end_i)
+        {
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
+            {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j      = nbr_pj->nbr;
+                type_j = atoms[j].type;
+                r_ij   = nbr_pj->d;
+                self_coef = (i == j) ? 0.5 : 1.0;
+                // the table is looked up with the ordered type pair (min, max)
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
+                /* Cubic Spline Interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r;
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+                if(( update_energies )) 
+                {
+                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
+                        t->vdW[r].a;
+                    e_vdW *= self_coef;
+                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a;
+                    e_ele *= self_coef * local_atom.q * atoms[j].q;
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+                    sh_ele [threadIdx.x] += e_ele/2.0;
+                }    
+                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
+                    t->CEvd[r].a;
+                CEvd *= self_coef;
+                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
+                    t->CEclmb[r].a;
+                CEclmb *= self_coef * local_atom.q * atoms[j].q;
+                // forces go to the per-thread shared slot; the sign depends
+                // on the order of i and j
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    if ( i >= j)
+                        rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
+                    else 
+                        rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
+                }
+                else { // NPT, iNPT or sNPT
+                    // for pressure coupling, terms not related to bond order 
+                    //  derivatives are added directly into pressure vector/tensor
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    // Accumulate in shared memory as in the branch above.
+                    // Updating atoms[i].f directly from all lanes of the atom
+                    // would be an unsynchronised read-modify-write.
+                    if (i >= j)
+                        rvec_ScaledAdd( sh_force [threadIdx.x], -1., temp );
+                    else
+                        rvec_Add( sh_force [threadIdx.x], temp );
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    // NOTE(review): every lane and every neighbor overwrites the
+                    // same aux_ext_press[i] slot, so only one contribution
+                    // survives.  Left unchanged here.
+                    rvec_Copy (aux_ext_press [i], ext_press );
+                    //TODO CHECK THIS
+                }
+            }
+            pj += VDW_THREADS_PER_ATOM;
+        }
+    }// if i < n condition
+    // Tree reduction over the lanes of each atom.  The barriers sit outside
+    // the lane test, so every thread of the block reaches them.
+    __syncthreads ();
+    for (offset = VDW_THREADS_PER_ATOM / 2; offset > 0; offset >>= 1)
+    {
+        if (laneid < offset) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + offset];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + offset];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + offset] );
+        }
+        __syncthreads ();
+    }
+    // lanes mapped to i >= N hold only zeros and must not touch the
+    // per-atom output arrays (out-of-bounds otherwise)
+    if (laneid == 0 && i < N) {
+        E_vdW [i] += sh_vdw[threadIdx.x];
+        E_Ele [i] += sh_ele[threadIdx.x];
+        rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
+    }
+/* Tabulated vdW/Coulomb energy kernel: thread block blockIdx.x handles atom
+ * i = blockIdx.x and its threads stride over i's far-neighbor entries.
+ *
+ * Each thread adds half of every interpolated pair energy to its own
+ * shared-memory slot; the slots are then tree-reduced and thread 0 adds the
+ * block totals to E_vdW[i] and E_Ele[i].  Energies are accumulated only when
+ * energy_update_freq > 0 and (data->step - data->prev_steps) is a multiple
+ * of it.
+ *
+ * Launch requirements, as read from the indexing below:
+ *   - dynamic shared memory of 2 * blockDim.x * sizeof(real) bytes
+ *   - blockDim.x a power of two (the reduction used to be hard-coded for 32)
+ *   - i is never compared against N, so the grid presumably has exactly one
+ *     block per atom -- TODO confirm at the launch site
+ * aux_ext_press and N are not used by this kernel. */
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1(reax_atom *atoms,
+        control_params *control,
+        simulation_data *data,
+        list p_far_nbrs,
+        real *E_vdW, real *E_Ele, rvec *aux_ext_press,
+        LR_lookup_table *d_LR,
+        int num_atom_types,
+        int energy_update_freq,
+        int N )
+    extern __shared__ real _vdw[];
+    real *sh_vdw;
+    real *sh_ele;
+    int i, j, pj, r, s, steps, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real e_vdW, e_ele;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    list *far_nbrs = &p_far_nbrs;
+    i = blockIdx.x;
+    reax_atom local_atom;
+    local_atom.q =  atoms[i].q;
+    local_atom.type = atoms[i].type;
+    /* first blockDim.x reals hold vdW partials, the next blockDim.x Coulomb partials */
+    sh_vdw = _vdw;
+    sh_ele = _vdw + blockDim.x;
+    sh_vdw[threadIdx.x] = 0.0;
+    sh_ele[threadIdx.x] = 0.0;
+    steps = data->step - data->prev_steps;
+    update_energies = energy_update_freq > 0 && steps % energy_update_freq == 0;
+    type_i  = local_atom.type;
+    start_i = Start_Index(i,far_nbrs);
+    end_i   = End_Index(i,far_nbrs);
+    for ( pj = start_i + threadIdx.x; pj < end_i; pj += blockDim.x )
+    {
+        nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+        /* nothing is written unless energies are due, so skip the table lookup too */
+        if ( update_energies && nbr_pj->d <= control->r_cut )
+        {
+            j      = nbr_pj->nbr;
+            type_j = atoms[j].type;
+            r_ij   = nbr_pj->d;
+            self_coef = (i == j) ? 0.5 : 1.0;
+            tmin  = MIN( type_i, type_j );
+            tmax  = MAX( type_i, type_j );
+            t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] );
+            /* Cubic Spline Interpolation */
+            r = (int)(r_ij * t->inv_dx);
+            if( r == 0 )  ++r;
+            base = (real)(r+1) * t->dx;
+            dif = r_ij - base;
+            e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif +
+                t->vdW[r].a;
+            e_vdW *= self_coef;
+            e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif +
+                t->ele[r].a;
+            e_ele *= self_coef * local_atom.q * atoms[j].q;
+            sh_vdw [threadIdx.x] += e_vdW/2.0;
+            sh_ele [threadIdx.x] += e_ele/2.0;
+        }
+    }
+    /* Block-wide tree reduction of the partial energies.  The barriers are
+     * required: every step reads a slot written by another thread, and warp
+     * lanes are not guaranteed to run in lockstep. */
+    __syncthreads( );
+    for ( s = blockDim.x / 2; s > 0; s >>= 1 )
+    {
+        if ( threadIdx.x < s )
+        {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + s];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + s];
+        }
+        __syncthreads( );
+    }
+    if (threadIdx.x == 0) {
+        E_vdW [i] += sh_vdw[0];
+        E_Ele [i] += sh_ele[0];
+    }
+/* Tabulated vdW/Coulomb force kernel: thread block blockIdx.x handles atom
+ * i = blockIdx.x and its threads stride over i's far-neighbor entries.
+ *
+ * Each thread accumulates the interpolated pair force into its own
+ * shared-memory rvec; the slots are tree-reduced and thread 0 adds the block
+ * total to atoms[i].f.
+ *
+ * Launch requirements, as read from the indexing below:
+ *   - dynamic shared memory of blockDim.x * sizeof(rvec) bytes
+ *   - blockDim.x a power of two (the reduction used to be hard-coded for 32)
+ *   - i is never compared against N, so the grid presumably has exactly one
+ *     block per atom -- TODO confirm at the launch site
+ * data, E_vdW, E_Ele, energy_update_freq and N are not used by this kernel. */
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2(reax_atom *atoms,
+        control_params *control,
+        simulation_data *data,
+        list p_far_nbrs,
+        real *E_vdW, real *E_Ele, rvec *aux_ext_press,
+        LR_lookup_table *d_LR,
+        int num_atom_types,
+        int energy_update_freq,
+        int N )
+    extern __shared__ rvec _force [];
+    rvec *sh_force;
+    int i, j, pj, r, s;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    list *far_nbrs = &p_far_nbrs;
+    i = blockIdx.x;
+    reax_atom local_atom;
+    local_atom.q =  atoms[i].q;
+    local_atom.type = atoms[i].type;
+    sh_force = _force;
+    rvec_MakeZero ( sh_force [threadIdx.x] );
+    type_i  = local_atom.type;
+    start_i = Start_Index(i,far_nbrs);
+    end_i   = End_Index(i,far_nbrs);
+    for ( pj = start_i + threadIdx.x; pj < end_i; pj += blockDim.x )
+    {
+        nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+        if( nbr_pj->d <= control->r_cut )
+        {
+            j      = nbr_pj->nbr;
+            type_j = atoms[j].type;
+            r_ij   = nbr_pj->d;
+            self_coef = (i == j) ? 0.5 : 1.0;
+            tmin  = MIN( type_i, type_j );
+            tmax  = MAX( type_i, type_j );
+            t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] );
+            /* Cubic Spline Interpolation */
+            r = (int)(r_ij * t->inv_dx);
+            if( r == 0 )  ++r;
+            base = (real)(r+1) * t->dx;
+            dif = r_ij - base;
+            CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif +
+                t->CEvd[r].a;
+            CEvd *= self_coef;
+            CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif +
+                t->CEclmb[r].a;
+            CEclmb *= self_coef * local_atom.q * atoms[j].q;
+            /* The pair force is the same in every ensemble.  It always goes
+             * through this thread's private shared slot: all threads of the
+             * block share the same i, so updating atoms[i].f directly here
+             * (as the NPT path used to) is a data race. */
+            if ( i >= j )
+                rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
+            else
+                rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
+            if( !(control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) ) {
+                // NPT, iNPT or sNPT
+                // for pressure coupling, terms not related to bond order
+                //  derivatives are added directly into pressure vector/tensor
+                rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                // NOTE(review): every thread overwrites aux_ext_press[i] with its
+                // latest pair term, so only one contribution survives; this looks
+                // like it should be a per-block sum -- confirm against the caller
+                rvec_Copy (aux_ext_press [i], ext_press );
+            }
+        }
+    }
+    /* Block-wide tree reduction of the partial forces.  The barriers are
+     * required: every step reads a slot written by another thread, and warp
+     * lanes are not guaranteed to run in lockstep. */
+    __syncthreads( );
+    for ( s = blockDim.x / 2; s > 0; s >>= 1 )
+    {
+        if ( threadIdx.x < s )
+        {
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + s] );
+        }
+        __syncthreads( );
+    }
+    if (threadIdx.x == 0) {
+        rvec_Add (atoms[i].f, sh_force [ 0 ]);
+    }
diff --git a/PuReMD-GPU/src/cuda_two_body_interactions.h b/PuReMD-GPU/src/cuda_two_body_interactions.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe3e273775f67e17f88d742a52707f72bfbac56c
--- /dev/null
+++ b/PuReMD-GPU/src/cuda_two_body_interactions.h
@@ -0,0 +1,172 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "mytypes.h"
+#include "index_utils.h"
+#ifdef __cplusplus
+extern "C"  {
+GLOBAL void Cuda_Bond_Energy( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *,
+        simulation_data *, static_storage , list , int , int, real * );
+GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *, two_body_parameters *,
+        global_parameters , control_params *, simulation_data *, list , real *, real *, rvec *,
+        int , int );
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( reax_atom *, control_params *, simulation_data *,
+        list , real *, real *, rvec *,
+        LR_lookup_table *, int , int , int ) ;
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( reax_atom *, control_params *, simulation_data *,
+        list , real *, real *, rvec *,
+        LR_lookup_table *, int , int , int ) ;
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( reax_atom *, control_params *, simulation_data *,
+        list , real *, real *, rvec *,
+        LR_lookup_table *, int , int , int ) ;
+/* Evaluates the long-range (van der Waals + Coulomb) terms for the type pair
+ * (i, j) at separation r_ij and stores them in *lr:
+ *   lr->e_vdW, lr->CEvd     vdW energy and its derivative coefficient
+ *   lr->e_ele, lr->CEclmb   Coulomb energy and derivative coefficient
+ *                           (charges are not applied here)
+ *   lr->H                   tapered shielded Coulomb kernel scaled by
+ *                           EV_to_KCALpMOL
+ * g_params.vdw_type selects the vdW form: 1 or 3 use the shielded distance,
+ * 2 or 3 add the inner-wall core term.
+ * r_ij must be non-zero (it is used as a divisor). */
+static DEVICE void d_LR_vdW_Coulomb( global_parameters g_params, two_body_parameters *tbp,
+        control_params *control, int i, int j, real r_ij, LR_data *lr, int num_atom_types )
+    real p_vdW1 = g_params.l[28];
+    real p_vdW1i = 1.0 / p_vdW1;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, fn13, exp1, exp2;
+    real Tap, dTap, dfn13;
+    real dr3gamij_1, dr3gamij_3;
+    real e_core, de_core;
+    two_body_parameters *twbp;
+    twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]);
+    e_core = 0;
+    de_core = 0;
+    /* calculate taper and its derivative */
+    /* Tap is the degree-7 polynomial in r_ij with coefficients Tap0..Tap7 (Horner form) */
+    Tap = control->Tap7 * r_ij + control->Tap6;
+    Tap = Tap * r_ij + control->Tap5;
+    Tap = Tap * r_ij + control->Tap4;
+    Tap = Tap * r_ij + control->Tap3;
+    Tap = Tap * r_ij + control->Tap2;
+    Tap = Tap * r_ij + control->Tap1;
+    Tap = Tap * r_ij + control->Tap0;
+    /* dTap ends up as (dTap/dr) / r_ij: the Horner steps stop one power
+     * short and the Tap1 term is divided by r_ij */
+    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
+    dTap = dTap * r_ij + 5 * control->Tap5;
+    dTap = dTap * r_ij + 4 * control->Tap4;
+    dTap = dTap * r_ij + 3 * control->Tap3;
+    dTap = dTap * r_ij + 2 * control->Tap2;
+    dTap += control->Tap1 / r_ij;
+    /* vdWaals calculations.  The shielded form used to be evaluated a second
+     * time, unconditionally, ahead of this branch; both arms overwrite every
+     * value it produced, so that duplicate has been removed. */
+    if (g_params.vdw_type == 1 || g_params.vdw_type == 3)
+    {
+        // shielding
+        powr_vdW1 = POW(r_ij, p_vdW1);
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
+                POW(r_ij, p_vdW1 - 2.0);
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    }
+    else  // no shielding
+    {
+        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
+                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+    }
+    if (g_params.vdw_type == 2 || g_params.vdw_type == 3)
+    {
+        // inner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
+        lr->e_vdW += Tap * e_core;
+        de_core = -(twbp->acore / twbp->rcore) * e_core;
+        lr->CEvd += dTap * e_core + Tap * de_core;
+    }
+    /* Coulomb calculations */
+    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+    tmp = Tap / dr3gamij_3;
+    lr->H = EV_to_KCALpMOL * tmp;
+    lr->e_ele = C_ele * tmp;
+    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
+       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+       i, system->atoms[i].type, j, system->atoms[j].type,
+       twbp->gamma, Tap, dr3gamij_3,
+       system->atoms[i].q, system->atoms[j].q ); */
+    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
+       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
+       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
+       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu
index 2c632c058e2419fbfc82310bb6f5354f863e2b39..c420db769053e46ed871973200d2de3b2cb7f871 100644
--- a/PuReMD-GPU/src/cuda_utils.cu
+++ b/PuReMD-GPU/src/cuda_utils.cu
@@ -18,120 +18,136 @@
 #include "cuda_utils.h"
-#include "mytypes.h"
-void cuda_malloc (void **ptr, int size, int memset, int err_code) {
+void cuda_malloc( void **ptr, int size, int memset, int err_code ) // cudaMalloc size bytes into *ptr; zero them when memset != 0; exit(err_code) on any failure
     cudaError_t retVal = cudaSuccess;
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
-    retVal = cudaMalloc (ptr, size);
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "Failed to allocate memory on device for the res: %d...  exiting with code: %d size: %d \n", 
-                err_code, retVal, size);
-        exit (err_code);
+    retVal = cudaMalloc( ptr, size );
+    if ( retVal != cudaSuccess )
+    {
+        fprintf( stderr, "Failed to allocate memory on device for the res: %d...  exiting with code: %d size: %d \n", 
+                err_code, retVal, size );
+        exit( err_code ); // process exit status is the caller-supplied err_code, not the CUDA error value
     //fprintf (stderr, "&ptr --. %ld \n", &ptr);
     //fprintf (stderr, "ptr --> %ld \n", ptr );
-    if (memset) {
-        retVal = cudaMemset (*ptr, 0, size);
-        if (retVal != cudaSuccess) {
-            fprintf (stderr, "Failed to memset memory on device... exiting with code %d\n", 
-                    err_code);
-            exit (err_code);
+    if ( memset ) { // here memset is the int parameter acting as a flag, not the libc function
+        retVal = cudaMemset( *ptr, 0, size );
+        if ( retVal != cudaSuccess )
+        {
+            fprintf( stderr, "Failed to memset memory on device... exiting with code %d\n", 
+                    err_code );
+            exit( err_code );
-void cuda_free (void *ptr, int err_code) {
+/* Releases device memory with cudaFree.  A NULL ptr is ignored.  On failure
+ * the resource id (err_code), the CUDA error value and the address are
+ * reported on stderr. */
+void cuda_free( void *ptr, int err_code )
     cudaError_t retVal = cudaSuccess;
     if (!ptr) return;
-    retVal = cudaFree (ptr);
+    retVal = cudaFree( ptr );
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "Failed to release memory on device for res %d... exiting with code %d -- Address %ld\n", 
-                err_code, retVal, ptr);
+    if ( retVal != cudaSuccess )
+    {
+        /* %p is the only conforming conversion for a pointer argument */
+        fprintf( stderr, "Failed to release memory on device for res %d... exiting with code %d -- Address %p\n",
+                err_code, retVal, ptr );
-void cuda_memset (void *ptr, int data, size_t count, int err_code){
+/* Fills count bytes at device address ptr with the byte value data using
+ * cudaMemset.  On failure prints diagnostics and terminates the process with
+ * exit(err_code). */
+void cuda_memset( void *ptr, int data, size_t count, int err_code )
     cudaError_t retVal = cudaSuccess;
-    retVal = cudaMemset (ptr, data, count);
+    retVal = cudaMemset( ptr, data, count );
     if (retVal != cudaSuccess) {
-        fprintf (stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr);
-        fprintf (stderr, " size to memset: %d \n", count);
-        fprintf (stderr, " target data is : %d \n", data);
-        fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d\n", 
-                err_code, retVal);
-        exit (err_code);
+        /* pointers are printed with %p and the size_t count with %zu; the
+         * previous %ld / %d conversions did not match the argument types */
+        fprintf( stderr, "ptr passed is %p, value: %p \n", ptr, (void *) &ptr );
+        fprintf( stderr, " size to memset: %zu \n", count );
+        fprintf( stderr, " target data is : %d \n", data );
+        fprintf( stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d\n",
+                err_code, retVal );
+        exit( err_code );
-void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid)
+void copy_host_device( void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid ) // blocking cudaMemcpy of size bytes between host and dev; exit(resid) on failure
-    cudaError_t    retVal = cudaErrorNotReady;
+    cudaError_t retVal = cudaErrorNotReady;
-    if (dir == cudaMemcpyHostToDevice)
-        retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
+    if ( dir == cudaMemcpyHostToDevice )
+    {
+        retVal = cudaMemcpy( dev, host, size, cudaMemcpyHostToDevice );
+    }
-        retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
+    { // NOTE(review): presumably the else arm (its keyword line is not visible in this excerpt), i.e. every other dir copies device -> host
+        retVal = cudaMemcpy( host, dev, size, cudaMemcpyDeviceToHost );
+    }
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "could not copy resource %d from host to device: reason %d \n",
-                resid, retVal);
-        exit (resid);
+    if ( retVal != cudaSuccess ) { // NOTE(review): the message below says "from host to device" for both directions
+        fprintf( stderr, "could not copy resource %d from host to device: reason %d \n",
+                resid, retVal );
+        exit( resid );
-void copy_device (void *dest, void *src, int size, int resid)
+/* Copies size bytes from device address src to device address dest with a
+ * blocking cudaMemcpy.  On failure reports resid and the CUDA error value,
+ * then terminates the process with exit(resid). */
+void copy_device( void *dest, void *src, int size, int resid )
-    cudaError_t    retVal = cudaErrorNotReady;
+    cudaError_t retVal = cudaErrorNotReady;
-    retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
-    if (retVal != cudaSuccess) {
-        fprintf (stderr, "could not copy resource %d from host to device: reason %d \n",
-                resid, retVal);
-        exit (resid);
+    retVal = cudaMemcpy( dest, src, size, cudaMemcpyDeviceToDevice );
+    if ( retVal != cudaSuccess )
+    {
+        /* this is a device-to-device copy; the message used to claim host-to-device */
+        fprintf( stderr, "could not copy resource %d from device to device: reason %d \n",
+                resid, retVal );
+        exit( resid );
-void compute_blocks ( int *blocks, int *block_size, int count )
+void compute_blocks( int *blocks, int *block_size, int count ) // *block_size = CUDA_BLOCK_SIZE; *blocks = count / CUDA_BLOCK_SIZE rounded up
     *block_size = CUDA_BLOCK_SIZE;
     *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1);
-void compute_nearest_pow_2 (int blocks, int *result)
+void compute_nearest_pow_2( int blocks, int *result ) // *result = smallest power of two that is >= blocks (1 when blocks <= 1)
     int power = 1;
-    while (power < blocks) power *= 2;
+    while (power < blocks)
+    {
+        power *= 2; // int doubling: no overflow guard for very large blocks
+    }
     *result = power;
-void print_device_mem_usage ()
+/* Queries cudaMemGetInfo and prints total and free device memory on stderr,
+ * each as a raw byte count followed by the value divided by 1024*1024 and
+ * by 1024*1024*1024. */
+void print_device_mem_usage( )
     size_t total, free;
-    cudaMemGetInfo (&free, &total);
-    if (cudaGetLastError () != cudaSuccess )
+    cudaMemGetInfo( &free, &total );
+    if ( cudaGetLastError() != cudaSuccess )
-        fprintf (stderr, "Error on the memory call \n");
+        fprintf( stderr, "Error on the memory call \n" );
-    fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", 
+    /* all six arguments are size_t, so they are printed with %zu */
+    fprintf( stderr, "Total %zu Mb %zu gig %zu , free %zu, Mb %zu , gig %zu \n",
             total, total/(1024*1024), total/ (1024*1024*1024), 
             free, free/(1024*1024), free/ (1024*1024*1024) );
diff --git a/PuReMD-GPU/src/cuda_utils.h b/PuReMD-GPU/src/cuda_utils.h
index ba793e4048c7367438a88b20b758b3090200f4b4..c8976d081bbead0048faa7dd12238c40f2eda5d7 100644
--- a/PuReMD-GPU/src/cuda_utils.h
+++ b/PuReMD-GPU/src/cuda_utils.h
@@ -21,35 +21,39 @@
 #ifndef __CUDA_UTILS_H_
 #define __CUDA_UTILS_H_
-#include "cuda.h"
-#include "cublas_v2.h"
-#include "cusparse_v2.h"
-#include "stdlib.h"
-#include "stdio.h"
+#include "mytypes.h"
+#include <stdlib.h>
+#include <stdio.h>
 #define IDX2C(i,j,ld) (((j)*(ld))+(i))
-static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta)
+#ifdef __cplusplus
+extern "C"  {
+static __inline__ void modify( cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta ) // two in-place cublasSscal calls starting at element IDX2C(p, q, ldm) of m
-    cublasSscal (handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
-    cublasSscal (handle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1);
+    cublasSscal( handle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm ); // scale n - p elements by alpha, stepping ldm floats between elements
+    cublasSscal( handle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1 ); // scale ldm - p contiguous elements by beta
-void cuda_malloc (void **, int , int , int);
-void cuda_free (void *, int);
-void cuda_memset (void *, int , size_t , int );
-void copy_host_device (void *, void *, int , enum cudaMemcpyKind, int);
-void copy_device (void *, void *, int , int );
+void cuda_malloc( void **, int , int , int );
+void cuda_free( void *, int );
+void cuda_memset( void *, int , size_t , int );
+void copy_host_device( void *, void *, int , enum cudaMemcpyKind, int );
+void copy_device( void *, void *, int , int );
-void compute_blocks (int *, int *, int);
-void compute_nearest_pow_2 (int blocks, int *result);
+void compute_blocks( int *, int *, int );
+void compute_nearest_pow_2( int blocks, int *result );
-void print_device_mem_usage ();
+void print_device_mem_usage( );
 #define cusparseCheckError(cusparseStatus) __cusparseCheckError (cusparseStatus, __FILE__, __LINE__)
-inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line )
+/* Terminates the process with exit(-1) when cusparseStatus is not
+ * CUSPARSE_STATUS_SUCCESS, after reporting the call site (file, line) that
+ * the cusparseCheckError macro captured. */
+static inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *file, const int line )
-    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS)
+    if ( cusparseStatus != CUSPARSE_STATUS_SUCCESS )
-        fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cusparseStatus);
+        /* report the caller's location: __FILE__/__LINE__ written here would
+         * always name this header instead */
+        fprintf( stderr, "failed .. %s:%d -- error code %d \n", file, line, cusparseStatus );
         exit (-1);
@@ -59,35 +63,42 @@ inline void __cusparseCheckError( cusparseStatus_t cusparseStatus, const char *f
 #define cublasCheckError(cublasStatus) __cublasCheckError (cublasStatus, __FILE__, __LINE__)
-inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line )
+/* Terminates the process with exit(-1) when cublasStatus is not
+ * CUBLAS_STATUS_SUCCESS, after reporting the call site (file, line) that
+ * the cublasCheckError macro captured. */
+static inline void __cublasCheckError( cublasStatus_t cublasStatus, const char *file, const int line )
-    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
+    if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
-        fprintf (stderr, "failed .. %s:%d -- error code %d \n", __FILE__, __LINE__, cublasStatus);
-        exit (-1);
+        /* report the caller's location: __FILE__/__LINE__ written here would
+         * always name this header instead */
+        fprintf( stderr, "failed .. %s:%d -- error code %d \n", file, line, cublasStatus );
+        exit( -1 );
-#define cudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )
-inline void __cudaCheckError( const char *file, const int line )
+#define cudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
+static inline void __cudaCheckError( const char *file, const int line ) // exit(-1) if cudaGetLastError or a following cudaDeviceSynchronize reports an error; file/line come from the macro's call site
-    cudaError err = cudaGetLastError();
+    cudaError err = cudaGetLastError( );
     if ( cudaSuccess != err )
-        fprintf (stderr, "Failed .. %s:%d -- gpu erro code %d\n", file, line, err );
+        fprintf( stderr, "Failed .. %s:%d -- gpu erro code %d\n", file, line, err );
         exit( -1 );
     // More careful checking. However, this will affect performance.
-    // Comment away if needed.
-    err = cudaDeviceSynchronize();
+    err = cudaDeviceSynchronize( ); // blocks the host until the device is idle; NOTE(review): no message is visible before the exit below
     if( cudaSuccess != err )
         exit( -1 );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/forces.c b/PuReMD-GPU/src/forces.c
new file mode 100644
index 0000000000000000000000000000000000000000..c95d4896e32f60e954d79b0b623520afb042e9ea
--- /dev/null
+++ b/PuReMD-GPU/src/forces.c
@@ -0,0 +1,910 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "forces.h"
+#include "box.h"
+#include "bond_orders.h"
+#include "single_body_interactions.h"
+#include "two_body_interactions.h"
+#include "three_body_interactions.h"
+#include "four_body_interactions.h"
+#include "list.h"
+#include "print_utils.h"
+#include "system_props.h"
+#include "QEq.h"
+#include "vector.h"
+#include "index_utils.h"
+void Dummy_Interaction( reax_system *system, control_params *control, // placeholder with the common interaction signature; no statements are visible in this excerpt
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
+void Init_Bonded_Force_Functions( control_params *control ) // fills slots 0..9 of the Interaction_Functions table
+    Interaction_Functions[0] = Calculate_Bond_Orders;
+    Interaction_Functions[1] = Bond_Energy;  //*/Dummy_Interaction;
+    Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy;
+    //*/Dummy_Interaction;
+    Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
+    Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
+    if( control->hb_cut > 0 ) // hydrogen bonds are evaluated only for a positive cutoff
+        Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
+    else Interaction_Functions[5] = Dummy_Interaction;
+    Interaction_Functions[6] = Dummy_Interaction; //empty
+    Interaction_Functions[7] = Dummy_Interaction; //empty
+    Interaction_Functions[8] = Dummy_Interaction; //empty
+    Interaction_Functions[9] = Dummy_Interaction; //empty
+void Compute_Bonded_Forces( reax_system *system, control_params *control, // calls every entry of Interaction_Functions in index order, timing each call
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
+    int i;
+    real t_start, t_elapsed;
+    /* Mark beginning of a new timestep in each energy file */
+    /* NOTE(review): any preprocessor guard around these header writes is not visible in this excerpt */
+    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "bo", "ebond", "total" );
+    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", 
+            data->step, "atom", "nlp", "elp", "total" );
+    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", 
+            data->step, "atom", "eov", "total" );
+    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", 
+            data->step, "atom", "eun", "total" );
+    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
+    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "angle", "bo(12)", "bo(23)", "epen", "total" );
+    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "angle", "bo(12)", "bo(23)", "ecoa", "total" );
+    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "r(23)", "angle", "bo(12)", "ehb", "total" );
+    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", "atom4", 
+            "phi", "bo(23)", "etor", "total" );
+    fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
+            data->step, "atom1", "atom2", "atom3", "atom4", 
+            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
+    /* Implement all the function calls as function pointers */
+    for( i = 0; i < NO_OF_INTERACTIONS; i++ ) {
+        //for( i = 0; i < 5; i++ ) {
+        t_start = Get_Time ();
+        (Interaction_Functions[i])(system, control, data, workspace, 
+                lists, out_control);
+        t_elapsed = Get_Timing_Info ( t_start ); // only read by the debug print below
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed );
+#if defined(DEBUG_FOCUS) /* NOTE(review): the #endif lines closing these two conditionals are not visible in this excerpt */
+        fprintf( stderr, "f%d-", i );
+        (Print_Interactions[i])(system, control, data, workspace, 
+                lists, out_control);
+    }
+void Compute_NonBonded_Forces( reax_system *system, control_params *control, // runs QEq, then the vdW/Coulomb evaluation (analytic or tabulated)
+        simulation_data *data,static_storage *workspace,
+        list** lists, output_controls *out_control )
+    real t_start, t_elapsed;
+    fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
+            data->step, "atom1", "atom2", "r12", "evdw", "total" );
+    fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
+            data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
+    t_start = Get_Time( );
+    QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); // NOTE(review): indexes lists directly, while Compute_Total_Force uses (*lists) + BONDS; the two agree only if FAR_NBRS is 0 -- confirm
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.QEq += t_elapsed; // QEq time is accumulated across calls
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "qeq - " );
+    if ( control->tabulate == 0) // 0 selects the analytic evaluation, anything else the lookup-table version
+        vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
+    else
+        Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, 
+                lists, out_control );
+#if defined(DEBUG_FOCUS) /* NOTE(review): the #endif lines for the DEBUG_FOCUS blocks are not visible in this excerpt */
+    fprintf( stderr, "nonb forces - " );
+    Print_vdW_Coulomb_Forces( system, control, data, workspace, 
+            lists, out_control );
+/* This version of Compute_Total_Force computes forces from coefficients 
+   accumulated by all interaction functions. Saves enormous time & space! */
+void Compute_Total_Force( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists )
+    int i, pj;
+    list *bonds = (*lists) + BONDS;
+    for( i = 0; i < system->N; ++i )
+        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+            if( i < bonds->select.bond_list[pj].nbr ) { // only entries whose neighbor index is larger than i are processed
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
+                    Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
+                else // every other ensemble value takes the NPT variant
+                    Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
+            }
+/* Checks how full the H matrix, the bond list and the hydrogen-bond list are.
+ *
+ * Records the current usage in workspace->realloc (Htop, num_bonds,
+ * num_hbonds), raises the realloc.bonds / realloc.hbonds flags when a
+ * per-atom segment is close to full, and terminates the process with
+ * exit(INSUFFICIENT_SPACE) when a list has already overrun its space.
+ *
+ *   n           number of atoms whose bond segments are checked
+ *   Hmax, Htop  capacity and current fill of the H matrix
+ *   num_bonds, num_hbonds   values stored into workspace->realloc */
+void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
+        int Hmax, int Htop, int num_bonds, int num_hbonds )
+    int i, flag;
+    list *bonds, *hbonds;
+    bonds = *lists + BONDS;
+    hbonds = *lists + HBONDS;
+    /* H matrix */
+    if( Htop > Hmax * DANGER_ZONE ) {
+        workspace->realloc.Htop = Htop;
+        if( Htop > Hmax ) {
+            fprintf( stderr,
+                    "step%d - ran out of space on H matrix: Htop=%d, max = %d",
+                    step, Htop, Hmax );
+            exit(INSUFFICIENT_SPACE);
+        }
+    }
+    /* bond list */
+    flag = -1;
+    workspace->realloc.num_bonds = num_bonds;
+    /* every atom but the last: compare the end of its segment with the start
+     * of the next atom's segment */
+    for( i = 0; i < n-1; ++i )
+        if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) {
+            workspace->realloc.bonds = 1;
+            if( End_Index(i, bonds) > Start_Index(i+1, bonds) )
+                flag = i;
+        }
+    if( flag > -1 ) {
+        fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) );
+        exit(INSUFFICIENT_SPACE);
+    }
+    /* last atom (i is left at the loop's final value): compare with the end
+     * of the whole list */
+    if( End_Index(i, bonds) >= bonds->num_intrs-2 ) {
+        workspace->realloc.bonds = 1;
+        if( End_Index(i, bonds) > bonds->num_intrs ) {
+            /* print i here: flag is always -1 once the check above has passed */
+            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
+                    step, i, End_Index(i,bonds), bonds->num_intrs );
+            exit(INSUFFICIENT_SPACE);
+        }
+    }
+    /* hbonds list */
+    if( workspace->num_H > 0 ) {
+        flag = -1;
+        workspace->realloc.num_hbonds = num_hbonds;
+        for( i = 0; i < workspace->num_H-1; ++i )
+            if( Num_Entries(i, hbonds) >=
+                    (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) {
+                workspace->realloc.hbonds = 1;
+                if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) )
+                    flag = i;
+            }
+        if( flag > -1 ) {
+            fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                    step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) );
+            exit(INSUFFICIENT_SPACE);
+        }
+        /* last hydrogen entry: compare with the end of the whole list */
+        if( Num_Entries(i,hbonds) >=
+                (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) {
+            workspace->realloc.hbonds = 1;
+            if( End_Index(i, hbonds) > hbonds->num_intrs ) {
+                /* print i here: flag is always -1 once the check above has passed */
+                fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
+                        step, i, End_Index(i,hbonds), hbonds->num_intrs );
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+    }
+/* Init_Forces: one pass over the far-neighbor list that fills
+     - the rows of the sparse matrix workspace->H (one entry per neighbor that
+       passes the r_cut test, plus one closing entry per row holding the eta
+       parameter of the row atom),
+     - the hydrogen-bond list (only when control->hb_cut > 0),
+     - the bond list, with uncorrected bond orders and their derivatives, for
+       pairs within control->nbr_cut whose bond order reaches control->bo_cut.
+   It also accumulates workspace->total_bond_order and
+   workspace->dDeltap_self, and ends by calling Validate_Lists.
+   out_control is not referenced in the visible body. */
+void Init_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control ) {
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int ihb, jhb, ihb_top, jhb_top;
+    int flag;
+    real r_ij, r2, self_coef;
+    real dr3gamij_1, dr3gamij_3, Tap;
+    //real val, dif, base;
+    real C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2;   
+    sparse_matrix *H;
+    list *far_nbrs, *bonds, *hbonds;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    //LR_lookup_table *t;
+    reax_atom *atom_i, *atom_j;
+    bond_data *ibond, *jbond;
+    bond_order_data *bo_ij, *bo_ji;
+    far_nbrs = *lists + FAR_NBRS;
+    bonds = *lists + BONDS;
+    hbonds = *lists + HBONDS;
+    H = &workspace->H;
+    Htop = 0;
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = btop_j = 0;
+    // NOTE(review): p_boc1 / p_boc2 are assigned here but not read again in
+    // the visible body of this function
+    p_boc1 = system->reaxprm.gp.l[0];
+    p_boc2 = system->reaxprm.gp.l[1];
+    for( i = 0; i < system->N; ++i ) {
+        atom_i = &(system->atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        // row i of H starts at the current fill position
+        H->start[i] = Htop;
+        // continue after bonds that earlier atoms already appended to slot i
+        btop_i = End_Index( i, bonds );
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        ihb = ihb_top = -1;
+        // ihb is only loaded from the parameters when hb_cut > 0; ihb_top is
+        // only set for atoms with p_hbond == 1
+        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+            ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(system->atoms[j]);
+            flag = 0;
+            // on steps where (step - prev_steps) is a multiple of reneighbor
+            // the stored distance is used as is; on all other steps the
+            // distance and dvec are recomputed and written back
+            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+                if( nbr_pj->d <= control->r_cut)
+                    flag = 1;
+                else flag = 0;
+            }
+            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
+                            nbr_pj->dvec))<=SQR(control->r_cut)){
+                // d temporarily held the squared distance; store the distance
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+            if( flag ){    
+                type_j = system->atoms[j].type;
+                r_ij = nbr_pj->d;
+                sbp_j = &(system->reaxprm.sbp[type_j]);
+                twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
+                // an atom paired with itself gets half weight
+                self_coef = (i == j) ? 0.5 : 1.0;
+                /* H matrix entry */
+                // degree-7 polynomial in r_ij with coefficients Tap0..Tap7,
+                // evaluated by nested multiplication
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;          
+                // (r_ij^3 + gamma) raised to the power 0.33333333333333
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+                H->entries[Htop].j = j;
+                H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+                ++Htop;
+                /* hydrogen bond lists */ 
+                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                        nbr_pj->d <= control->hb_cut ) {
+                    // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                    jhb = sbp_j->p_hbond;
+                    // p_hbond pair (1,2): the entry goes into atom i's slot
+                    // with scl = 1
+                    if( ihb == 1 && jhb == 2 ) {
+                        hbonds->select.hbond_list[ihb_top].nbr = j;
+                        hbonds->select.hbond_list[ihb_top].scl = 1;
+                        hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    }
+                    // p_hbond pair (2,1): the entry goes into atom j's slot
+                    // with scl = -1, and j's end index is advanced right away
+                    else if( ihb == 2 && jhb == 1 ) {
+                        jhb_top = End_Index( workspace->hbond_index[j], hbonds );
+                        hbonds->select.hbond_list[jhb_top].nbr = i;
+                        hbonds->select.hbond_list[jhb_top].scl = -1;
+                        hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
+                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        ++num_hbonds;
+                    }
+                }
+                /* uncorrected bond orders */
+                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                    r2 = SQR(r_ij);
+                    // each of the three contributions is computed only when
+                    // both atoms have a positive radius parameter for it;
+                    // otherwise the term and its exponent are zeroed
+                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else BO_s = C12 = 0.0;
+                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else BO_pi = C34 = 0.0;
+                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                        BO_pi2= EXP( C56 );
+                    }
+                    else BO_pi2 = C56 = 0.0;
+                    /* Initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+                    if( BO >= control->bo_cut ) {
+                        // one record is written for i->j and one for j->i
+                        num_bonds += 2;
+                        /****** bonds i-j and j-i ******/
+                        ibond = &( bonds->select.bond_list[btop_i] );
+                        btop_j = End_Index( j, bonds );
+                        jbond = &(bonds->select.bond_list[btop_j]);
+                        ibond->nbr = j;
+                        jbond->nbr = i;
+                        ibond->d = r_ij;
+                        jbond->d = r_ij;
+                        // the j-side record stores the negated vector / box shift
+                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                        // both records share dbond_index = btop_i; sym_index
+                        // points each record at its mirror
+                        ibond->dbond_index = btop_i;
+                        jbond->dbond_index = btop_i;
+                        ibond->sym_index = btop_j;
+                        jbond->sym_index = btop_i;
+                        ++btop_i;
+                        Set_End_Index( j, btop_j+1, bonds );
+                        bo_ij = &( ibond->bo_data );
+                        bo_ji = &( jbond->bo_data );
+                        bo_ji->BO = bo_ij->BO = BO;
+                        bo_ji->BO_s = bo_ij->BO_s = BO_s;
+                        bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
+                        bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
+                        /* Bond Order page2-3, derivative of total bond order prime */
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi2,
+                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
+                        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
+                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+                        /* Only dBOp wrt. dr_i is stored here, note that 
+                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+                        rvec_Scale( bo_ij->dBOp, 
+                                -(bo_ij->BO_s * Cln_BOp_s + 
+                                    bo_ij->BO_pi * Cln_BOp_pi + 
+                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
+                        rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
+                        rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
+                        // the derivatives above used the unshifted values;
+                        // bo_cut is subtracted only afterwards
+                        bo_ij->BO_s -= control->bo_cut;
+                        bo_ij->BO -= control->bo_cut;
+                        bo_ji->BO_s -= control->bo_cut;
+                        bo_ji->BO -= control->bo_cut;
+                        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
+                        workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
+                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                        /*fprintf( stderr, "%d %d %g %g %g\n",
+                          i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
+                        /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", 
+                          Cln_BOp_s, twbp->p_bo2, C12 );
+                          fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", 
+                          Cln_BOp_pi, twbp->p_bo4, C34 );
+                          fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
+                          Cln_BOp_pi2, twbp->p_bo6, C56 );*/
+                        /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
+                          fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
+                          fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
+                          fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", 
+                          twbp->r_s, twbp->r_p, twbp->r_pp );
+                          fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
+                        /*fprintf( stderr, "\tfactors: %g %g %g\n",
+                          -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + 
+                          bo_ij->BO_pi2 * Cln_BOp_pp),
+                          -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
+                        /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
+                          bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
+                          fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
+                          bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], 
+                          bo_ij->dln_BOp_pi[2] );
+                          fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
+                          bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], 
+                          bo_ij->dln_BOp_pi2[2] );*/
+                        // NOTE(review): same call with the same arguments as
+                        // the one issued right after ++btop_i above
+                        Set_End_Index( j, btop_j+1, bonds );
+                    }
+                }
+            }
+        }
+        // closing entry of row i: column i, value eta of the atom's type
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        ++Htop;
+        Set_End_Index( i, btop_i, bonds );
+        if( ihb == 1 )
+            Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
+        //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
+        //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+    }
+    // mark the end of j list
+    // (i equals system->N here, so this writes the one-past-last row start)
+    H->start[i] = Htop; 
+    /* validate lists - decide if reallocation is required! */
+    Validate_Lists( workspace, lists, 
+            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
+            data->step, Htop, num_bonds, num_hbonds );
+/* Init_Forces_Tab: same list construction as the non-tabulated variant
+   (H matrix rows, hydrogen-bond list, bond list with uncorrected bond
+   orders), except that each off-diagonal H value is read from the lookup
+   table LR by cubic interpolation instead of being computed directly.
+   Ends by calling Validate_Lists. out_control is not referenced in the
+   visible body. */
+void Init_Forces_Tab( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control ) {
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int tmin, tmax, r;
+    int ihb, jhb, ihb_top, jhb_top;
+    int flag;
+    real r_ij, r2, self_coef;
+    real val, dif, base;
+    real C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2;   
+    sparse_matrix *H;
+    list *far_nbrs, *bonds, *hbonds;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    reax_atom *atom_i, *atom_j;
+    bond_data *ibond, *jbond;
+    bond_order_data *bo_ij, *bo_ji;
+    far_nbrs = *lists + FAR_NBRS;
+    bonds = *lists + BONDS;
+    hbonds = *lists + HBONDS;
+    H = &workspace->H;
+    Htop = 0;
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = btop_j = 0;
+    // NOTE(review): p_boc1 / p_boc2 are assigned here but not read again in
+    // the visible body of this function
+    p_boc1 = system->reaxprm.gp.l[0];
+    p_boc2 = system->reaxprm.gp.l[1];
+    for( i = 0; i < system->N; ++i ) {
+        atom_i = &(system->atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        // row i of H starts at the current fill position
+        H->start[i] = Htop;
+        // continue after bonds that earlier atoms already appended to slot i
+        btop_i = End_Index( i, bonds );
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        ihb = ihb_top = -1;
+        // ihb is only loaded from the parameters when hb_cut > 0; ihb_top is
+        // only set for atoms with p_hbond == 1
+        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+            ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(system->atoms[j]);
+            flag = 0;
+            // on steps where (step - prev_steps) is a multiple of reneighbor
+            // the stored distance is used as is; on all other steps the
+            // distance and dvec are recomputed and written back
+            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+                if(nbr_pj->d <= control->r_cut)
+                    flag = 1;
+                else flag = 0;
+            }
+            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
+                            nbr_pj->dvec))<=SQR(control->r_cut)){
+                // d temporarily held the squared distance; store the distance
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+            if( flag ){    
+                type_j = system->atoms[j].type;
+                r_ij = nbr_pj->d;
+                sbp_j = &(system->reaxprm.sbp[type_j]);
+                twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
+                // an atom paired with itself gets half weight
+                self_coef = (i == j) ? 0.5 : 1.0;
+                // the table is looked up with the type pair ordered (min, max)
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] );      
+                /* cubic spline interpolation */
+                // r is the table interval for r_ij; interval 0 is never used
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r;
+                // the cubic is evaluated in the offset from (r+1)*dx
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+                val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                    t->ele[r].a;
+                val *= EV_to_KCALpMOL / C_ele;
+                H->entries[Htop].j = j;
+                H->entries[Htop].val = self_coef * val;
+                ++Htop;
+                /* hydrogen bond lists */ 
+                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                        nbr_pj->d <= control->hb_cut ) {
+                    // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                    jhb = sbp_j->p_hbond;
+                    // p_hbond pair (1,2): the entry goes into atom i's slot
+                    // with scl = 1
+                    if( ihb == 1 && jhb == 2 ) {
+                        hbonds->select.hbond_list[ihb_top].nbr = j;
+                        hbonds->select.hbond_list[ihb_top].scl = 1;
+                        hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    }
+                    // p_hbond pair (2,1): the entry goes into atom j's slot
+                    // with scl = -1, and j's end index is advanced right away
+                    else if( ihb == 2 && jhb == 1 ) {
+                        jhb_top = End_Index( workspace->hbond_index[j], hbonds );
+                        hbonds->select.hbond_list[jhb_top].nbr = i;
+                        hbonds->select.hbond_list[jhb_top].scl = -1;
+                        hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
+                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        ++num_hbonds;
+                    }
+                }
+                /* uncorrected bond orders */
+                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                    r2 = SQR(r_ij);
+                    // each of the three contributions is computed only when
+                    // both atoms have a positive radius parameter for it;
+                    // otherwise the term and its exponent are zeroed
+                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else BO_s = C12 = 0.0;
+                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else BO_pi = C34 = 0.0;
+                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                        BO_pi2= EXP( C56 );
+                    }
+                    else BO_pi2 = C56 = 0.0;
+                    /* Initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+                    if( BO >= control->bo_cut ) {
+                        // one record is written for i->j and one for j->i
+                        num_bonds += 2;
+                        /****** bonds i-j and j-i ******/
+                        ibond = &( bonds->select.bond_list[btop_i] );
+                        btop_j = End_Index( j, bonds );
+                        jbond = &(bonds->select.bond_list[btop_j]);
+                        ibond->nbr = j;
+                        jbond->nbr = i;
+                        ibond->d = r_ij;
+                        jbond->d = r_ij;
+                        // the j-side record stores the negated vector / box shift
+                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                        // both records share dbond_index = btop_i; sym_index
+                        // points each record at its mirror
+                        ibond->dbond_index = btop_i;
+                        jbond->dbond_index = btop_i;
+                        ibond->sym_index = btop_j;
+                        jbond->sym_index = btop_i;
+                        ++btop_i;
+                        Set_End_Index( j, btop_j+1, bonds );
+                        bo_ij = &( ibond->bo_data );
+                        bo_ji = &( jbond->bo_data );
+                        bo_ji->BO = bo_ij->BO = BO;
+                        bo_ji->BO_s = bo_ij->BO_s = BO_s;
+                        bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
+                        bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
+                        /* Bond Order page2-3, derivative of total bond order prime */
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi2,
+                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
+                        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
+                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+                        /* Only dBOp wrt. dr_i is stored here, note that 
+                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+                        rvec_Scale( bo_ij->dBOp, 
+                                -(bo_ij->BO_s * Cln_BOp_s + 
+                                    bo_ij->BO_pi * Cln_BOp_pi + 
+                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
+                        rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
+                        rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
+                        // the derivatives above used the unshifted values;
+                        // bo_cut is subtracted only afterwards
+                        bo_ij->BO_s -= control->bo_cut;
+                        bo_ij->BO -= control->bo_cut;
+                        bo_ji->BO_s -= control->bo_cut;
+                        bo_ji->BO -= control->bo_cut;
+                        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
+                        workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
+                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                        // NOTE(review): same call with the same arguments as
+                        // the one issued right after ++btop_i above
+                        Set_End_Index( j, btop_j+1, bonds );
+                    }
+                }
+            }
+        }
+        // closing entry of row i: column i, value eta of the atom's type
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        ++Htop;
+        Set_End_Index( i, btop_i, bonds );
+        if( ihb == 1 )
+            Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
+    }
+    // mark the end of j list
+    // (i equals system->N here, so this writes the one-past-last row start)
+    H->start[i] = Htop; 
+    /* validate lists - decide if reallocation is required! */
+    Validate_Lists( workspace, lists, 
+            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
+            data->step, Htop, num_bonds, num_hbonds );
+    //Print_Bonds( system, bonds, "sbonds.out" );
+    //Print_Bond_List2( system, bonds, "sbonds.out" );
+    //Print_Sparse_Matrix2( H, "H.out" );
+/* Estimate_Storage_Sizes: counting pass over the far-neighbor list that
+   sizes the H matrix, the per-atom hydrogen-bond and bond slots, and the
+   three-body list. Nothing is stored except the counters.
+
+   Htop      - in/out: incremented per neighbor within r_cut, then increased
+               by system->N and multiplied by SAFE_ZONE
+   hb_top    - in/out, per atom: hydrogen-bond count, then scaled by
+               SAFE_HBONDS with a floor of MIN_HBONDS
+   bond_top  - in/out, per atom: bond count, then doubled with a floor of
+               MIN_BONDS
+   num_3body - in/out: sum of squared per-atom bond counts, times SAFE_ZONE
+   All counters are only incremented here - presumably the caller zeroes
+   them first; TODO confirm against the call site. */
+void Estimate_Storage_Sizes( reax_system *system, control_params *control, 
+        list **lists, int *Htop, int *hb_top, 
+        int *bond_top, int *num_3body ) {
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    real r_ij, r2;
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2; 
+    list *far_nbrs;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    far_nbrs = *lists + FAR_NBRS;
+    // NOTE(review): p_boc1 / p_boc2 are assigned here but not read again in
+    // the visible body of this function
+    p_boc1 = system->reaxprm.gp.l[0];
+    p_boc2 = system->reaxprm.gp.l[1];
+    for( i = 0; i < system->N; ++i ) {
+        atom_i = &(system->atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        ihb = sbp_i->p_hbond;
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(system->atoms[j]);
+            type_j = atom_j->type;
+            sbp_j = &(system->reaxprm.sbp[type_j]);
+            twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
+            // the stored distance is used directly; it is not recomputed here
+            if( nbr_pj->d <= control->r_cut ) {
+                ++(*Htop);
+                /* hydrogen bond lists */ 
+                // NOTE(review): the threshold here is hb_cut > 0.1, while the
+                // Init_Forces routines test hb_cut > 0 - confirm this is intended
+                if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && 
+                        nbr_pj->d <= control->hb_cut ) {
+                    jhb = sbp_j->p_hbond;
+                    // the count goes to whichever atom of the pair has
+                    // p_hbond == 1
+                    if( ihb == 1 && jhb == 2 )
+                        ++hb_top[i];
+                    else if( ihb == 2 && jhb == 1 )
+                        ++hb_top[j];
+                }
+                /* uncorrected bond orders */
+                if( nbr_pj->d <= control->nbr_cut ) {
+                    r_ij = nbr_pj->d;
+                    // r2 is computed but not read again in this function
+                    r2 = SQR(r_ij);
+                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else BO_s = C12 = 0.0;
+                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else BO_pi = C34 = 0.0;
+                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                        BO_pi2= EXP( C56 );
+                    }
+                    else BO_pi2 = C56 = 0.0;
+                    /* Initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+                    // a qualifying pair needs one bond slot on each atom
+                    if( BO >= control->bo_cut ) {
+                        ++bond_top[i];
+                        ++bond_top[j];
+                    }
+                }
+            }
+        }
+    }
+    // one extra H entry per atom, then the safety factor
+    *Htop += system->N;
+    *Htop *= SAFE_ZONE;
+    for( i = 0; i < system->N; ++i ) {
+        hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
+        // uses the raw bond count; the padding on the next line comes after
+        *num_3body += SQR(bond_top[i]);
+        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
+    }
+    *num_3body *= SAFE_ZONE;
+/* Compute_Forces: runs the force pipeline in order - list initialization
+   (Init_Forces or Init_Forces_Tab, chosen by control->tabulate), bonded
+   forces, non-bonded forces, then Compute_Total_Force - and adds the elapsed
+   time of the first three phases to data->timing.init_forces,
+   data->timing.bonded and data->timing.nonb.
+   NOTE(review): the preprocessor conditionals in this excerpt have no
+   visible closing #endif lines, so which statements are debug-only cannot
+   be read off this text alone. */
+void Compute_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list** lists, output_controls *out_control )
+    real t_start, t_elapsed;
+    // phase 1: build H matrix, bond list and hydrogen-bond list
+    t_start = Get_Time( );
+    if( !control->tabulate )
+        Init_Forces( system, control, data, workspace, lists, out_control );
+    else Init_Forces_Tab( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.init_forces += t_elapsed;
+#if defined(DEBUG_FOCUS)
+    print_sparse_matrix (system, workspace);
+    fprintf( stderr, "init_forces - ");
+    //analyze_hbonds (system, workspace, lists);
+    // phase 2: bonded interactions
+    t_start = Get_Time( );
+    Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.bonded += t_elapsed;
+    //print_bond_list (system, workspace, lists);
+    //exit (0);
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "bonded_forces - ");
+    // phase 3: non-bonded interactions
+    t_start = Get_Time( );
+    Compute_NonBonded_Forces( system, control, data, workspace, 
+            lists, out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.nonb += t_elapsed;
+#ifdef __DEBUG_CUDA__
+    fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed);
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "nonbondeds - ");
+    // phase 4: turn the accumulated coefficients into per-atom forces
+    // (this phase is not timed)
+    Compute_Total_Force( system, control, data, workspace, lists );
+    //Print_Total_Force( system, control, data, workspace, lists, out_control );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "totalforces - ");
+    //Print_Total_Force( system, control, data, workspace, lists, out_control );
+    Print_Total_Force( system, control, data, workspace, lists, out_control );
+    Compare_Total_Forces( system, control, data, workspace, lists, out_control );
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "forces - ");
diff --git a/PuReMD-GPU/src/forces.cu b/PuReMD-GPU/src/forces.cu
deleted file mode 100644
index e8e1e2917b34eefa913f25cab67b1123d08f892e..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/forces.cu
+++ /dev/null
@@ -1,2880 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "forces.h"
-#include "box.h"
-#include "bond_orders.h"
-#include "single_body_interactions.h"
-#include "two_body_interactions.h"
-#include "three_body_interactions.h"
-#include "four_body_interactions.h"
-#include "list.h"
-#include "print_utils.h"
-#include "system_props.h"
-#include "QEq.h"
-#include "vector.h"
-#include "index_utils.h"
-#include "cuda_utils.h"
-#include "cuda_init.h"
-#include "reduction.h"
-//#include "matrix.h"
-#include "validation.h"
-#include "cudaProfiler.h"
-void Dummy_Interaction( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
-void Init_Bonded_Force_Functions( control_params *control )
-    Interaction_Functions[0] = Calculate_Bond_Orders;
-    Interaction_Functions[1] = Bond_Energy;  //*/Dummy_Interaction;
-    Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy;
-    //*/Dummy_Interaction;
-    Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
-    Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
-    if( control->hb_cut > 0 )
-        Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
-    else Interaction_Functions[5] = Dummy_Interaction;
-    Interaction_Functions[6] = Dummy_Interaction; //empty
-    Interaction_Functions[7] = Dummy_Interaction; //empty
-    Interaction_Functions[8] = Dummy_Interaction; //empty
-    Interaction_Functions[9] = Dummy_Interaction; //empty
-void Compute_Bonded_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control )
-    int i;
-    real t_start, t_elapsed;
-    /* Mark beginning of a new timestep in each energy file */
-    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "bo", "ebond", "total" );
-    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", 
-            data->step, "atom", "nlp", "elp", "total" );
-    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", 
-            data->step, "atom", "eov", "total" );
-    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", 
-            data->step, "atom", "eun", "total" );
-    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
-    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "epen", "total" );
-    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "angle", "bo(12)", "bo(23)", "ecoa", "total" );
-    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", 
-            "r(23)", "angle", "bo(12)", "ehb", "total" );
-    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", 
-            data->step, "atom1", "atom2", "atom3", "atom4", 
-            "phi", "bo(23)", "etor", "total" );
-    fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-            data->step, "atom1", "atom2", "atom3", "atom4", 
-            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
-    /* Implement all the function calls as function pointers */
-    for( i = 0; i < NO_OF_INTERACTIONS; i++ ) {
-        //for( i = 0; i < 5; i++ ) {
-        t_start = Get_Time ();
-        (Interaction_Functions[i])(system, control, data, workspace, 
-                lists, out_control);
-        t_elapsed = Get_Timing_Info ( t_start );
-#ifdef __DEBUG_CUDA__
-        fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed );
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "f%d-", i );
-        (Print_Interactions[i])(system, control, data, workspace, 
-                lists, out_control);
-    }
-    }
-    void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, 
-            simulation_data *data, static_storage *workspace,
-            list **lists, output_controls *out_control )
-    {
-        real t_start, t_elapsed;
-        real *spad = (real *)scratch;
-        rvec *rvec_spad;
-        //Compute the bonded for interaction here. 
-        //Step 1.
-#ifdef __DEBUG_CUDA__
-        t_start = Get_Time( );
-        fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE);
-        Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>>
-            (  system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
-               *dev_workspace, system->reaxprm.num_atom_types, system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>>
-            ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
-              system->reaxprm.d_tbp, *dev_workspace, 
-              *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), 
-              system->reaxprm.num_atom_types, system->N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>>
-            (*dev_workspace, *(dev_lists + BONDS), system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>>
-            (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
-             *dev_workspace, system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        t_elapsed = Get_Timing_Info( t_start );
-        fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-        fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
-        //Step 2.
-#ifdef __DEBUG_CUDA__
-        t_start = Get_Time( );
-        //cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH );
-        cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH );
-        Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
-            ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp,
-              (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), 
-              system->N, system->reaxprm.num_atom_types, spad );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_BE
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad, spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-            //(spad + system->N, spad + system->N + 16, 16);
-            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        t_elapsed = Get_Timing_Info( t_start );
-        fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-        fprintf (stderr, "Cuda_Bond_Energy Done... \n");
-        //Step 3.
-#ifdef __DEBUG_CUDA__
-        t_start = Get_Time( );
-        cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH );
-        test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
-                system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-                *dev_workspace, (simulation_data *)data->d_simulation_data,
-                *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
-                spad, spad + 2 * system->N, spad + 4*system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
-                system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-                *dev_workspace, (simulation_data *)data->d_simulation_data,
-                *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
-                spad, spad + 2 * system->N, spad + 4*system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        test_LonePair_Postprocess        <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, 
-                system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-                *dev_workspace, (simulation_data *)data->d_simulation_data,
-                *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Lp
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad, spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Ov
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad + 2*system->N, spad + 3*system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Un
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad + 4*system->N, spad + 5*system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        t_elapsed = Get_Timing_Info( t_start );
-        fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-        fprintf (stderr, "test_LonePair_postprocess Done... \n");
-        //Step 4.
-#ifdef __DEBUG_CUDA__
-        t_start = Get_Time( );
-        cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH);
-        Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>>
-            (system->d_atoms, 
-             (control_params *)control->d_control, 
-             *(dev_lists + BONDS),
-             system->N, (int *)spad);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        t_elapsed = Get_Timing_Info( t_start );
-        fprintf (stderr, "Three_Body_Estimate... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-        int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs);
-        memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs);
-        copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH);
-        int total_3body = thbody [0] * SAFE_ZONE;
-        for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) {
-            total_3body += thbody [x]*SAFE_ZONE;
-            thbody [x] += thbody [x-1];
-        }
-        system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1];
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs);
-        if (!system->init_thblist) 
-        {
-            system->init_thblist = true;
-            if(!Make_List((dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
-                fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-                exit( INIT_ERR );
-            }
-#ifdef __CUDA_MEM__
-            fprintf (stderr, "Device memory allocated: three body list = %d MB\n", 
-                    sizeof (three_body_interaction_data) * total_3body / (1024*1024));
-        } else {
-            if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) {
-                int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs);
-                /*Delete Three-body list*/
-                Delete_List( dev_lists + THREE_BODIES, TYP_DEVICE );
-#ifdef __CUDA_MEM__
-                fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
-                        data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
-                /*Recreate Three-body list */
-                if(!Make_List(size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
-                    fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-                    exit( INIT_ERR );
-                }
-            }
-        }
-        //copy the indexes into the thb list;
-        copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
-                cudaMemcpyHostToDevice, LIST_INDEX);
-        copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
-                cudaMemcpyHostToDevice, LIST_END_INDEX);
-        free (thbody );
-#ifdef __DEBUG_CUDA__
-        t_start = Get_Time( );
-        cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
-        Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
-            ( system->d_atoms,
-              system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, 
-              (control_params *)control->d_control,
-              (simulation_data *)data->d_simulation_data,
-              *dev_workspace, 
-              *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
-              system->N, system->reaxprm.num_atom_types, 
-              spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Not necessary to validate three-body list anymore, 
-        // Estimate is already done at the beginning which makes sure that 
-        // we have sufficient size for this list
-        //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step );
-        //Reduction for E_Ang
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad, spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Pen
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad + 2*system->N, spad + 3*system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Coa
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad + 4*system->N, spad + 5*system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for ext_pres
-        rvec_spad = (rvec *) (spad + 6*system->N);
-        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
-            (rvec_spad, rvec_spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
-            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        real t_1, t_2;
-        t_1 = Get_Time ();
-        //Sum up the f vector for each atom and collect the CdDelta from all the bonds
-        Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
-            (     system->d_atoms,
-                (control_params *)control->d_control,
-                *dev_workspace, 
-                *(dev_lists + BONDS), 
-                system->N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        t_2 = Get_Timing_Info (t_1);
-#ifdef __DEBUG_CUDA__
-        t_elapsed = Get_Timing_Info( t_start );
-        fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2);
-        fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
-        fprintf (stderr, "Three_Body_Interactions Done... \n");
-        //Step 5.
-#ifdef __DEBUG_CUDA__
-        t_start = Get_Time( );
-        cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
-        Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
-            //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
-            ( system->d_atoms,
-              system->reaxprm.d_gp,
-              system->reaxprm.d_fbp,
-              (control_params *)control->d_control,
-              *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
-              (simulation_box *)system->d_box,
-              (simulation_data *)data->d_simulation_data,
-              *dev_workspace,
-              system->N, system->reaxprm.num_atom_types, 
-              spad, spad + 2*system->N, (rvec *) (spad + 4*system->N));
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Tor
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad, spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for E_Con
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-            (spad + 2*system->N, spad + 3*system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Reduction for ext_pres
-        rvec_spad = (rvec *) (spad + 4*system->N);
-        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
-            (rvec_spad, rvec_spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
-            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //Post process here
-        Four_Body_Postprocess     <<< BLOCKS, BLOCK_SIZE >>>
-            (     system->d_atoms,
-                *dev_workspace,
-                *(dev_lists + BONDS),
-                system->N );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-        t_elapsed = Get_Timing_Info( t_start );
-        fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
-        fprintf (stderr, " Four_Body_ Done... \n");
-        //Step 6.
-        if (control->hb_cut > 0) {
-#ifdef __DEBUG_CUDA__
-            t_start = Get_Time( );
-            cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH );
-            /*
-               Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
-               (  system->d_atoms, 
-               system->reaxprm.d_sbp,
-               system->reaxprm.d_hbp,
-               (control_params *)control->d_control,
-               (simulation_data *)data->d_simulation_data,
-             *dev_workspace, 
-             *(dev_lists + BONDS), *(dev_lists + HBONDS),
-             system->N, system->reaxprm.num_atom_types, 
-             spad, (rvec *) (spad + 2*system->N), NULL);
-             cudaThreadSynchronize ();
-             cudaCheckError ();
-             */
-#ifdef __DEBUG_CUDA__
-            real test1,test2;
-            test1 = Get_Time ();
-            int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + 
-                (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1);
-            Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE )  >>>
-                (  system->d_atoms, 
-                   system->reaxprm.d_sbp,
-                   system->reaxprm.d_hbp,
-                   (control_params *)control->d_control,
-                   (simulation_data *)data->d_simulation_data,
-                   *dev_workspace, 
-                   *(dev_lists + BONDS), *(dev_lists + HBONDS),
-                   system->N, system->reaxprm.num_atom_types, 
-                   spad, (rvec *) (spad + 2*system->N), NULL);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-            test2 = Get_Timing_Info (test1);
-            fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2);
-            //Reduction for E_HB
-            Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-                (spad, spad + system->N,  system->N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-                (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            //Reduction for ext_pres
-            rvec_spad = (rvec *) (spad + 2*system->N);
-            Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
-                (rvec_spad, rvec_spad + system->N,  system->N);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
-                (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            //Post process here
-#ifdef __DEBUG_CUDA__
-            real t_1, t_2;
-            t_1 = Get_Time ();
-            Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
-                (     system->d_atoms, 
-                    system->reaxprm.d_sbp, 
-                    *dev_workspace, 
-                    *(dev_lists + BONDS),
-                    *(dev_lists + HBONDS), 
-                    *(dev_lists + FAR_NBRS),
-                    system->N, 
-                    spad); //this is for the fix to use the shared memory
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-#ifdef __DEBUG_CUDA__
-            t_2 = Get_Timing_Info ( t_1 );
-            fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
-            t_1 = Get_Time ();
-            //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
-            Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
-                (     system->d_atoms, 
-                    system->reaxprm.d_sbp, 
-                    *dev_workspace, 
-                    *(dev_lists + BONDS),
-                    *(dev_lists + HBONDS), 
-                    *(dev_lists + FAR_NBRS),
-                    system->N );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            t_2 = Get_Timing_Info ( t_1 );
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
-            t_elapsed = Get_Timing_Info( t_start );
-            fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
-            fprintf (stderr, "Hydrogen_Bond Done... \n");
-        }
-        return; 
-    }
-    void Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-            simulation_data *data,static_storage *workspace,
-            list** lists, output_controls *out_control )
-    {
-        real t_start, t_elapsed;
-        fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
-                data->step, "atom1", "atom2", "r12", "evdw", "total" );
-        fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
-                data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
-        t_start = Get_Time( );
-        QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
-        t_elapsed = Get_Timing_Info( t_start );
-        data->timing.QEq += t_elapsed;
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "qeq - " );
-        if ( control->tabulate == 0)
-            vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
-        else
-            Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, 
-                    lists, out_control );
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "nonb forces - " );
-        Print_vdW_Coulomb_Forces( system, control, data, workspace, 
-                lists, out_control );
-    }
-    void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-            simulation_data *data,static_storage *workspace,
-            list** lists, output_controls *out_control )
-    {
-        real t_start, t_elapsed;
-        real t1 = 0, t2 = 0;
-        real *spad = (real *) scratch;
-        rvec *rvec_spad;
-        int cblks;
-        t_start = Get_Time( );
-        Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
-        t_elapsed = Get_Timing_Info( t_start );
-        d_timing.QEq += t_elapsed;
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed );
-        cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH );
-        t_start = Get_Time ();
-        if ( control->tabulate == 0)
-        {
-            cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
-                ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1);
-            Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>>
-                ( system->d_atoms,   
-                  system->reaxprm.d_tbp,
-                  system->reaxprm.d_gp, 
-                  (control_params *)control->d_control, 
-                  (simulation_data *)data->d_simulation_data,  
-                  *(dev_lists + FAR_NBRS), 
-                  spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
-                  system->reaxprm.num_atom_types,
-                  system->N ) ;
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-        }
-        else
-        {
-            cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
-                ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1);
-            Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>>
-                (   (reax_atom *)system->d_atoms, 
-                    (control_params *)control->d_control,
-                    (simulation_data *)data->d_simulation_data, 
-                    *(dev_lists + FAR_NBRS), 
-                    spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
-                    d_LR,
-                    system->reaxprm.num_atom_types,
-                    out_control->energy_update_freq,
-                    system->N ) ;
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-        }
-        t_elapsed = Get_Timing_Info (t_start );
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2));
-        fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed));
-        //Reduction on E_vdW
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-            (spad, spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //reduction on E_Ele
-        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-            (spad + 2*system->N, spad + 3*system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        rvec_spad = (rvec *) (spad + 4*system->N);
-        //reduction on ext_press
-        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> 
-            (rvec_spad, rvec_spad + system->N,  system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> 
-            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-    /* Compute_Total_Force: turn the coefficients accumulated by the interaction
-       functions into forces by visiting the bond list once.
-       For every atom, each bond-list entry whose neighbor index is greater than
-       the atom's own index is passed to a dBond accumulation routine:
-       Add_dBond_to_Forces when control->ensemble is NVE, NVT or bNVT, and
-       Add_dBond_to_Forces_NPT for any other ensemble value. */
-    void Compute_Total_Force( reax_system *system, control_params *control, 
-            simulation_data *data, static_storage *workspace,
-            list **lists )
-    {
-        int atom, slot;
-        list *bond_list = (*lists) + BONDS;
-
-        for( atom = 0; atom < system->N; ++atom ) {
-            slot = Start_Index(atom, bond_list);
-            /* the end index is re-read on every pass, as in a plain for loop */
-            while( slot < End_Index(atom, bond_list) ) {
-                if( atom < bond_list->select.bond_list[slot].nbr ) {
-                    if( control->ensemble != NVE && control->ensemble != NVT &&
-                            control->ensemble != bNVT )
-                        Add_dBond_to_Forces_NPT( atom, slot, system, data, workspace, lists );
-                    else
-                        Add_dBond_to_Forces( atom, slot, system, data, workspace, lists );
-                }
-                ++slot;
-            }
-        }
-    }
-    /* Validate_Lists: after the interaction lists have been filled, decide
-       whether any of them needs to be reallocated and abort if one has
-       already overflowed.
-
-       workspace  - realloc.Htop, realloc.num_bonds, realloc.bonds,
-                    realloc.num_hbonds and realloc.hbonds are written here;
-                    num_H bounds the hydrogen-bond check
-       lists      - list array; its BONDS and HBONDS entries are inspected
-       step       - current step, used only in the diagnostics
-       n          - number of bond-list rows to check
-       Hmax, Htop - capacity and current fill of the H matrix
-       num_bonds, num_hbonds - counts recorded in workspace->realloc
-
-       Calls exit(INSUFFICIENT_SPACE) when the H matrix is over capacity, when
-       a row has run past the start of the following row, or when the last
-       row has run past the end of its list. */
-    void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
-            int Hmax, int Htop, int num_bonds, int num_hbonds )
-    {
-        int i, flag;
-        list *bonds, *hbonds;
-        bonds = *lists + BONDS;
-        hbonds = *lists + HBONDS;
-        /* H matrix */
-        if( Htop > Hmax * DANGER_ZONE ) {
-            workspace->realloc.Htop = Htop;
-            if( Htop > Hmax ) {
-                fprintf( stderr, 
-                        "step%d - ran out of space on H matrix: Htop=%d, max = %d",
-                        step, Htop, Hmax );
-                exit(INSUFFICIENT_SPACE);
-            }
-        }
-        /* bond list: rows 0 .. n-2 are compared against the start of the next row */
-        flag = -1;
-        workspace->realloc.num_bonds = num_bonds;
-        for( i = 0; i < n-1; ++i )
-            if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) {
-                workspace->realloc.bonds = 1;
-                if( End_Index(i, bonds) > Start_Index(i+1, bonds) )
-                    flag = i;
-            }
-        if( flag > -1 ) {
-            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                    step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) );
-            exit(INSUFFICIENT_SPACE);
-        }    
-        /* last bond row (i is left at the last row by the loop): compared
-           against the end of the whole list */
-        if( End_Index(i, bonds) >= bonds->num_intrs-2 ) {
-            workspace->realloc.bonds = 1;
-            if( End_Index(i, bonds) > bonds->num_intrs ) {
-                /* report the failing row i; flag is always -1 at this point */
-                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
-                        step, i, End_Index(i,bonds), bonds->num_intrs );
-                exit(INSUFFICIENT_SPACE);
-            }
-        }
-        /* hbonds list */
-        if( workspace->num_H > 0 ) {
-            flag = -1;
-            workspace->realloc.num_hbonds = num_hbonds;
-            for( i = 0; i < workspace->num_H-1; ++i )
-                if( Num_Entries(i, hbonds) >= 
-                        (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) {
-                    workspace->realloc.hbonds = 1;
-                    if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) )
-                        flag = i;
-                }
-            if( flag > -1 ) {
-                fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                        step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) );
-                exit(INSUFFICIENT_SPACE);
-            }
-            /* last hbond row: compared against the end of the whole list */
-            if( Num_Entries(i,hbonds) >= 
-                    (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) {
-                workspace->realloc.hbonds = 1;
-                if( End_Index(i, hbonds) > hbonds->num_intrs ) {
-                    /* report the failing row i; flag is always -1 at this point */
-                    fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
-                            step, i, End_Index(i,hbonds), hbonds->num_intrs );
-                    exit(INSUFFICIENT_SPACE);
-                }
-            }
-        }
-    }
-    /* Cuda_Validate_Lists: counterpart of Validate_Lists for lists whose index
-       arrays are fetched with copy_host_device( ..., cudaMemcpyDeviceToHost ).
-       The start/end index arrays of the bond list, the hydrogen-bond list and
-       the H matrix are copied into temporary host buffers, checked, and freed.
-
-       system     - max_sparse_matrix_entries and N bound the H matrix checks
-       workspace  - realloc.Htop, realloc.num_bonds, realloc.bonds,
-                    realloc.num_hbonds and realloc.hbonds are written here
-       lists      - list array; its BONDS and HBONDS entries are inspected
-       step       - current step, used only in the diagnostics
-       n          - number of rows to check in the H matrix and the bond list
-       num_bonds, num_hbonds - not referenced in this body
-
-       Calls exit(INSUFFICIENT_SPACE) when a row has run past the start of the
-       following row or past the end of its storage.
-
-       NOTE(review): no #else/#endif matching the three
-       "#ifdef __CUDA_TEST__" lines is visible in this block - confirm the
-       intended conditional structure. */
-    void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n,
-            int num_bonds, int num_hbonds )
-    {
-        int i, flag;
-        list *bonds, *hbonds, *thblist;
-        int *bonds_start, *bonds_end;
-        int *hbonds_start, *hbonds_end;
-        int *mat_start, *mat_end;
-        int max_sparse_entries = 0;
-        bonds = *lists + BONDS;
-        hbonds = *lists + HBONDS;
-        bonds_start = (int *) calloc (bonds->n, INT_SIZE);
-        bonds_end = (int *) calloc (bonds->n, INT_SIZE);
-        hbonds_start = (int *) calloc (hbonds->n, INT_SIZE );
-        hbonds_end = (int *) calloc (hbonds->n, INT_SIZE );
-        mat_start = (int *) calloc (workspace->H.n, INT_SIZE );
-        mat_end = (int *) calloc (workspace->H.n, INT_SIZE );
-        copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        /* Sparse Matrix entries */
-#ifdef __CUDA_TEST__
-        /*
-           workspace->realloc.Htop = 0;
-           for (i = 0; i < workspace->H.n-1; i++) {
-           if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){
-           workspace->realloc.Htop = mat_end[i] - mat_start[i];
-           }
-           }
-         */
-        flag = -1;
-        /* realloc.Htop ends up holding the largest row length that exceeded
-           the DANGER_ZONE fraction of max_sparse_matrix_entries (0 if none) */
-        workspace->realloc.Htop = 0;
-        for ( i = 0; i < n-1; i ++){
-            if( (mat_end[i] - mat_start[i]) > 
-                    (system->max_sparse_matrix_entries * DANGER_ZONE )) {
-                //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", 
-                //                                step, i, mat_start[i], mat_end[i]);
-                if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
-                    workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
-            }
-            if ( (mat_end[i] > mat_start[i+1]) ){
-                /* report the failing row i; flag is never set in this section */
-                fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n",
-                        step, i, mat_end[i], mat_start[i+1]);
-                exit(INSUFFICIENT_SPACE);
-            }
-        }
-        /* last matrix row (i is left at the last row by the loop) */
-        if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) {
-            if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
-                workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
-            //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d)  -- %d \n", 
-            //                                step, i, mat_start[i], mat_end[i], 
-            //                                (int) (system->max_sparse_matrix_entries * DANGER_ZONE));
-            if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) {
-                /* report the failing row i; flag is never set in this section */
-                fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n",
-                        step, i, mat_end[i], system->N * system->max_sparse_matrix_entries);
-                exit(INSUFFICIENT_SPACE);
-            }
-        }
-        /* bond list */
-#ifdef __CUDA_TEST__
-        //workspace->realloc.bonds = 1;
-        flag = -1;
-        /* realloc.num_bonds is rebuilt as the sum over rows of
-           MAX( 2 * row length, MIN_BONDS ) */
-        workspace->realloc.num_bonds = 0;
-        for( i = 0; i < n-1; ++i ) {
-            workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
-            if( bonds_end[i] >= bonds_start[i+1]-2 ) {
-                workspace->realloc.bonds = 1;
-                //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
-                //                        step, i, bonds_start [i], bonds_end[i]);
-                if( bonds_end[i] > bonds_start[i+1] )
-                    flag = i;
-            }
-        }
-        if( flag > -1 ) {
-            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-                    step, flag, bonds_end[flag], bonds_start[flag+1] );
-            exit(INSUFFICIENT_SPACE);
-        }    
-        /* last bond row */
-        workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
-        if( bonds_end[i] >= bonds->num_intrs-2 ) {
-            workspace->realloc.bonds = 1;
-            //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
-            //                        step, i, bonds_start [i], bonds_end[i]);
-            if( bonds_end[i] > bonds->num_intrs ) {
-                /* report the failing row i; flag is always -1 at this point */
-                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
-                        step, i, bonds_end[i], bonds->num_intrs );
-                exit(INSUFFICIENT_SPACE);
-            }
-        }
-        //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds);
-        /* hbonds list */
-        if( workspace->num_H > 0 ) {
-#ifdef __CUDA_TEST__
-            //workspace->realloc.hbonds = 1;
-            flag = -1;
-            /* realloc.num_hbonds is rebuilt as the sum over rows of
-               MAX( SAFE_HBONDS * row length, MIN_HBONDS ) */
-            workspace->realloc.num_hbonds = 0;
-            for( i = 0; i < workspace->num_H-1; ++i ) {
-                workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
-                if( (hbonds_end[i] - hbonds_start[i]) >= 
-                        (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) {
-                    workspace->realloc.hbonds = 1;
-                    //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
-                    //                        step, i, hbonds_start [i], hbonds_end[i]);
-                    if( hbonds_end[i] > hbonds_start[i+1] )
-                        flag = i;
-                }
-            }
-            if( flag > -1 ) {
-                fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n",
-                        step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] );
-                exit(INSUFFICIENT_SPACE);
-            }
-            /* last hbond row */
-            workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
-            if( (hbonds_end[i] - hbonds_start[i]) >= 
-                    (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) {
-                workspace->realloc.hbonds = 1;
-                //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
-                //                        step, i, hbonds_start [i], hbonds_end[i]);
-                if( hbonds_end[i] > hbonds->num_intrs ) {
-                    /* report the failing row i; flag is always -1 at this point */
-                    fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
-                            step, i, hbonds_end[i], hbonds->num_intrs );
-                    exit(INSUFFICIENT_SPACE);
-                }
-            }
-        }
-        //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds);
-        free (bonds_start);
-        free (bonds_end );
-        free (hbonds_start );
-        free (hbonds_end  );
-        free (mat_start );
-        free (mat_end );
-    }
-    /* Cuda_Threebody_List: check the fill level of the three-body list.
-       The list's index and end_index arrays are fetched into temporary host
-       buffers with copy_host_device( ..., cudaMemcpyDeviceToHost ), checked,
-       and freed.
-
-       system    - not referenced in this body
-       workspace - realloc.num_3body is reset to 0; realloc.thbody is set to 1
-                   when a row reaches the DANGER_ZONE fraction of its space
-       thblist   - the three-body list to check
-       step      - current step, used only in the diagnostics
-
-       Calls exit(INSUFFICIENT_SPACE) when a row has overrun.  The diagnostics
-       print the row before the failing one only when such a row exists, so
-       index -1 is never read.  An empty list (n < 1) returns right after the
-       num_3body reset, because there is no row to inspect. */
-    void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step )
-    {
-        int *thb_start, *thb_end;
-        int i, flag;
-        workspace->realloc.num_3body = 0;
-        if( thblist->n < 1 )
-            return;
-        thb_start = (int *) calloc (thblist->n, INT_SIZE);
-        thb_end = (int *) calloc (thblist->n, INT_SIZE );
-        copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-        /*three_body list*/
-        flag = -1;
-        for( i = 0; i < thblist->n-1; ++i ){
-            if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) {
-                workspace->realloc.thbody = 1;
-                /* NOTE(review): this compares against thb_end[i+1], while the
-                   disabled message below refers to thb_start[i+1] - confirm
-                   which bound is intended */
-                if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) {
-                    flag = i;
-                    break;
-                }
-            }
-        }
-        if( flag > -1 ) {
-            //fprintf( stderr, "step%d-thbchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-            //   step, flag, thb_end[flag], thb_start[flag+1] );
-            if( flag > 0 )
-                fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-                        step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs );
-            fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-                    step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs );
-            exit(INSUFFICIENT_SPACE);
-        }    
-        /* last row (i is left at n-1 by the loop): compared against the end of
-           the whole list */
-        if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) {
-            workspace->realloc.thbody = 1;
-            if( thb_end[i] > thblist->num_intrs ) {
-                if( i > 0 )
-                    fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-                            step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs );
-                fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-                        step, i, thb_start[i], thb_end[i], thblist->num_intrs );
-                exit(INSUFFICIENT_SPACE);
-            }
-        }
-        free (thb_start);
-        free (thb_end);
-    }
-    void Init_Forces( reax_system *system, control_params *control, 
-            simulation_data *data, static_storage *workspace,
-            list **lists, output_controls *out_control ) { /* Init_Forces (host version).
-         * For every atom i the far-neighbor list is walked once and, for each
-         * neighbor j accepted by the r_cut test:
-         *   - an entry (j, tapered Coulomb-style value) is appended to row i of
-         *     the sparse matrix workspace->H;
-         *   - if hydrogen bonding is enabled and j is within control->hb_cut,
-         *     an entry may be appended to the hydrogen-bond list;
-         *   - if j is within control->nbr_cut and the uncorrected bond order
-         *     reaches control->bo_cut, the i-j and j-i bond entries are written
-         *     and workspace->dDeltap_self / workspace->total_bond_order are
-         *     accumulated.
-         * A self entry (i, eta) closes every H row.  Finally Validate_Lists is
-         * called with the totals gathered here.
-         * out_control is not referenced in this body.
-         */
-        int i, j, pj;
-        int start_i, end_i;
-        int type_i, type_j;
-        int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-        int ihb, jhb, ihb_top, jhb_top;
-        int flag;
-        real r_ij, r2, self_coef;
-        real dr3gamij_1, dr3gamij_3, Tap;
-        //real val, dif, base;
-        real C12, C34, C56;
-        real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-        real BO, BO_s, BO_pi, BO_pi2;
-        real p_boc1, p_boc2;   
-        sparse_matrix *H;
-        list *far_nbrs, *bonds, *hbonds;
-        single_body_parameters *sbp_i, *sbp_j;
-        two_body_parameters *twbp;
-        far_neighbor_data *nbr_pj;
-        //LR_lookup_table *t;
-        reax_atom *atom_i, *atom_j;
-        bond_data *ibond, *jbond;
-        bond_order_data *bo_ij, *bo_ji;
-        far_nbrs = *lists + FAR_NBRS;
-        bonds = *lists + BONDS;
-        hbonds = *lists + HBONDS;
-        H = &workspace->H;
-        Htop = 0; // running write position in H->entries, shared by all rows
-        num_bonds = 0;
-        num_hbonds = 0;
-        btop_i = btop_j = 0;
-        p_boc1 = system->reaxprm.gp.l[0]; // NOTE(review): p_boc1 is not read again in this function
-        p_boc2 = system->reaxprm.gp.l[1]; // NOTE(review): p_boc2 is not read again in this function
-        for( i = 0; i < system->N; ++i ) {
-            atom_i = &(system->atoms[i]);
-            type_i  = atom_i->type;
-            start_i = Start_Index(i, far_nbrs);
-            end_i   = End_Index(i, far_nbrs);
-            H->start[i] = Htop; // row i of H begins at the current write position
-            btop_i = End_Index( i, bonds ); // continue after entries already appended to i's row
-            sbp_i = &(system->reaxprm.sbp[type_i]);
-            ihb = ihb_top = -1;
-            if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 ) // ihb is assigned here, only when hb_cut > 0
-                ihb_top = End_Index( workspace->hbond_index[i], hbonds );
-            for( pj = start_i; pj < end_i; ++pj ) {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j = nbr_pj->nbr;
-                atom_j = &(system->atoms[j]);
-                flag = 0; // becomes 1 when j passes the r_cut test below
-                if((data->step-data->prev_steps) % control->reneighbor == 0) { // stored nbr_pj->d is used as-is on these steps
-                    if( nbr_pj->d <= control->r_cut)
-                        flag = 1;
-                    else flag = 0;
-                }
-                else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                                nbr_pj->dvec))<=SQR(control->r_cut)){
-                    nbr_pj->d = sqrt(nbr_pj->d); // the value just stored was compared against r_cut squared; keep its root
-                    flag = 1;
-                }
-                if( flag ){    
-                    type_j = system->atoms[j].type;
-                    r_ij = nbr_pj->d;
-                    sbp_j = &(system->reaxprm.sbp[type_j]);
-                    twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-                    self_coef = (i == j) ? 0.5 : 1.0; // an atom paired with itself is weighted by one half
-                    /* H matrix entry */
-                    Tap = control->Tap7 * r_ij + control->Tap6; // degree-7 polynomial in r_ij, evaluated by nested multiplication
-                    Tap = Tap * r_ij + control->Tap5;
-                    Tap = Tap * r_ij + control->Tap4;
-                    Tap = Tap * r_ij + control->Tap3;
-                    Tap = Tap * r_ij + control->Tap2;
-                    Tap = Tap * r_ij + control->Tap1;
-                    Tap = Tap * r_ij + control->Tap0;          
-                    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); // approximately the cube root of (r_ij^3 + gamma)
-                    H->entries[Htop].j = j;
-                    H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
-                    ++Htop;
-                    /* hydrogen bond lists */ 
-                    if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                            nbr_pj->d <= control->hb_cut ) {
-                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                        jhb = sbp_j->p_hbond;
-                        if( ihb == 1 && jhb == 2 ) { // entry goes into i's own hbond row, scl = 1
-                            hbonds->select.hbond_list[ihb_top].nbr = j;
-                            hbonds->select.hbond_list[ihb_top].scl = 1;
-                            hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
-                            ++ihb_top;
-                            ++num_hbonds;
-                        }
-                        else if( ihb == 2 && jhb == 1 ) { // entry goes into j's hbond row instead, scl = -1
-                            jhb_top = End_Index( workspace->hbond_index[j], hbonds );
-                            hbonds->select.hbond_list[jhb_top].nbr = i;
-                            hbonds->select.hbond_list[jhb_top].scl = -1;
-                            hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-                            Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
-                            ++num_hbonds;
-                        }
-                    }
-                    /* uncorrected bond orders */
-                    if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
-                        r2 = SQR(r_ij);
-                        if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                            C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                            BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                        }
-                        else BO_s = C12 = 0.0;
-                        if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                            C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                            BO_pi = EXP( C34 );
-                        }
-                        else BO_pi = C34 = 0.0;
-                        if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                            C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                            BO_pi2= EXP( C56 );
-                        }
-                        else BO_pi2 = C56 = 0.0;
-                        /* Initially BO values are the uncorrected ones, page 1 */
-                        BO = BO_s + BO_pi + BO_pi2;
-                        if( BO >= control->bo_cut ) {
-                            num_bonds += 2; // one entry for i-j and one for j-i
-                            /****** bonds i-j and j-i ******/
-                            ibond = &( bonds->select.bond_list[btop_i] );
-                            btop_j = End_Index( j, bonds );
-                            jbond = &(bonds->select.bond_list[btop_j]);
-                            ibond->nbr = j;
-                            jbond->nbr = i;
-                            ibond->d = r_ij;
-                            jbond->d = r_ij;
-                            rvec_Copy( ibond->dvec, nbr_pj->dvec );
-                            rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-                            ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-                            ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-                            ibond->dbond_index = btop_i; // both directions record the i-side slot here
-                            jbond->dbond_index = btop_i;
-                            ibond->sym_index = btop_j; // each entry records the slot of its mirror entry
-                            jbond->sym_index = btop_i;
-                            ++btop_i;
-                            Set_End_Index( j, btop_j+1, bonds );
-                            bo_ij = &( ibond->bo_data );
-                            bo_ji = &( jbond->bo_data );
-                            bo_ji->BO = bo_ij->BO = BO;
-                            bo_ji->BO_s = bo_ij->BO_s = BO_s;
-                            bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
-                            bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
-                            /* Bond Order page2-3, derivative of total bond order prime */
-                            Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-                            Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-                            Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-                            /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
-                               dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                            rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                            rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-                            rvec_Scale(bo_ij->dln_BOp_pi2,
-                                    -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-                            rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-                            rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-                            rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
-                            /* Only dBOp wrt. dr_i is stored here, note that 
-                               dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                            rvec_Scale( bo_ij->dBOp, 
-                                    -(bo_ij->BO_s * Cln_BOp_s + 
-                                        bo_ij->BO_pi * Cln_BOp_pi + 
-                                        bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-                            rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
-                            rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
-                            rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
-                            bo_ij->BO_s -= control->bo_cut; // bo_cut is subtracted only after the derivatives above used the unshifted values
-                            bo_ij->BO -= control->bo_cut;
-                            bo_ji->BO_s -= control->bo_cut;
-                            bo_ji->BO -= control->bo_cut;
-                            workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
-                            workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-                            bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-                            bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-                            /*fprintf( stderr, "%d %d %g %g %g\n",
-                              i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
-                            /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", 
-                              Cln_BOp_s, twbp->p_bo2, C12 );
-                              fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", 
-                              Cln_BOp_pi, twbp->p_bo4, C34 );
-                              fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
-                              Cln_BOp_pi2, twbp->p_bo6, C56 );*/
-                            /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
-                              fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
-                              fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
-                              fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", 
-                              twbp->r_s, twbp->r_p, twbp->r_pp );
-                              fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
-                            /*fprintf( stderr, "\tfactors: %g %g %g\n",
-                              -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + 
-                              bo_ij->BO_pi2 * Cln_BOp_pp),
-                              -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
-                            /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
-                              bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
-                              fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
-                              bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], 
-                              bo_ij->dln_BOp_pi[2] );
-                              fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
-                              bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], 
-                              bo_ij->dln_BOp_pi2[2] );*/
-                            Set_End_Index( j, btop_j+1, bonds ); // NOTE(review): repeats the earlier Set_End_Index call with the same arguments
-                        }
-                    }
-                }
-            }
-            H->entries[Htop].j = i; // row i is closed with a self entry holding eta of the atom's type
-            H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
-            ++Htop;
-            Set_End_Index( i, btop_i, bonds );
-            if( ihb == 1 )
-                Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-            //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
-            //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
-        }
-        // mark the end of j list
-        H->start[i] = Htop; // i equals system->N here, so this writes one slot past the last row start
-        /* validate lists - decide if reallocation is required! */
-        Validate_Lists( workspace, lists, 
-                data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
-#if defined(DEBUG_FOCUS)
-        fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-                data->step, Htop, num_bonds, num_hbonds ); // NOTE(review): no #endif for the #if above is visible before the closing brace
-    }
-    /* Estimate_Sparse_Matrix_Entries (kernel): one thread per atom, indexed by
-       blockIdx.x * blockDim.x + threadIdx.x; threads with an index >= N return
-       without doing anything.
-       Each thread counts the far neighbors of its atom that pass the r_cut
-       test, adds one to the count, and stores the result in indices[i].
-       Side effect: when (data->step - data->prev_steps) is not a multiple of
-       control->reneighbor, nbr->d is overwritten with the value returned by
-       Sq_Distance_on_T3 and, if that value is within r_cut squared, replaced
-       by its square root; otherwise it keeps the returned value. */
-    GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, 
-            simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) {
-        int tid, slot, first, last;
-        int count, within_cutoff;
-        far_neighbor_data *nbr;
-        reax_atom *self, *other;
-
-        tid = blockIdx.x * blockDim.x + threadIdx.x;
-        if( tid >= N )
-            return;
-
-        self = &(atoms[tid]);
-        first = Start_Index(tid, &far_nbrs);
-        last = End_Index(tid, &far_nbrs);
-        count = 0;
-        indices[tid] = count;
-
-        for( slot = first; slot < last; ++slot ) {
-            nbr = &( far_nbrs.select.far_nbr_list[slot] );
-            other = &(atoms[nbr->nbr]);
-
-            if( (data->step-data->prev_steps) % control->reneighbor != 0 ) {
-                /* refresh the stored distance, then test it against r_cut^2 */
-                nbr->d = Sq_Distance_on_T3(self->x,other->x,box,nbr->dvec);
-                within_cutoff = ( nbr->d <= SQR(control->r_cut) );
-                if( within_cutoff )
-                    nbr->d = sqrt(nbr->d);
-            }
-            else {
-                /* the stored distance is tested as-is */
-                within_cutoff = ( nbr->d <= control->r_cut );
-            }
-
-            if( within_cutoff )
-                ++count;
-        }
-
-        /* one extra slot on top of the accepted neighbors */
-        indices[tid] = count + 1;
-    }
-    GLOBAL void Init_Forces( reax_atom *atoms,         global_parameters g_params, control_params *control, 
-            single_body_parameters *sbp, two_body_parameters *tbp, 
-            simulation_data *data, simulation_box *box,    static_storage workspace,
-            list far_nbrs,             list bonds,                list hbonds, 
-            int N,                         int max_sparse_entries, int num_atom_types ) 
-    {
-        int i, j, pj;
-        int start_i, end_i;
-        int type_i, type_j;
-        int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-        int ihb, jhb, ihb_top, jhb_top;
-        int flag;
-        real r_ij, r2, self_coef;
-        real dr3gamij_1, dr3gamij_3, Tap;
-        //real val, dif, base;
-        real C12, C34, C56;
-        real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-        real BO, BO_s, BO_pi, BO_pi2;
-        real p_boc1, p_boc2;   
-        sparse_matrix *H;
-        single_body_parameters *sbp_i, *sbp_j;
-        two_body_parameters *twbp;
-        far_neighbor_data *nbr_pj;
-        //LR_lookup_table *t;
-        reax_atom *atom_i, *atom_j;
-        bond_data *ibond, *jbond;
-        bond_order_data *bo_ij, *bo_ji;
-        i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i >= N) return;
-        H = &( workspace.H );
-        //Htop = 0;
-        Htop = i * max_sparse_entries;
-        num_bonds = 0;
-        num_hbonds = 0;
-        btop_i = btop_j = 0;
-        p_boc1 = g_params.l[0];
-        p_boc2 = g_params.l[1];
-        //for( i = 0; i < system->N; ++i ) 
-        atom_i = &(atoms[i]);
-        type_i  = atom_i->type;
-        start_i = Start_Index(i, &far_nbrs);
-        end_i   = End_Index(i, &far_nbrs);
-        H->start[i] = Htop;
-        H->end[i] = Htop;
-        btop_i = End_Index( i, &bonds );
-        sbp_i = &(sbp[type_i]);
-        ihb = ihb_top = -1;
-        ihb = sbp_i->p_hbond;
-        if( control->hb_cut > 0 && (ihb==1 || ihb == 2))
-            ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
-        for( pj = start_i; pj < end_i; ++pj ) {
-            nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(atoms[j]);
-            flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-                if( nbr_pj->d <= control->r_cut)
-                    flag = 1;
-                else flag = 0;
-            }
-            else if (i > j) {
-                if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-                    nbr_pj->d = sqrt(nbr_pj->d);
-                    flag = 1;
-                }
-            } else if (i < j) {
-                if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-                    nbr_pj->d = sqrt(nbr_pj->d);
-                    flag = 1;
-                }
-            }
-            if( flag ){    
-                type_j = atoms[j].type;
-                r_ij = nbr_pj->d;
-                sbp_j = &(sbp[type_j]);
-                twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]);
-                self_coef = (i == j) ? 0.5 : 1.0;
-                /* H matrix entry */
-                //CHANGE ORIGINAL
-                //if (i > j) {
-                Tap = control->Tap7 * r_ij + control->Tap6;
-                Tap = Tap * r_ij + control->Tap5;
-                Tap = Tap * r_ij + control->Tap4;
-                Tap = Tap * r_ij + control->Tap3;
-                Tap = Tap * r_ij + control->Tap2;
-                Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;          
-                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-                H->entries[Htop].j = j;
-                H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
-                ++Htop;
-                //}
-                //CHANGE ORIGINAL
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && 
-                        nbr_pj->d <= control->hb_cut ) {
-                    // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                    jhb = sbp_j->p_hbond;
-                    if (ihb == 1 && jhb == 2) {
-                        if (i > j) {
-                            hbonds.select.hbond_list[ihb_top].nbr = j;
-                            hbonds.select.hbond_list[ihb_top].scl = 1;
-                            hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-                            //Auxilary data structures
-                            rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-                            hbonds.select.hbond_list[ihb_top].sym_index= -1;
-                            ++ihb_top;
-                            ++num_hbonds;
-                        } else {
-                            hbonds.select.hbond_list[ihb_top].nbr = j;
-                            hbonds.select.hbond_list[ihb_top].scl = -1;
-                            hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-                            //Auxilary data structures
-                            rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-                            hbonds.select.hbond_list[ihb_top].sym_index= -1;
-                            ++ihb_top;
-                            ++num_hbonds;
-                        }
-                    } else if (ihb == 2 && jhb == 1) { 
-                        hbonds.select.hbond_list[ihb_top].nbr = j; 
-                        hbonds.select.hbond_list[ihb_top].scl = 1; 
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-                        //TODO
-                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
-                        ++ihb_top;
-                        ++num_hbonds;
-                    } 
-                }
-                /* uncorrected bond orders */
-                if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
-                    r2 = SQR(r_ij);
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                    }
-                    else BO_s = C12 = 0.0;
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                        BO_pi = EXP( C34 );
-                    }
-                    else BO_pi = C34 = 0.0;
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
-                    }
-                    else BO_pi2 = C56 = 0.0;
-                    /* Initially BO values are the uncorrected ones, page 1 */
-                    BO = BO_s + BO_pi + BO_pi2;
-                    if( BO >= control->bo_cut ) {
-                        //CHANGE ORIGINAL
-                        num_bonds += 1;
-                        //CHANGE ORIGINAL
-                        /****** bonds i-j and j-i ******/
-                        /* Bond Order page2-3, derivative of total bond order prime */
-                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-                        if (i > j) 
-                        {
-                            ibond = &( bonds.select.bond_list[btop_i] );
-                            ibond->nbr = j;
-                            ibond->d = r_ij;
-                            rvec_Copy( ibond->dvec, nbr_pj->dvec );
-                            ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-                            //ibond->dbond_index = btop_i;
-                            //ibond->sym_index = btop_j;
-                            ++btop_i;
-                            bo_ij = &( ibond->bo_data );
-                            bo_ij->BO = BO;
-                            bo_ij->BO_s = BO_s;
-                            bo_ij->BO_pi = BO_pi;
-                            bo_ij->BO_pi2 = BO_pi2;
-                            //Auxilary data structures
-                            ibond->scratch = 0;
-                            ibond->CdDelta_ij = 0;
-                            rvec_MakeZero (ibond->f);
-                            ibond->l = -1;
-                            ibond->CdDelta_jk = 0;
-                            ibond->Cdbo_kl = 0;
-                            rvec_MakeZero (ibond->i_f);
-                            rvec_MakeZero (ibond->k_f);
-                            rvec_MakeZero (ibond->h_f);
-                            rvec_MakeZero (ibond->t_f);
-                            // Only dln_BOp_xx wrt. dr_i is stored here, note that 
-                            //     dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
-                            rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                            rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-                            rvec_Scale(bo_ij->dln_BOp_pi2,
-                                    -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-                            // Only dBOp wrt. dr_i is stored here, note that 
-                            //    dBOp/dr_i = -dBOp/dr_j and all others are 0 
-                            rvec_Scale( bo_ij->dBOp, 
-                                    -(bo_ij->BO_s * Cln_BOp_s + 
-                                        bo_ij->BO_pi * Cln_BOp_pi + 
-                                        bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-                            rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
-                            bo_ij->BO_s -= control->bo_cut;
-                            bo_ij->BO -= control->bo_cut;
-                            workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
-                            bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-                        } else if ( i < j )
-                        {
-                            rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
-                            rvec dBOp;
-                            btop_j = btop_i;
-                            jbond = &(bonds.select.bond_list[btop_j]);
-                            jbond->nbr = j;
-                            jbond->d = r_ij;
-                            rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-                            ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-                            btop_i ++;
-                            //jbond->dbond_index = btop_i;
-                            //jbond->sym_index = btop_i;
-                            bo_ji = &( jbond->bo_data );
-                            bo_ji->BO = BO;
-                            bo_ji->BO_s = BO_s;
-                            bo_ji->BO_pi = BO_pi;
-                            bo_ji->BO_pi2 = BO_pi2;
-                            //Auxilary data structures
-                            jbond->scratch = 0;
-                            jbond->CdDelta_ij = 0;
-                            rvec_MakeZero (jbond->f);
-                            jbond->l = -1;
-                            jbond->CdDelta_jk = 0;
-                            jbond->Cdbo_kl = 0;
-                            rvec_MakeZero (jbond->i_f);
-                            rvec_MakeZero (jbond->k_f);
-                            rvec_MakeZero (jbond->h_f);
-                            rvec_MakeZero (jbond->t_f);
-                            // Only dln_BOp_xx wrt. dr_i is stored here, note that 
-                            // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0
-                            rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
-                            rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
-                            rvec_Scale(dln_BOp_pi2,
-                                    -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
-                            rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
-                            rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
-                            rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
-                            // Only dBOp wrt. dr_i is stored here, note that 
-                            //    dBOp/dr_i = -dBOp/dr_j and all others are 0 
-                            rvec_Scale( dBOp, 
-                                    -(BO_s * Cln_BOp_s + 
-                                        BO_pi * Cln_BOp_pi + 
-                                        BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec );
-                            rvec_Scale( bo_ji->dBOp, -1., dBOp );
-                            rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp );
-                            bo_ji->BO_s -= control->bo_cut;
-                            bo_ji->BO -= control->bo_cut;
-                            workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
-                            bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-                        }
-                    } 
-                }
-            }
-        }
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = sbp[type_i].eta;
-        ++Htop;
-        H->end[i] = Htop;
-        Set_End_Index( i, btop_i, &bonds );
-        if( ihb == 1 || ihb == 2)
-            Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
-        //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
-        //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
-        //}
-        // mark the end of j list
-        //H->start[i] = Htop; 
-        /* validate lists - decide if reallocation is required! */
-        //Validate_Lists( workspace, lists, 
-        //      data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
-GLOBAL void Init_Forces_Tab ( reax_atom *atoms,         global_parameters g_params, control_params *control, 
-        single_body_parameters *sbp, two_body_parameters *tbp, 
-        simulation_data *data, simulation_box *box,    static_storage workspace,
-        list far_nbrs,             list bonds,                list hbonds, 
-        int N,                         int max_sparse_entries, int num_atom_types, 
-        LR_lookup_table *d_LR) 
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-    int tmin, tmax, r;
-    int ihb, jhb, ihb_top, jhb_top;
-    int flag;
-    real r_ij, r2, self_coef;
-    real val, dif, base;
-    real C12, C34, C56;
-    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-    real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;   
-    sparse_matrix *H;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    LR_lookup_table *t;
-    reax_atom *atom_i, *atom_j;
-    bond_data *ibond, *jbond;
-    bond_order_data *bo_ij, *bo_ji;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    H = &(workspace.H);
-    Htop = i * max_sparse_entries;
-    num_bonds = 0;
-    num_hbonds = 0;
-    btop_i = btop_j = 0;
-    p_boc1 = g_params.l[0];
-    p_boc2 = g_params.l[1];
-    //for( i = 0; i < system->N; ++i )
-    atom_i = &(atoms[i]);
-    type_i  = atom_i->type;
-    start_i = Start_Index(i, &far_nbrs);
-    end_i   = End_Index(i, &far_nbrs);
-    H->start[i] = Htop;
-    H->end[i] = Htop;
-    btop_i = End_Index( i, &bonds );
-    sbp_i = &(sbp[type_i]);
-    ihb = ihb_top = -1;
-    ihb = sbp_i->p_hbond;
-    if( control->hb_cut > 0 && (ihb==1 || ihb == 2))
-        ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
-    for( pj = start_i; pj < end_i; ++pj ) {
-        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-        j = nbr_pj->nbr;
-        atom_j = &(atoms[j]);
-        flag = 0;
-        if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-            if(nbr_pj->d <= control->r_cut)
-                flag = 1;
-            else flag = 0;
-        }
-        else if (i > j) {
-            if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-                nbr_pj->d = sqrt(nbr_pj->d);
-                flag = 1;
-            }
-        }
-        else if ( i < j) {
-            if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-                nbr_pj->d = sqrt(nbr_pj->d);
-                flag = 1;
-            }
-        }
-        if( flag ){    
-            type_j = atoms[j].type;
-            r_ij = nbr_pj->d;
-            sbp_j = &(sbp[type_j]);
-            twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
-            self_coef = (i == j) ? 0.5 : 1.0;
-            tmin  = MIN( type_i, type_j );
-            tmax  = MAX( type_i, type_j );
-            t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]);      
-            /* cubic spline interpolation */
-            //CHANGE ORIGINAL
-            //if (i > j) {
-            r = (int)(r_ij * t->inv_dx);
-            if( r == 0 )  ++r;
-            base = (real)(r+1) * t->dx;
-            dif = r_ij - base;
-            val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                t->ele[r].a;
-            val *= EV_to_KCALpMOL / C_ele;
-            H->entries[Htop].j = j;
-            H->entries[Htop].val = self_coef * val;
-            //H->j [Htop] = j;
-            //H->val [Htop] = self_coef * val;
-            ++Htop;
-            //}
-            //CHANGE ORIGINAL
-            /* hydrogen bond lists */ 
-            if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                    nbr_pj->d <= control->hb_cut ) {
-                // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                jhb = sbp_j->p_hbond;
-                if ( ihb == 1 && jhb == 2 ) {
-                    if (i > j) {
-                        hbonds.select.hbond_list[ihb_top].nbr = j;
-                        hbonds.select.hbond_list[ihb_top].scl = 1;
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-                        //Auxilary data structures
-                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
-                        ++ihb_top;
-                        ++num_hbonds;
-                    } else {
-                        hbonds.select.hbond_list[ihb_top].nbr = j;
-                        hbonds.select.hbond_list[ihb_top].scl = -1;
-                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-                        //Auxilary data structures
-                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
-                        ++ihb_top;
-                        ++num_hbonds;
-                    }
-                } else if (ihb == 2 && jhb == 1) {
-                    hbonds.select.hbond_list[ihb_top].nbr = j;
-                    hbonds.select.hbond_list[ihb_top].scl = 1;
-                    hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-                    //Auxilary data structures
-                    rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-                    hbonds.select.hbond_list[ihb_top].sym_index= -1;
-                    ++ihb_top;
-                    ++num_hbonds;
-                }
-            }
-            /* uncorrected bond orders */
-            if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
-                r2 = SQR(r_ij);
-                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                }
-                else BO_s = C12 = 0.0;
-                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                    BO_pi = EXP( C34 );
-                }
-                else BO_pi = C34 = 0.0;
-                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                    BO_pi2= EXP( C56 );
-                }
-                else BO_pi2 = C56 = 0.0;
-                /* Initially BO values are the uncorrected ones, page 1 */
-                BO = BO_s + BO_pi + BO_pi2;
-                if( BO >= control->bo_cut ) {
-                    //CHANGE ORIGINAL
-                    num_bonds += 1;
-                    //CHANGE ORIGINAL
-                    /****** bonds i-j and j-i ******/
-                    if ( i > j )
-                    {
-                        ibond = &( bonds.select.bond_list[btop_i] );
-                        ibond->nbr = j;
-                        ibond->d = r_ij;
-                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
-                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-                        //ibond->dbond_index = btop_i;
-                        //ibond->sym_index = btop_j;
-                        ++btop_i;
-                        bo_ij = &( ibond->bo_data );
-                        bo_ij->BO = BO;
-                        bo_ij->BO_s = BO_s;
-                        bo_ij->BO_pi = BO_pi;
-                        bo_ij->BO_pi2 = BO_pi2;
-                        //Auxilary data strucutres to resolve dependencies
-                        ibond->scratch = 0;
-                        ibond->CdDelta_ij = 0;
-                        rvec_MakeZero (ibond->f);
-                        ibond->l = -1;
-                        ibond->CdDelta_jk = 0;
-                        ibond->Cdbo_kl = 0;
-                        rvec_MakeZero (ibond->i_f);
-                        rvec_MakeZero (ibond->k_f);
-                        rvec_MakeZero (ibond->h_f);
-                        rvec_MakeZero (ibond->t_f);
-                        /* Bond Order page2-3, derivative of total bond order prime */
-                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
-                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi2,
-                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-                        /* Only dBOp wrt. dr_i is stored here, note that 
-                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                        rvec_Scale( bo_ij->dBOp, 
-                                -(bo_ij->BO_s * Cln_BOp_s + 
-                                    bo_ij->BO_pi * Cln_BOp_pi + 
-                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-                        rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
-                        bo_ij->BO_s -= control->bo_cut;
-                        bo_ij->BO -= control->bo_cut;
-                        workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
-                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-                    } 
-                    else {
-                        rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
-                        rvec dBOp;
-                        btop_j = btop_i;
-                        jbond = &( bonds.select.bond_list[btop_j] );
-                        jbond->nbr = j; 
-                        jbond->d = r_ij;
-                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-                        //jbond->dbond_index = btop_i;
-                        //jbond->sym_index = btop_i;
-                        ++btop_i;
-                        bo_ji = &( jbond->bo_data );
-                        bo_ji->BO = BO;
-                        bo_ji->BO_s = BO_s;
-                        bo_ji->BO_pi = BO_pi;
-                        bo_ji->BO_pi2 = BO_pi2;
-                        // Auxilary data structures to resolve dependencies
-                        jbond->scratch = 0;
-                        jbond->CdDelta_ij = 0;
-                        rvec_MakeZero (jbond->f);
-                        jbond->l = -1;
-                        jbond->CdDelta_jk = 0;
-                        jbond->Cdbo_kl = 0;
-                        rvec_MakeZero (jbond->i_f);
-                        rvec_MakeZero (jbond->k_f);
-                        rvec_MakeZero (jbond->h_f);
-                        rvec_MakeZero (jbond->t_f);
-                        // Bond Order page2-3, derivative of total bond order prime
-                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-                        // Only dln_BOp_xx wrt. dr_i is stored here, note that 
-                        //   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
-                        rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
-                        rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
-                        rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
-                        rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
-                        rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
-                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
-                        // Only dBOp wrt. dr_i is stored here, note that 
-                        //   dBOp/dr_i = -dBOp/dr_j and all others are 0
-                        //CHANGE ORIGINAL
-                        rvec_Scale( dBOp, 
-                                -(BO_s * Cln_BOp_s + 
-                                    BO_pi * Cln_BOp_pi + 
-                                    BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec);
-                        rvec_Scale( bo_ji->dBOp, -1., dBOp);
-                        //CHANGE ORIGINAL
-                        rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp );
-                        bo_ji->BO_s -= control->bo_cut;
-                        bo_ji->BO -= control->bo_cut;
-                        workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
-                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-                    }
-                }
-            }
-        }
-    }
-    H->entries[Htop].j = i;
-    H->entries[Htop].val = sbp[type_i].eta;
-    //H->j [Htop] = i;
-    //H->val [Htop] = sbp[type_i].eta;
-    ++Htop;
-    H->end[i] = Htop;
-    Set_End_Index( i, btop_i, &bonds );
-    if( ihb == 1  || ihb == 2)
-        Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
-GLOBAL void fix_sym_dbond_indices (list pbonds, int N)
-    int i, nbr;
-    bond_data *ibond, *jbond;
-    int atom_j;
-    list *bonds = &pbonds;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
-    {
-        ibond = &( bonds->select.bond_list [j] );    
-        nbr = ibond->nbr;
-        for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++)
-        {
-            jbond = &( bonds->select.bond_list[ k ] );
-            atom_j = jbond->nbr;
-            if ( (atom_j == i) )
-            {
-                if (i > nbr) {
-                    ibond->dbond_index = j; 
-                    jbond->dbond_index = j;
-                    ibond->sym_index = k;
-                    jbond->sym_index = j;
-                }
-            }
-        }
-    }
-GLOBAL void fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N)
-    static_storage *workspace = &p_workspace;
-    hbond_data *ihbond, *jhbond;
-    int nbr;
-    //int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4;
-    int i = (blockIdx.x);
-    int start = Start_Index (workspace->hbond_index[i], &hbonds);
-    int end = End_Index (workspace->hbond_index[i], &hbonds);
-    //int j = start + threadIdx.x;
-    //int j = start + (threadIdx.x % 16);
-    //for (int j = Start_Index (workspace->hbond_index[i], &hbonds); 
-    //        j < End_Index (workspace->hbond_index[i], &hbonds); j++)
-    int j = start + threadIdx.x;
-    while (j < end)
-        //for (int j = start; j < end; j++)
-    {
-        ihbond = &( hbonds.select.hbond_list [j] );
-        nbr = ihbond->nbr;
-        int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
-        int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
-        for (int k = nbrstart; k < nbrend; k++)
-            //k = nbrstart + threadIdx.x;
-            //while (k < nbrend)
-        {
-            jhbond = &( hbonds.select.hbond_list [k] );
-            if (jhbond->nbr == i){
-                ihbond->sym_index = k;
-                jhbond->sym_index = j;
-                break;
-            }
-            //k += blockDim.x;
-        }
-        j += 32;
-    }
-GLOBAL void New_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N )
-    static_storage *workspace = &p_workspace;
-    hbond_data *ihbond, *jhbond;
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warp_id = thread_id / __THREADS_PER_ATOM__;
-    int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
-    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-    if (warp_id >= N) return;
-    int i = warp_id;
-    int nbr;
-    int k;
-    int start = Start_Index (workspace->hbond_index[i], &hbonds);
-    int end = End_Index (workspace->hbond_index[i], &hbonds);
-    int j = start + lane_id;
-    //for (int j = start; j < end; j++)
-    while (j < end)
-    {
-        ihbond = &( hbonds.select.hbond_list [j] );
-        nbr = ihbond->nbr;
-        int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
-        int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
-        //k = nbrstart + lane_id;
-        //if (lane_id == 0) found [my_bucket] = 0;
-        //while (k < nbrend)
-        for (k = nbrstart; k < nbrend; k++)
-        {
-            jhbond = &( hbonds.select.hbond_list [k] );
-            if (jhbond->nbr == i){
-                ihbond->sym_index = k;
-                jhbond->sym_index = j;
-                break;
-            }
-        }
-        j += __THREADS_PER_ATOM__;
-    }
-void Init_Forces_Tab( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) {
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-    int tmin, tmax, r;
-    int ihb, jhb, ihb_top, jhb_top;
-    int flag;
-    real r_ij, r2, self_coef;
-    real val, dif, base;
-    real C12, C34, C56;
-    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-    real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2;   
-    sparse_matrix *H;
-    list *far_nbrs, *bonds, *hbonds;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    LR_lookup_table *t;
-    reax_atom *atom_i, *atom_j;
-    bond_data *ibond, *jbond;
-    bond_order_data *bo_ij, *bo_ji;
-    far_nbrs = *lists + FAR_NBRS;
-    bonds = *lists + BONDS;
-    hbonds = *lists + HBONDS;
-    H = &workspace->H;
-    Htop = 0;
-    num_bonds = 0;
-    num_hbonds = 0;
-    btop_i = btop_j = 0;
-    p_boc1 = system->reaxprm.gp.l[0];
-    p_boc2 = system->reaxprm.gp.l[1];
-    for( i = 0; i < system->N; ++i ) {
-        atom_i = &(system->atoms[i]);
-        type_i  = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        H->start[i] = Htop;
-        btop_i = End_Index( i, bonds );
-        sbp_i = &(system->reaxprm.sbp[type_i]);
-        ihb = ihb_top = -1;
-        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
-            ihb_top = End_Index( workspace->hbond_index[i], hbonds );
-        for( pj = start_i; pj < end_i; ++pj ) {
-            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(system->atoms[j]);
-            flag = 0;
-            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-                if(nbr_pj->d <= control->r_cut)
-                    flag = 1;
-                else flag = 0;
-            }
-            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-                            nbr_pj->dvec))<=SQR(control->r_cut)){
-                nbr_pj->d = sqrt(nbr_pj->d);
-                flag = 1;
-            }
-            if( flag ){    
-                type_j = system->atoms[j].type;
-                r_ij = nbr_pj->d;
-                sbp_j = &(system->reaxprm.sbp[type_j]);
-                twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] );      
-                /* cubic spline interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
-                dif = r_ij - base;
-                val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                    t->ele[r].a;
-                val *= EV_to_KCALpMOL / C_ele;
-                H->entries[Htop].j = j;
-                H->entries[Htop].val = self_coef * val;
-                ++Htop;
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
-                    // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                    jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 ) {
-                        hbonds->select.hbond_list[ihb_top].nbr = j;
-                        hbonds->select.hbond_list[ihb_top].scl = 1;
-                        hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
-                        ++ihb_top;
-                        ++num_hbonds;
-                    }
-                    else if( ihb == 2 && jhb == 1 ) {
-                        jhb_top = End_Index( workspace->hbond_index[j], hbonds );
-                        hbonds->select.hbond_list[jhb_top].nbr = i;
-                        hbonds->select.hbond_list[jhb_top].scl = -1;
-                        hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
-                        ++num_hbonds;
-                    }
-                }
-                /* uncorrected bond orders */
-                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
-                    r2 = SQR(r_ij);
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                    }
-                    else BO_s = C12 = 0.0;
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                        BO_pi = EXP( C34 );
-                    }
-                    else BO_pi = C34 = 0.0;
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
-                    }
-                    else BO_pi2 = C56 = 0.0;
-                    /* Initially BO values are the uncorrected ones, page 1 */
-                    BO = BO_s + BO_pi + BO_pi2;
-                    if( BO >= control->bo_cut ) {
-                        num_bonds += 2;
-                        /****** bonds i-j and j-i ******/
-                        ibond = &( bonds->select.bond_list[btop_i] );
-                        btop_j = End_Index( j, bonds );
-                        jbond = &(bonds->select.bond_list[btop_j]);
-                        ibond->nbr = j;
-                        jbond->nbr = i;
-                        ibond->d = r_ij;
-                        jbond->d = r_ij;
-                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
-                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-                        ibond->dbond_index = btop_i;
-                        jbond->dbond_index = btop_i;
-                        ibond->sym_index = btop_j;
-                        jbond->sym_index = btop_i;
-                        ++btop_i;
-                        Set_End_Index( j, btop_j+1, bonds );
-                        bo_ij = &( ibond->bo_data );
-                        bo_ji = &( jbond->bo_data );
-                        bo_ji->BO = bo_ij->BO = BO;
-                        bo_ji->BO_s = bo_ij->BO_s = BO_s;
-                        bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
-                        bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
-                        /* Bond Order page2-3, derivative of total bond order prime */
-                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
-                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-                        rvec_Scale(bo_ij->dln_BOp_pi2,
-                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-                        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-                        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
-                        /* Only dBOp wrt. dr_i is stored here, note that 
-                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-                        rvec_Scale( bo_ij->dBOp, 
-                                -(bo_ij->BO_s * Cln_BOp_s + 
-                                    bo_ij->BO_pi * Cln_BOp_pi + 
-                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-                        rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
-                        rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
-                        rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
-                        bo_ij->BO_s -= control->bo_cut;
-                        bo_ij->BO -= control->bo_cut;
-                        bo_ji->BO_s -= control->bo_cut;
-                        bo_ji->BO -= control->bo_cut;
-                        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
-                        workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-                        Set_End_Index( j, btop_j+1, bonds );
-                    }
-                }
-            }
-        }
-        H->entries[Htop].j = i;
-        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
-        ++Htop;
-        Set_End_Index( i, btop_i, bonds );
-        if( ihb == 1 )
-            Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-    }
-    // mark the end of j list
-    H->start[i] = Htop; 
-    /* validate lists - decide if reallocation is required! */
-    Validate_Lists( workspace, lists, 
-            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-            data->step, Htop, num_bonds, num_hbonds );
-    //Print_Bonds( system, bonds, "sbonds.out" );
-    //Print_Bond_List2( system, bonds, "sbonds.out" );
-    //Print_Sparse_Matrix2( H, "H.out" );
-void Estimate_Storage_Sizes( reax_system *system, control_params *control, 
-        list **lists, int *Htop, int *hb_top, 
-        int *bond_top, int *num_3body ) {
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int ihb, jhb;
-    real r_ij, r2;
-    real C12, C34, C56;
-    real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2; 
-    list *far_nbrs;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-    far_nbrs = *lists + FAR_NBRS;
-    p_boc1 = system->reaxprm.gp.l[0];
-    p_boc2 = system->reaxprm.gp.l[1];
-    for( i = 0; i < system->N; ++i ) {
-        atom_i = &(system->atoms[i]);
-        type_i  = atom_i->type;
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        sbp_i = &(system->reaxprm.sbp[type_i]);
-        ihb = sbp_i->p_hbond;
-        for( pj = start_i; pj < end_i; ++pj ) {
-            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            atom_j = &(system->atoms[j]);
-            type_j = atom_j->type;
-            sbp_j = &(system->reaxprm.sbp[type_j]);
-            twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-            if( nbr_pj->d <= control->r_cut ) {
-                ++(*Htop);
-                /* hydrogen bond lists */ 
-                if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && 
-                        nbr_pj->d <= control->hb_cut ) {
-                    jhb = sbp_j->p_hbond;
-                    if( ihb == 1 && jhb == 2 )
-                        ++hb_top[i];
-                    else if( ihb == 2 && jhb == 1 )
-                        ++hb_top[j];
-                }
-                /* uncorrected bond orders */
-                if( nbr_pj->d <= control->nbr_cut ) {
-                    r_ij = nbr_pj->d;
-                    r2 = SQR(r_ij);
-                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                    }
-                    else BO_s = C12 = 0.0;
-                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                        BO_pi = EXP( C34 );
-                    }
-                    else BO_pi = C34 = 0.0;
-                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                        BO_pi2= EXP( C56 );
-                    }
-                    else BO_pi2 = C56 = 0.0;
-                    /* Initially BO values are the uncorrected ones, page 1 */
-                    BO = BO_s + BO_pi + BO_pi2;
-                    if( BO >= control->bo_cut ) {
-                        ++bond_top[i];
-                        ++bond_top[j];
-                    }
-                }
-            }
-        }
-    }
-    *Htop += system->N;
-    *Htop *= SAFE_ZONE;
-    for( i = 0; i < system->N; ++i ) {
-        hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
-        *num_3body += SQR(bond_top[i]);
-        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
-    }
-    *num_3body *= SAFE_ZONE;
-void Cuda_Estimate_Storage_Sizes (reax_system *system, control_params *control, int *output)
-    int *Htop, *num_3body, input_size;
-    int *hb_top, *bond_top;
-    int *input = (int *) scratch;
-    int max_3body = 0;
-    Htop = 0;
-    num_3body = 0;
-    input_size = INT_SIZE * (2 * system->N + 1 + 1);
-    //cuda_malloc ((void **) &input, input_size, 1, __LINE__);
-    cuda_memset (input, 0, input_size, RES_SCRATCH );
-    Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>>
-        (system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-         system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), 
-         system->reaxprm.num_atom_types, input);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ );
-    Htop = &output[0];
-    num_3body  = &output[1];
-    hb_top = &output[ 2 ];
-    bond_top = &output[ 2 + system->N ];
-    *Htop += system->N;
-    *Htop *= SAFE_ZONE;
-    for( int i = 0; i < system->N; ++i ) {
-        hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
-        if (max_3body <= SQR (bond_top[i]))
-            max_3body = SQR (bond_top[i]);
-        *num_3body += SQR(bond_top[i]);
-        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
-    }
-    *num_3body = max_3body * SAFE_ZONE;
-GLOBAL void Estimate_Storage_Sizes     (reax_atom *atoms, 
-        int N,
-        single_body_parameters *sbp,
-        two_body_parameters *tbp,
-        global_parameters gp, 
-        control_params *control, 
-        list far_nbrs,
-        int num_atom_types, int *results)
-    int *Htop = &results[0];
-    int *num_3body  = &results[1];
-    int *hb_top = &results [ 2 ];
-    int *bond_top = &results [ 2 + N ];
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    int ihb, jhb;
-    real r_ij, r2;
-    real C12, C34, C56;
-    real BO, BO_s, BO_pi, BO_pi2;
-    real p_boc1, p_boc2; 
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    reax_atom *atom_i, *atom_j;
-    p_boc1 = gp.l[0];
-    p_boc2 = gp.l[1];
-    //for( i = 0; i < N; ++i ) {
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N ) return ;
-    atom_i = &(atoms[i]);
-    type_i  = atom_i->type;
-    start_i = Start_Index(i, &far_nbrs);
-    end_i   = End_Index(i, &far_nbrs);
-    sbp_i = &(sbp[type_i]);
-    ihb = sbp_i->p_hbond;
-    for( pj = start_i; pj < end_i; ++pj ) {
-        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-        j = nbr_pj->nbr;
-        atom_j = &( atoms[j] );
-        type_j = atom_j->type;
-        sbp_j = &( sbp[type_j] );
-        twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
-        if( nbr_pj->d <= control->r_cut ) {
-            //++(*Htop);
-            atomicAdd (Htop, 1);
-            /* hydrogen bond lists */ 
-            //TODO - CHANGE ORIGINAL
-            if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-                    nbr_pj->d <= control->hb_cut ) {
-                jhb = sbp_j->p_hbond;
-                if( ihb == 1 && jhb == 2 )
-                    //++hb_top[i];
-                    atomicAdd (&hb_top[i], 1);
-                else if( ihb == 2 && jhb == 1 )
-                    //++hb_top[j];
-                    //atomicAdd (&hb_top[j], 1);
-                    atomicAdd (&hb_top[i], 1);
-            }
-            //TODO -- CHANGE ORIGINAL
-            //CHANGE ORIGINAL
-            if (i < j) continue;
-            //CHANGE ORIGINAL
-            /* uncorrected bond orders */
-            if( nbr_pj->d <= control->nbr_cut ) {
-                r_ij = nbr_pj->d;
-                r2 = SQR(r_ij);
-                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-                }
-                else BO_s = C12 = 0.0;
-                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-                    BO_pi = EXP( C34 );
-                }
-                else BO_pi = C34 = 0.0;
-                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
-                    BO_pi2= EXP( C56 );
-                }
-                else BO_pi2 = C56 = 0.0;
-                /* Initially BO values are the uncorrected ones, page 1 */
-                BO = BO_s + BO_pi + BO_pi2;
-                if( BO >= control->bo_cut ) {
-                    //++bond_top[i];
-                    //++bond_top[j];
-                    atomicAdd (&bond_top[i], 1);
-                    atomicAdd (&bond_top[j], 1);
-                }
-            }
-        }
-    }
-    //}
-void Cuda_Compute_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list** lists, output_controls *out_control )
-    real t_start, t_elapsed;
-    real t_1, t_2;
-    int *indices;
-    int *Htop;
-    int max_sparse_entries = 0;
-    list *far_nbrs = dev_lists + FAR_NBRS;
-    int hblocks;
-    t_start = Get_Time ();
-    if ( !control->tabulate ) {
-        Init_Forces <<<BLOCKS, BLOCK_SIZE>>>
-            (system->d_atoms,         system->reaxprm.d_gp, (control_params *)control->d_control, 
-             system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-             (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace,
-             *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), 
-             system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); 
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-    else 
-    {
-        Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>>
-            ( system->d_atoms,         system->reaxprm.d_gp, (control_params *)control->d_control, 
-              system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-              (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box,  *dev_workspace,
-              *(dev_lists + FAR_NBRS),     *(dev_lists + BONDS), *(dev_lists + HBONDS), 
-              system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, 
-              d_LR );
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-    /*This is for bonds processing to fix dbond and sym_indexes */
-    t_1 = Get_Time ();
-    fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    t_2 = Get_Timing_Info ( t_1 );
-    //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds.
-    if ((control->hb_cut > 0) && (dev_workspace->num_H > 0))
-    {
-        hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + 
-            ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1);
-        t_1 = Get_Time ();
-        /*
-           int bs = system->N;
-           int ss = 32;
-           fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
-         */
-        New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-    }
-    t_2 = Get_Timing_Info ( t_1 );
-    t_elapsed = Get_Timing_Info (t_start);
-    d_timing.init_forces+= t_elapsed;
-    Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N,
-            system->num_bonds, system->num_hbonds );
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, "Done with Cuda List Validation \n");
-    //Bonded Force Calculations here.
-    t_start = Get_Time ();
-    Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-    t_elapsed = Get_Timing_Info (t_start);
-    d_timing.bonded += t_elapsed;
-    //Compute the Non Bonded Forces here. 
-    t_start = Get_Time ();
-    Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control );
-    t_elapsed = Get_Timing_Info (t_start);
-    d_timing.nonb += t_elapsed;
-    //Compute Total Forces here
-    Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>>
-        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
-         *(dev_lists + BONDS), control->ensemble, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>>
-        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
-         *(dev_lists + BONDS), control->ensemble, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-void Compute_Forces( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list** lists, output_controls *out_control )
-    real t_start, t_elapsed;
-    t_start = Get_Time( );
-    if( !control->tabulate )
-        Init_Forces( system, control, data, workspace, lists, out_control );
-    else Init_Forces_Tab( system, control, data, workspace, lists, out_control );
-    t_elapsed = Get_Timing_Info( t_start );
-    data->timing.init_forces += t_elapsed;
-#if defined(DEBUG_FOCUS)
-    print_sparse_matrix (system, workspace);
-    fprintf( stderr, "init_forces - ");
-    //analyze_hbonds (system, workspace, lists);
-    t_start = Get_Time( );
-    Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-    t_elapsed = Get_Timing_Info( t_start );
-    data->timing.bonded += t_elapsed;
-    //print_bond_list (system, workspace, lists);
-    //exit (0);
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "bonded_forces - ");
-    t_start = Get_Time( );
-    Compute_NonBonded_Forces( system, control, data, workspace, 
-            lists, out_control );
-    t_elapsed = Get_Timing_Info( t_start );
-    data->timing.nonb += t_elapsed;
-#ifdef __DEBUG_CUDA__
-    fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed);
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "nonbondeds - ");
-    Compute_Total_Force( system, control, data, workspace, lists );
-    //Print_Total_Force( system, control, data, workspace, lists, out_control );
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "totalforces - ");
-    //Print_Total_Force( system, control, data, workspace, lists, out_control );
-    Print_Total_Force( system, control, data, workspace, lists, out_control );
-    Compare_Total_Forces( system, control, data, workspace, lists, out_control );
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "forces - ");
-bool validate_device (reax_system *system, simulation_data *data, static_storage *workspace, list **lists )
-    bool retval = false;
-#ifdef __BUILD_DEBUG__
-    retval |= validate_neighbors (system, lists);
-    retval |= validate_sym_dbond_indices (system, workspace, lists);
-    retval |= validate_bonds (system, workspace, lists);
-    retval |= validate_sparse_matrix (system, workspace);
-    retval |= validate_three_bodies (system, workspace, lists );
-    retval |= validate_hbonds (system, workspace, lists);
-    retval |= validate_workspace (system, workspace, lists);
-    retval |= validate_data (system, data);
-    retval |= validate_atoms (system, lists);
-    //analyze_hbonds (system, workspace, lists);
-    if (!retval) {
-        fprintf (stderr, "Results *DOES NOT* mattch between device and host \n");
-    }
-    return retval;
diff --git a/PuReMD-GPU/src/forces.h b/PuReMD-GPU/src/forces.h
index 10ac0ee9db54e411b9527429b9da6733d1d4072d..73323f0419baf383d6bf671158ef85584a710728 100644
--- a/PuReMD-GPU/src/forces.h
+++ b/PuReMD-GPU/src/forces.h
@@ -28,21 +28,7 @@ void Init_Bonded_Force_Functions( control_params* );
 void Compute_Forces( reax_system*, control_params*, simulation_data*,
                      static_storage*, list**, output_controls* );
-void Cuda_Compute_Forces( reax_system*, control_params*, simulation_data*,
-                          static_storage*, list**, output_controls* );
 void Estimate_Storage_Sizes( reax_system*, control_params*, list**,
                              int*, int*, int*, int* );
-void Cuda_Estimate_Storage_Sizes (reax_system *, control_params *, int *);
-GLOBAL void Estimate_Storage_Sizes  (reax_atom *, int , single_body_parameters *,
-                                     two_body_parameters *, global_parameters ,
-                                     control_params *, list , int , int *);
-GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *, control_params *,
-        simulation_data *, simulation_box *, list , int , int *);
-void Cuda_Threebody_List( reax_system *, static_storage *, list *, int );
-bool validate_device (reax_system *, simulation_data *, static_storage *, list **);
diff --git a/PuReMD-GPU/src/four_body_interactions.c b/PuReMD-GPU/src/four_body_interactions.c
new file mode 100644
index 0000000000000000000000000000000000000000..c51601fa991203a77ec4840c10e74e15cfa42c87
--- /dev/null
+++ b/PuReMD-GPU/src/four_body_interactions.c
@@ -0,0 +1,677 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "four_body_interactions.h"
+#include "bond_orders.h"
+#include "box.h"
+#include "list.h"
+#include "lookup.h"
+#include "vector.h"
+#include "math.h"
+#include "index_utils.h"
+real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk, /* returns omega; assembles the four dcos_omega_d* vectors passed in */
+        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+        three_body_interaction_data *p_ijk, 
+        three_body_interaction_data *p_jkl, 
+        rvec dcos_omega_di, rvec dcos_omega_dj, 
+        rvec dcos_omega_dk, rvec dcos_omega_dl, 
+        output_controls *out_control ) /* out_control is referenced only by the commented-out debug prints below */
+    real unnorm_cos_omega, unnorm_sin_omega, omega;
+    real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
+    real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe;
+    real arg, poem, tel;
+    rvec cross_jk_kl;
+    sin_ijk = SIN( p_ijk->theta ); /* sines and cosines of the theta values stored in the two three-body records */
+    cos_ijk = COS( p_ijk->theta );
+    sin_jkl = SIN( p_jkl->theta );
+    cos_jkl = COS( p_jkl->theta );
+    /* omega: atan2 of an unnormalized sine term and an unnormalized cosine term,
+       both built from dvec_ij, dvec_jk, dvec_kl and r_jk */
+    unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) +
+        SQR( r_jk ) *  rvec_Dot( dvec_ij,dvec_kl );
+    rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
+    unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
+    omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
+    /* derivatives: scalar coefficients used further down to assemble dcos_omega_d{i,j,k,l} */
+    /* coef for adjustments to cos_theta's */
+    /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li;
+       coshd = cos_ijk, coshe = cos_jkl;
+       sinhd = sin_ijk, sinhe = sin_jkl; */
+    htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk );
+    htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl;
+    htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk );
+    hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl );
+    hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk );
+    hnra = r_kl * sin_ijk * sin_jkl;
+    hnrc = r_ij * sin_ijk * sin_jkl;
+    hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
+    hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
+    poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
+    if( poem < 1e-20 ) poem = 1e-20; /* any poem below 1e-20 (negative values included) becomes 1e-20; poem is a divisor below */
+    tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - 
+        2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
+                r_jk * r_kl * cos_jkl );
+    arg  = tel / poem; /* clamped to [-1, 1] by the two checks that follow */
+    if( arg >  1.0 )
+    {
+        arg =  1.0;
+    }
+    if( arg < -1.0 )
+    {
+        arg = -1.0;
+    }
+    /*fprintf( out_control->etor, 
+      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+      htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
+      r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
+      r_ij, r_jk, r_kl, r_li ); 
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", 
+      cos_ijk, cos_jkl, sin_ijk, sin_jkl ); 
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      poem, tel, arg );*/
+    /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+       -p_ijk->dcos_dk[0]/sin_ijk, 
+       -p_ijk->dcos_dk[1]/sin_ijk, 
+       -p_ijk->dcos_dk[2]/sin_ijk );
+       fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+       -p_jkl->dcos_dk[0]/sin_jkl, 
+       -p_jkl->dcos_dk[1]/sin_jkl, 
+       -p_jkl->dcos_dk[2]/sin_jkl );*/
+    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) /* keep |sin_ijk| at least MIN_SINE (sign preserved, 0 maps to +MIN_SINE) before it is used as a divisor; the h* coefficients above used the unclamped value */
+    {
+        sin_ijk = MIN_SINE;
+    }
+    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE )
+    {
+        sin_ijk = -MIN_SINE;
+    }
+    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) /* same magnitude floor for sin_jkl */
+    {
+        sin_jkl = MIN_SINE;
+    }
+    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE )
+    {
+        sin_jkl = -MIN_SINE;
+    }
+    // dcos_omega_di: combines dvec_ij, dvec_li and p_ijk->dcos_dk, then scales by 2/poem
+    rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
+    rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di );
+    // dcos_omega_dj: combines dvec_ij, dvec_jk, p_ijk->dcos_dj and p_jkl->dcos_di, then scales by 2/poem
+    rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
+            -htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj );
+    rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di );
+    rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
+    // dcos_omega_dk: combines dvec_kl, dvec_jk, p_ijk->dcos_di and p_jkl->dcos_dj, then scales by 2/poem
+    rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl,  
+            htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di );
+    rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj );
+    rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
+    // dcos_omega_dl: combines dvec_kl, dvec_li and p_jkl->dcos_dk, then scales by 2/poem
+    rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk );
+    rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
+    return omega;  
+    //return arg;  (disabled alternative: return the clamped ratio tel/poem instead of omega)
+void Four_Body_Interactions( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
+    int i, j, k, l, pi, pj, pk, pl, pij, plk;
+    int type_i, type_j, type_k, type_l;
+    int start_j, end_j, start_k, end_k;
+    int start_pj, end_pj, start_pk, end_pk;
+    int num_frb_intrs = 0;
+    real Delta_j, Delta_k;
+    real r_ij, r_jk, r_kl, r_li;
+    real BOA_ij, BOA_jk, BOA_kl;
+    real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
+    real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
+    real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
+    real fn10, f11_DjDk, dfn11, fn12;
+    real theta_ijk, theta_jkl;
+    real sin_ijk, sin_jkl;
+    real cos_ijk, cos_jkl;
+    real tan_ijk_i, tan_jkl_i;
+    real omega, cos_omega, cos2omega, cos3omega;
+    rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
+    real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
+    real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
+    real Cconj, CEconj1, CEconj2, CEconj3;
+    real CEconj4, CEconj5, CEconj6;
+    real e_tor, e_con;
+    rvec dvec_li;
+    rvec force, ext_press;
+    ivec rel_box_jl;
+    // rtensor total_rtensor, temp_rtensor;
+    four_body_header *fbh;
+    four_body_parameters *fbp;
+    bond_data *pbond_ij, *pbond_jk, *pbond_kl;
+    bond_order_data *bo_ij, *bo_jk, *bo_kl;
+    three_body_interaction_data *p_ijk, *p_jkl;
+    real p_tor2 = system->reaxprm.gp.l[23];
+    real p_tor3 = system->reaxprm.gp.l[24];
+    real p_tor4 = system->reaxprm.gp.l[25];
+    real p_cot2 = system->reaxprm.gp.l[27];
+    list *bonds = (*lists) + BONDS;
+    list *thb_intrs = (*lists) + THREE_BODIES;
+    for( j = 0; j < system->N; ++j ) {
+        type_j = system->atoms[j].type;
+        Delta_j = workspace->Delta_boc[j];
+        start_j = Start_Index(j, bonds);
+        end_j = End_Index(j, bonds);
+        for( pk = start_j; pk < end_j; ++pk ) {
+            pbond_jk = &( bonds->select.bond_list[pk] );
+            k = pbond_jk->nbr;
+            bo_jk = &( pbond_jk->bo_data );
+            BOA_jk = bo_jk->BO - control->thb_cut;
+            /* see if there are any 3-body interactions involving j&k
+               where j is the central atom. Otherwise there is no point in
+               trying to form a 4-body interaction out of this neighborhood */    
+            if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
+                    Num_Entries(pk, thb_intrs) ) {
+                start_k = Start_Index(k, bonds);
+                end_k = End_Index(k, bonds);                   
+                pj = pbond_jk->sym_index; // pj points to j on k's list
+                /* do the same check as above: are there any 3-body interactions 
+                   involving k&j where k is the central atom */
+                if( Num_Entries(pj, thb_intrs) ) {
+                    type_k = system->atoms[k].type;
+                    Delta_k = workspace->Delta_boc[k];
+                    r_jk = pbond_jk->d;
+                    start_pk = Start_Index(pk, thb_intrs );
+                    end_pk = End_Index(pk, thb_intrs );
+                    start_pj = Start_Index(pj, thb_intrs );
+                    end_pj = End_Index(pj, thb_intrs );        
+                    exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
+                    exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
+                    exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
+                    exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
+                    exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
+                    f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+                    /* pick i up from j-k interaction where j is the centre atom */
+                    for( pi = start_pk; pi < end_pk; ++pi ) {
+                        p_ijk = &( thb_intrs->select.three_body_list[pi] );
+                        pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
+                        pbond_ij = &( bonds->select.bond_list[pij] );
+                        bo_ij = &( pbond_ij->bo_data );
+                        if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                            i = p_ijk->thb;
+                            type_i = system->atoms[i].type;
+                            r_ij = pbond_ij->d;
+                            BOA_ij = bo_ij->BO - control->thb_cut;
+                            theta_ijk = p_ijk->theta;
+                            sin_ijk = SIN( theta_ijk );
+                            cos_ijk = COS( theta_ijk );
+                            //tan_ijk_i = 1. / TAN( theta_ijk );
+                            if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                                tan_ijk_i = cos_ijk / MIN_SINE;
+                            else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                                tan_ijk_i = cos_ijk / -MIN_SINE;
+                            else tan_ijk_i = cos_ijk / sin_ijk;
+                            exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
+                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
+                            /* pick l up from j-k intr. where k is the centre */
+                            for( pl = start_pj; pl < end_pj; ++pl ) {
+                                p_jkl = &( thb_intrs->select.three_body_list[pl] );
+                                l = p_jkl->thb;
+                                plk = p_jkl->pthb; //pointer to l on k's bond_list!
+                                pbond_kl = &( bonds->select.bond_list[plk] );
+                                bo_kl = &( pbond_kl->bo_data );
+                                type_l = system->atoms[l].type;
+                                fbh = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types ) ]);
+                                fbp = &(system->reaxprm.fbp[ index_fbp(type_i,type_j,type_k,type_l,system->reaxprm.num_atom_types )].prm[0]);
+                                if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
+                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                                    ++num_frb_intrs;
+                                    r_kl = pbond_kl->d;
+                                    BOA_kl = bo_kl->BO - control->thb_cut;
+                                    theta_jkl = p_jkl->theta;
+                                    sin_jkl = SIN( theta_jkl );
+                                    cos_jkl = COS( theta_jkl );
+                                    //tan_jkl_i = 1. / TAN( theta_jkl );
+                                    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                        tan_jkl_i = cos_jkl / MIN_SINE;
+                                    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                        tan_jkl_i = cos_jkl / -MIN_SINE;
+                                    else tan_jkl_i = cos_jkl /sin_jkl;
+                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, 
+                                            &(system->box), dvec_li );
+                                    r_li = rvec_Norm( dvec_li );
+                                    /* omega and its derivative */
+                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
+                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
+                                            r_jk, pbond_kl->dvec, r_kl,
+                                            dvec_li, r_li, p_ijk, p_jkl,
+                                            dcos_omega_di, dcos_omega_dj,
+                                            dcos_omega_dk, dcos_omega_dl,
+                                            out_control);
+                                    cos_omega = COS( omega );
+                                    cos2omega = COS( 2. * omega );
+                                    cos3omega = COS( 3. * omega );
+                                    /* end omega calculations */
+                                    /* torsion energy */
+                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
+                                    exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
+                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
+                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
+                                        (1.0 - exp_tor2_kl);
+                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
+                                            fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                            fbp->V3 * (1.0 + cos3omega) );
+                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
+                                    //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
+                                    //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
+                                    data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                    dfn11 = (-p_tor3 * exp_tor3_DjDk +
+                                            (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                            (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
+                                    CEtors1 = sin_ijk * sin_jkl * CV;
+                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
+                                        (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
+                                        sin_ijk * sin_jkl; 
+                                    CEtors3 = CEtors2 * dfn11;
+                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
+                                        (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
+                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
+                                    CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
+                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
+                                    cmn = -fn10 * CV;
+                                    CEtors7 = cmn * sin_jkl * tan_ijk_i;
+                                    CEtors8 = cmn * sin_ijk * tan_jkl_i;
+                                    CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                        (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                         1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
+                                    //cmn = -fn10 * CV;
+                                    //CEtors7 = cmn * sin_jkl * cos_ijk;
+                                    //CEtors8 = cmn * sin_ijk * cos_jkl;
+                                    //CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                    //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                    //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
+                                    /* end  of torsion energy */
+                                    /* 4-body conjugation energy */
+                                    fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
+                                    data->E_Con += e_con = fbp->p_cot1 * fn12 * 
+                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
+                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                    CEconj1 = Cconj * (BOA_ij - 1.5e0);
+                                    CEconj2 = Cconj * (BOA_jk - 1.5e0);
+                                    CEconj3 = Cconj * (BOA_kl - 1.5e0);
+                                    CEconj4 = -fbp->p_cot1 * fn12 * 
+                                        (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                    CEconj5 = -fbp->p_cot1 * fn12 * 
+                                        (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                    //CEconj4 = -fbp->p_cot1 * fn12 * 
+                                    //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
+                                    //CEconj5 = -fbp->p_cot1 * fn12 * 
+                                    //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
+                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
+                                        cos_omega * sin_ijk * sin_jkl;
+                                    /* end 4-body conjugation energy */
+                                    //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
+                                    //   workspace->orig_id[i], workspace->orig_id[j],
+                                    //       workspace->orig_id[k], workspace->orig_id[l], 
+                                    //    omega, cos_omega, cos2omega, cos3omega );
+                                    //fprintf(stdout, 
+                                    //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                    //    CEtors2, CEtors3, CEtors4, CEtors5, 
+                                    //    CEtors6, CEtors7, CEtors8, CEtors9 );
+                                    //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                    //    theta_ijk, theta_jkl, sin_ijk, 
+                                    //    sin_jkl, cos_jkl, tan_jkl_i );
+                                    /* forces */
+                                    bo_jk->Cdbopi += CEtors2;
+                                    workspace->CdDelta[j] += CEtors3;
+                                    workspace->CdDelta[k] += CEtors3;
+                                    bo_ij->Cdbo += (CEtors4 + CEconj1);
+                                    bo_jk->Cdbo += (CEtors5 + CEconj2);
+                                    bo_kl->Cdbo += (CEtors6 + CEconj3);
+                                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                        /* dcos_theta_ijk */
+                                        rvec_ScaledAdd( system->atoms[i].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_di );
+                                        /* dcos_theta_jkl */
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[k].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[l].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                        /* dcos_omega */
+                                        rvec_ScaledAdd( system->atoms[i].f, 
+                                                CEtors9 + CEconj6, dcos_omega_di );
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dk );
+                                        rvec_ScaledAdd( system->atoms[l].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dl );
+                                    }
+                                    else {
+                                        ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
+                                        /* dcos_theta_ijk */
+                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                        rvec_Add( system->atoms[i].f, force );
+                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                        rvec_Add( system->atoms[k].f, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        /* dcos_theta_jkl */
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                        rvec_Add( system->atoms[k].f, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                        rvec_Add( system->atoms[l].f, force );
+                                        rvec_iMultiply( ext_press, rel_box_jl, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        /* dcos_omega */                      
+                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
+                                        rvec_Add( system->atoms[i].f, force );
+                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
+                                        rvec_Add( system->atoms[k].f, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
+                                        rvec_Add( system->atoms[l].f, force );
+                                        rvec_iMultiply( ext_press, rel_box_jl, force );
+                                        rvec_Add( data->ext_press, ext_press );
+                                        /* This part is intended for a fully-flexible box */
+                                        /* rvec_ScaledSum( temp_rvec, 
+                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
+                                           CEtors9 + CEconj6, dcos_omega_di );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[i].x );
+                                           rtensor_Copy( total_rtensor, temp_rtensor );
+                                           rvec_ScaledSum( temp_rvec, 
+                                           CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
+                                           CEtors8 + CEconj5, p_jkl->dcos_di );
+                                           rvec_ScaledAdd( temp_rvec, 
+                                           CEtors9 + CEconj6, dcos_omega_dj );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[j].x );
+                                           rtensor_Add( total_rtensor, temp_rtensor );
+                                           rvec_ScaledSum( temp_rvec, 
+                                           CEtors7 + CEconj4, p_ijk->dcos_di,      // k
+                                           CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                           rvec_ScaledAdd( temp_rvec, 
+                                           CEtors9 + CEconj6, dcos_omega_dk );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[k].x );
+                                           rtensor_Add( total_rtensor, temp_rtensor );
+                                           rvec_ScaledSum( temp_rvec, 
+                                           CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
+                                           CEtors9 + CEconj6, dcos_omega_dl );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[l].x );
+                                           rtensor_Copy( total_rtensor, temp_rtensor );
+                                           if( pbond_ij->imaginary || pbond_jk->imaginary || 
+                                           pbond_kl->imaginary )
+                                           rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
+                                           else
+                                           rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                    }
+                                    /*fprintf( out_control->etor, 
+                                    //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                    //r_ij, r_jk, r_kl, 
+                                    "%12.8f%12.8f%12.8f%12.8f\n",
+                                    cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
+                                    // fprintf( out_control->etor, "%12.8f\n", dfn11 );
+                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
+                                            fn10, cos_omega, CV );
+                                    fprintf( out_control->etor, 
+                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                            CEtors2, CEtors3, CEtors4, CEtors5, 
+                                            CEtors6, CEtors7, CEtors8, CEtors9 );
+                                    /* fprintf( out_control->etor, 
+                                       "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
+                                    fprintf( out_control->etor, 
+                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                            CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
+                                    /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
+                                       fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
+                                    fprintf( out_control->etor, 
+                                            //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
+                                            "%6d%6d%6d%6d%12.8f%12.8f\n", 
+                                            workspace->orig_id[i], workspace->orig_id[j], 
+                                            workspace->orig_id[k], workspace->orig_id[l], 
+                                            e_tor, e_con );
+                                    //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
+                                    fprintf( out_control->econ, 
+                                            "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
+                                            workspace->orig_id[i], workspace->orig_id[j], 
+                                            workspace->orig_id[k], workspace->orig_id[l], 
+                                            RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
+                                            e_con,data->E_Con );
+                                    /* fprintf( out_control->etor, 
+                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",       
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
+                                    /* fprintf( out_control->etor, 
+                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
+                                    fprintf( out_control->etor, 
+                                            "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                            dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
+                                            dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
+                                            dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
+                                            dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
+                                    // Torsion Forces 
+                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
+                                            workspace->f_tor, workspace->f_tor);
+                                    Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
+                                    Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
+                                    Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
+                                    Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
+                                    Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
+                                    rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
+                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
+                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
+                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
+                                    rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
+                                    rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
+                                    rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
+                                    rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
+                                    // Conjugation Forces 
+                                    Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
+                                    Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
+                                    Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
+                                    rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
+                                    rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
+                                    rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
+                                    rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
+                                    rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
+                                    rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
+                                    rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
+                                    rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
+                                } // pl check ends
+                            } // pl loop ends
+                        } // pi check ends
+                    } // pi loop ends
+                } // k-j neighbor check ends
+            } // j<k && j-k neighbor check ends
+        } // pk loop ends
+    } // j loop
+    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+    fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
+    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
+            data->E_Tor, data->E_Con );
diff --git a/PuReMD-GPU/src/four_body_interactions.h b/PuReMD-GPU/src/four_body_interactions.h
index 402ebe7dc807afd0d14541bf5686a0a674b43b09..8e8dd7c0991a747000e77b2d460711e433db52ef 100644
--- a/PuReMD-GPU/src/four_body_interactions.h
+++ b/PuReMD-GPU/src/four_body_interactions.h
@@ -23,20 +23,10 @@
 #include "mytypes.h"
+#define MIN_SINE 1e-10
 void Four_Body_Interactions( reax_system*, control_params*, simulation_data*,
-                             static_storage*, list**, output_controls* );
-GLOBAL void Four_Body_Interactions ( reax_atom *,
-                                     global_parameters ,
-                                     four_body_header *,
-                                     control_params *,
-                                     list , list ,
-                                     simulation_box *,
-                                     simulation_data *,
-                                     static_storage ,
-                                     int , int , real *, real *, rvec *);
-GLOBAL void Four_Body_Postprocess (reax_atom *,
-                                   static_storage,
-                                   list , int );
+    static_storage*, list**, output_controls* );
diff --git a/PuReMD-GPU/src/grid.cu b/PuReMD-GPU/src/grid.c
similarity index 93%
rename from PuReMD-GPU/src/grid.cu
rename to PuReMD-GPU/src/grid.c
index 00e638f4b3ae3828e4e0e208f307d51283d8c2d5..fb09b409194a84b1646da3b779aad8b547ff9db3 100644
--- a/PuReMD-GPU/src/grid.cu
+++ b/PuReMD-GPU/src/grid.c
@@ -19,12 +19,11 @@
 #include "grid.h"
 #include "reset_utils.h"
 #include "vector.h"
 #include "index_utils.h"
-#include "cuda_utils.h"
 int Estimate_GCell_Population( reax_system* system )
@@ -361,23 +360,6 @@ void Bin_Atoms( reax_system* system, static_storage *workspace )
         workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); 
-void Cuda_Bin_Atoms (reax_system *system, static_storage *workspace )
-    Cuda_Reset_Grid ( &system->d_g);
-    Bin_Atoms ( system, workspace );
-    dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms;
-void Cuda_Bin_Atoms_Sync (reax_system *system)
-    copy_host_device (system->g.top, system->d_g.top, 
-            INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP);
-    copy_host_device (system->g.atoms, system->d_g.atoms, 
-            INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS);
 inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
@@ -396,11 +378,11 @@ void Copy_Storage( reax_system *system, static_storage *workspace,
     int i;
     for( i = 0; i < RESTART+1; ++i )
-        v[ index_wkspace_sys (i,top, system) ] = workspace->v[ index_wkspace_sys (i,old_id, system) ];
+        v[ index_wkspace_sys (i,top, system->N) ] = workspace->v[ index_wkspace_sys (i,old_id, system->N) ];
     for( i = 0; i < 3; ++i ) {
-        s[ index_wkspace_sys (i,top, system) ] = workspace->s[ index_wkspace_sys (i,old_id, system) ];
-        t[ index_wkspace_sys (i,top, system) ] = workspace->t[ index_wkspace_sys (i,old_id, system) ];
+        s[ index_wkspace_sys (i,top, system->N) ] = workspace->s[ index_wkspace_sys (i,old_id, system->N) ];
+        t[ index_wkspace_sys (i,top, system->N) ] = workspace->t[ index_wkspace_sys (i,old_id, system->N) ];
     orig_id[top]  = workspace->orig_id[old_id];
diff --git a/PuReMD-GPU/src/grid.h b/PuReMD-GPU/src/grid.h
index b524fb76ea394c38dd5a2f7fa94e3b668b43a744..8f26f018e947bc2ebe1789736d8203dc435addfd 100644
--- a/PuReMD-GPU/src/grid.h
+++ b/PuReMD-GPU/src/grid.h
@@ -23,6 +23,11 @@
 #include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
 void Setup_Grid( reax_system* );
 void Update_Grid( reax_system* );
@@ -32,9 +37,12 @@ int  Shift( int, int, int, grid* );
 void Cluster_Atoms( reax_system*, static_storage* );
 void Bin_Atoms( reax_system*, static_storage* );
-void Cuda_Bin_Atoms( reax_system*, static_storage* );
-void Cuda_Bin_Atoms_Sync (reax_system *);
 void Reset_Marks( grid*, ivec*, int );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/index_utils.h b/PuReMD-GPU/src/index_utils.h
index b856e5efe79eb7e2ca96163d9bd67744b44f9a00..cbd81cadc4113ecf8804d6bd28299336aa4705e6 100644
--- a/PuReMD-GPU/src/index_utils.h
+++ b/PuReMD-GPU/src/index_utils.h
@@ -23,101 +23,74 @@
 #include "mytypes.h"
-extern inline HOST_DEVICE int index_grid_3d (int i, int j, int k, grid *g)
-    return  (i * g->ncell[1] * g->ncell[2]) +
-            (j * g->ncell[2]) +
-            k;
-extern inline HOST_DEVICE int index_grid_nbrs (int i, int j, int k, int l, grid *g)
+static inline HOST_DEVICE int index_grid_3d( int i, int j, int k, grid *g ) /* Flattens cell coordinates (i, j, k) into one offset, using g->ncell[1] and g->ncell[2] as the extents of the two inner dimensions. */
-    return  (i * g->ncell[1] * g->ncell[2] * g->max_nbrs) +
-            (j * g->ncell[2] * g->max_nbrs) +
-            (k * g->max_nbrs) +
-            l;
+    return (i * g->ncell[1] * g->ncell[2]) + (j * g->ncell[2]) + k;
-extern inline HOST_DEVICE int index_grid_atoms (int i, int j, int k, int l, grid *g)
+static inline HOST_DEVICE int index_grid_nbrs( int i, int j, int k, int l, grid *g ) /* Offset of slot l within cell (i, j, k) when every cell owns g->max_nbrs consecutive slots. */
-    return  (i * g->ncell[1] * g->ncell[2] * g->max_atoms) +
-            (j * g->ncell[2] * g->max_atoms) +
-            (k * g->max_atoms) +
-            l;
+    return (i * g->ncell[1] * g->ncell[2] * g->max_nbrs) +
+           (j * g->ncell[2] * g->max_nbrs) +
+           (k * g->max_nbrs) +
+           l;
-extern inline HOST_DEVICE int index_wkspace_sys (int i, int j, reax_system *system)
+static inline HOST_DEVICE int index_grid_atoms( int i, int j, int k, int l, grid *g ) /* Offset of slot l within cell (i, j, k) when every cell owns g->max_atoms consecutive slots. */
-    return (i * system->N) + j;
+    return (i * g->ncell[1] * g->ncell[2] * g->max_atoms) +
+           (j * g->ncell[2] * g->max_atoms) +
+           (k * g->max_atoms) +
+           l;
-extern inline HOST_DEVICE int index_wkspace_sys (int i, int j, int N)
+static inline HOST_DEVICE int index_wkspace_sys( int i, int j, int N ) /* Returns i * N + j: offset of element j in the i-th run of N entries. */
     return (i * N) + j;
-extern inline HOST_DEVICE int index_wkspace_res (int i, int j )
+static inline HOST_DEVICE int index_wkspace_res( int i, int j ) /* Returns i * (RESTART + 1) + j: offset of element j in the i-th run of RESTART + 1 entries. */
     return (i * (RESTART + 1)) + j;
-extern inline HOST_DEVICE int index_tbp (int i, int j, reax_interaction *reax)
-    return (i * reax->num_atom_types) + j;
-extern inline HOST_DEVICE int index_tbp (int i, int j, int num_atom_types)
+static inline HOST_DEVICE int index_tbp( int i, int j, int num_atom_types ) /* Returns i * num_atom_types + j: offset of the pair (i, j) in a flat table with num_atom_types entries per i. */
     return (i * num_atom_types) + j;
-extern inline HOST_DEVICE int index_thbp (int i, int j, int k, reax_interaction *reax)
-    return  (i * reax->num_atom_types * reax->num_atom_types ) +
-            (j * reax->num_atom_types ) +
-            k;
-extern inline HOST_DEVICE int index_thbp (int i, int j, int k, int num_atom_types)
+static inline HOST_DEVICE int index_thbp( int i, int j, int k, int num_atom_types ) /* Offset of the triple (i, j, k) in a flat table whose two inner dimensions each have num_atom_types entries. */
-    return  (i * num_atom_types * num_atom_types ) +
-            (j * num_atom_types ) +
-            k;
+    return (i * num_atom_types * num_atom_types ) + (j * num_atom_types ) + k;
-extern inline HOST_DEVICE int index_hbp (int i, int j, int k, reax_interaction *reax)
-    return  (i * reax->num_atom_types * reax->num_atom_types ) +
-            (j * reax->num_atom_types ) +
-            k;
-extern inline HOST_DEVICE int index_hbp (int i, int j, int k, int num_atom_types)
+static inline HOST_DEVICE int index_hbp( int i, int j, int k, int num_atom_types ) /* Offset of the triple (i, j, k) in a flat table whose two inner dimensions each have num_atom_types entries (same formula as index_thbp). */
-    return  (i * num_atom_types * num_atom_types ) +
-            (j * num_atom_types ) +
-            k;
+    return (i * num_atom_types * num_atom_types ) + (j * num_atom_types ) + k;
-extern inline HOST_DEVICE int index_fbp (int i, int j, int k, int l, reax_interaction *reax)
-    return  (i * reax->num_atom_types * reax->num_atom_types * reax->num_atom_types ) +
-            (j * reax->num_atom_types * reax->num_atom_types ) +
-            (k * reax->num_atom_types ) +
-            l;
-extern inline HOST_DEVICE int index_fbp (int i, int j, int k, int l, int num_atom_types)
+static inline HOST_DEVICE int index_fbp( int i, int j, int k, int l, int num_atom_types ) /* Offset of the quadruple (i, j, k, l) in a flat table whose three inner dimensions each have num_atom_types entries. */
-    return  (i * num_atom_types * num_atom_types * num_atom_types ) +
-            (j * num_atom_types * num_atom_types ) +
-            (k * num_atom_types ) +
-            l;
+    return (i * num_atom_types * num_atom_types * num_atom_types ) +
+           (j * num_atom_types * num_atom_types ) +
+           (k * num_atom_types ) +
+           l;
-extern inline HOST_DEVICE int index_lr (int i, int j, int num_atom_types )
+static inline HOST_DEVICE int index_lr( int i, int j, int num_atom_types ) /* Returns i * num_atom_types + j (same formula as index_tbp). */
     return (i * num_atom_types) + j;
diff --git a/PuReMD-GPU/src/init_md.c b/PuReMD-GPU/src/init_md.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a2ce1270e2c694722e489b9a3f38f8dd48177a1
--- /dev/null
+++ b/PuReMD-GPU/src/init_md.c
@@ -0,0 +1,879 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "init_md.h"
+#include "allocate.h"
+#include "box.h"
+#include "forces.h"
+#include "grid.h"
+#include "index_utils.h"
+#include "lin_alg.h"
+#include "integrate.h"
+#include "neighbors.h"
+#include "list.h"
+#include "lookup.h"
+#include "print_utils.h"
+#include "reset_utils.h"
+#include "system_props.h"
+#include "traj.h"
+#include "vector.h"
+void Generate_Initial_Velocities(reax_system *system, real T ) /* Sets the velocity of each of the system->N atoms: cleared via rvec_MakeZero when T <= 0.1, otherwise a vector from rvec_Random rescaled using the atom's mass and T. */
+    int i;
+    real scale, norm;
+    if( T <= 0.1 ) /* very low target temperature: no random velocities are generated */
+    {
+        for ( i = 0; i < system->N; i++ )
+        {
+            rvec_MakeZero( system->atoms[i].v );
+        }
+#if defined(DEBUG)
+        fprintf( stderr, "no random velocities...\n" );
+    }
+    else
+    {
+        for( i = 0; i < system->N; i++ )
+        {
+            rvec_Random( system->atoms[i].v ); /* presumably fills v with random components -- TODO confirm range/distribution */
+            norm = rvec_Norm_Sqr( system->atoms[i].v ); /* by its name, the squared length of v -- confirm */
+            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * 
+                    norm / (3.0 * K_B * T) ); /* scale = sqrt( mass * norm / (3 * K_B * T) ), mass looked up by atom type */
+            rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v ); /* NOTE(review): 1.0/scale has no guard for norm == 0, which would make scale 0 */
+            /*
+               fprintf( stderr, "v = %f %f %f\n", 
+               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
+               fprintf( stderr, "scale = %f\n", scale );
+               fprintf( stderr, "v = %f %f %f\n",
+               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
+             */
+        }
+    }
+void Init_System( reax_system *system, control_params *control, 
+        simulation_data *data ) /* Prepares the system before a run: resets atoms unless restarting, computes total mass and center of mass, shifts every atom by an offset chosen from control->reposition_atoms, optionally generates initial velocities, then calls Setup_Grid. */
+    int i;
+    rvec dx;
+    if( !control->restart ) /* atoms are reset only for a fresh (non-restart) run */
+    {
+        Reset_Atoms( system );
+    }
+    Compute_Total_Mass( system, data );
+    Compute_Center_of_Mass( system, data, stderr );
+    /* reposition atoms */
+    // just fit the atoms to the periodic box
+    if( control->reposition_atoms == 0 )
+    {
+        rvec_MakeZero( dx );
+    }
+    // put the center of mass to the center of the box
+    else if( control->reposition_atoms == 1 )
+    {
+        rvec_Scale( dx, 0.5, system->box.box_norms );
+        rvec_ScaledAdd( dx, -1., data->xcm );
+    }
+    // put the center of mass to the origin
+    else if( control->reposition_atoms == 2 ) {
+        rvec_Scale( dx, -1., data->xcm );
+    }
+    else {
+        fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
+        exit( UNKNOWN_OPTION ); /* any value other than 0, 1 or 2 terminates the program */
+    }
+    for( i = 0; i < system->N; ++i ) {
+        Inc_on_T3( system->atoms[i].x, dx, &(system->box) ); /* applies offset dx to the atom position; presumably wraps into the periodic box -- TODO confirm */
+        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", 
+          i, system->atoms[i].type, 
+          system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/
+    }
+    /* Initialize velocities so that desired init T can be attained */
+    if( !control->restart || (control->restart && control->random_vel) )  { /* fresh run, or a restart that requests random velocities */
+        Generate_Initial_Velocities( system, control->T_init );
+    }
+    Setup_Grid( system );
+void Init_Simulation_Data( reax_system *system, control_params *control, 
+        simulation_data *data, output_controls *out_control, 
+        evolve_function *Evolve ) /* Resets the simulation data, picks data->N_f and the integrator stored in *Evolve from control->ensemble, computes the kinetic energy, and zeroes the host timing counters. out_control is not referenced in the visible body. */
+    Reset_Simulation_Data( data );
+    if( !control->restart )  
+        data->step = data->prev_steps = 0; /* step counters start from zero only for a fresh run */
+    switch( control->ensemble ) {
+        case NVE:
+            data->N_f = 3 * system->N;
+            *Evolve = Velocity_Verlet_NVE;
+            break;
+        case NVT:
+            data->N_f = 3 * system->N + 1;
+            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
+            if( !control->restart || (control->restart && control->random_vel) ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T ); /* NOTE(review): data->E_Kin is read here, but Compute_Kinetic_Energy is only called after this switch and Reset_Simulation_Data ran just above -- confirm E_Kin holds the intended value */
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->therm.v_xi_old = 0;
+                data->therm.xi = 0;
+#if defined(DEBUG_FOCUS)
+                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
+                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
+                        data->N_f, data->therm.v_xi ); /* NOTE(review): N_f is printed with %f -- confirm its declared type is a floating-point type */
+            }
+            *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+            break;
+        case NPT: // Anisotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION ); /* the remaining statements of this case are never reached */
+            data->N_f = 3 * system->N + 9;
+            if( !control->restart ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->iso_bar.eps = 0.33333 * log(system->box.volume);
+                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
+                //Compute_Pressure( system, data, workspace );
+            }
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+        case sNPT: // Semi-Isotropic NPT
+            data->N_f = 3 * system->N + 4;
+            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
+            break;
+        case iNPT: // Isotropic NPT
+            data->N_f = 3 * system->N + 2;
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+        case bNVT: // Berendsen NVT
+            data->N_f = 3 * system->N + 1; 
+            *Evolve = Velocity_Verlet_Berendsen_NVT;
+            break;
+        default: /* unrecognized ensemble: neither data->N_f nor *Evolve is assigned by this switch */
+            break;
+    }
+    Compute_Kinetic_Energy( system, data );
+    /* init timing info for the host*/
+    data->timing.start = Get_Time( );
+    data->timing.total = data->timing.start;
+    data->timing.nbrs = 0;
+    data->timing.init_forces = 0;
+    data->timing.bonded = 0;
+    data->timing.nonb = 0;
+    data->timing.QEq = 0;
+    data->timing.matvecs = 0;
+void Init_Workspace( reax_system *system, control_params *control, 
+        static_storage *workspace ) /* Allocates the workspace arrays (sizes derived from system->N and RESTART), fills the QEq right-hand sides and Hdia_inv from the per-type parameters, sets the realloc fields to -1, and calls Reset_Workspace. NOTE(review): no malloc/calloc result in this function is checked for NULL. */
+    int i;
+    /* Allocate space for hydrogen bond list */
+    workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) );
+    /* bond order related storage  */
+    workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Deltap           = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->Delta          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDelta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp_temp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Clp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->vlpex          = (real *) malloc( system->N * sizeof( real ) );
+    /* QEq storage */
+    //workspace->H        = NULL;
+    //workspace->L        = NULL;
+    //workspace->U        = NULL;
+    //
+    workspace->H.start        = NULL; /* matrix storage starts out unallocated; only the pointers are cleared here */
+    workspace->L.start        = NULL;
+    workspace->U.start        = NULL;
+    workspace->H.entries         = NULL;
+    workspace->L.entries         = NULL;
+    workspace->U.entries        = NULL;
+    workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
+    workspace->w        = (real *) calloc( system->N, sizeof( real ) );
+    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
+    workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) ); /* 2*N entries: the loop below writes -chi into [0, N) and -1.0 into [N, 2N) */
+    workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
+    workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
+    workspace->b_prc    = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->b_prm    = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->s_t      = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->s        = (real *) calloc( 5 * system->N, sizeof( real ) ); /* 5*N entries; presumably five length-N vectors stored back to back -- TODO confirm against the code that indexes s and t */
+    workspace->t        = (real *) calloc( 5 * system->N, sizeof( real ) );
+    // workspace->s_old    = (real *) calloc( system->N, sizeof( real ) );
+    // workspace->t_old    = (real *) calloc( system->N, sizeof( real ) );
+    // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
+    // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
+    for( i = 0; i < system->N; ++i ) {
+        workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta; /* reciprocal of the per-type eta parameter */
+        workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; /* negated per-type chi parameter */
+        workspace->b_t[i] = -1.0;
+        workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi; /* first half of b receives the same values as b_s */
+        workspace->b[i+system->N] = -1.0; /* second half of b receives the same values as b_t */
+    }
+    /* GMRES storage */
+    workspace->y  = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->z  = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->g  = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->hs = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->hc = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) );
+    workspace->v  = (real *) calloc( (RESTART+1)*system->N, sizeof( real) ); /* (RESTART+1) blocks of N entries */
+    workspace->h  = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) ); /* (RESTART+1) x (RESTART+1) entries */
+    /* CG storage */
+    workspace->r = (real *) calloc( system->N, sizeof( real ) );
+    workspace->d = (real *) calloc( system->N, sizeof( real ) );
+    workspace->q = (real *) calloc( system->N, sizeof( real ) );
+    workspace->p = (real *) calloc( system->N, sizeof( real ) );
+    /* integrator storage */
+    workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
+    /* storage for analysis */
+    if( control->molec_anal || control->diffusion_coef ) /* mark arrays exist only when one of these analyses is enabled */
+    {
+        workspace->mark = (int *) calloc( system->N, sizeof(int) );
+        workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
+    }
+    else 
+        workspace->mark = workspace->old_mark = NULL;
+    if( control->diffusion_coef )
+        workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
+    else workspace->x_old = NULL;
+    workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->realloc.num_far = -1; /* every realloc field starts at -1; presumably meaning "no reallocation requested" -- TODO confirm */
+    workspace->realloc.Htop = -1;
+    workspace->realloc.hbonds = -1;
+    workspace->realloc.bonds = -1;
+    workspace->realloc.num_3body = -1;
+    workspace->realloc.gcell_atoms = -1;
+    Reset_Workspace( system, workspace );
+void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N) /* Debug helper: for each atom i < N, walks data[start[i] .. end[i]) and compares each retained entry field by field with the entry at the running cursor in slist's far-neighbor list, printing mismatches to stderr. Exits with code 10 when the cursor does not land on End_Index(i, slist). The test parameter is referenced only in commented-out code. */
+    int index = 0;
+    int count = 0;
+    int jicount = 0;
+    int i, j, end_index, gpu_index, gpu_end, k; /* end_index, gpu_index, gpu_end and k are referenced only inside commented-out code below */
+    far_neighbor_data gpu, cpu;
+    /*
+       for (int i = 0; i < N ; i++ )
+       {
+       if (test[i] != start[i]) {
+       fprintf (stderr, "start index does not match \n");
+       exit (0);
+       }
+       if (test[i+1] != (end[i]) ){
+       fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]);
+       exit (0);
+       }
+       }
+     */
+    for (i = 0; i < N; i++){
+        index = Start_Index (i, slist); /* cursor into slist's far-neighbor entries for atom i */
+        //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
+        for (j = start[i]; j < end[i]; j++){
+            gpu = data[j];
+            if (i < data[j].nbr) continue; /* entries whose neighbor id exceeds i are skipped, and the slist cursor is not advanced for them */
+            /*
+               if (i < data[j].nbr) {
+            //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j);
+            int src = data[j].nbr;
+            int dest = i;
+            int x;
+            for (x = start[src]; x < end[src]; x++) {
+            if (data[x].nbr != dest) continue;
+            gpu = data[x];
+            cpu = data[j];
+            if (  (gpu.d != cpu.d) ||
+            (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+            (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+            fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, 
+            data[j].d, 
+            data[j].rel_box[0],
+            data[j].rel_box[1],
+            data[j].rel_box[2],
+            data[j].dvec[0], 
+            data[j].dvec[1], 
+            data[j].dvec[2] 
+            );
+            fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr,
+            data[x].d,
+            data[x].rel_box[0],
+            data[x].rel_box[1],
+            data[x].rel_box[2],
+            data[x].dvec[0],
+            data[x].dvec[1],
+            data[x].dvec[2]
+            );
+            jicount++;
+            }
+            break;
+            }
+            if (x >= end[src]) {
+            fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
+            exit (0);
+            }
+            continue;
+            }
+             */
+            cpu = slist->select.far_nbr_list[index];
+            //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){
+            //if ( (gpu->d != cpu->d) ){
+            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ||
+                    (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+                    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) { /* exact (bitwise) inequality on every field, including the floating-point d and dvec */
+                //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) ||
+                //        (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) {
+                //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){
+                fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
+                fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index);
+                /*
+                   fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
+                   fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
+                   fprintf (stdout, "d %f ,  %f \n", slist->select.far_nbr_list[index].d, data[j].d);
+                   fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", 
+                   cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
+                   gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
+                   fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", 
+                   cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
+                   gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
+                 */
+                count ++; /* number of mismatching entries seen so far */
+            }
+        //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
+        index ++;
+        }
+        if (index != End_Index (i, slist)) /* the number of retained entries must equal the length of atom i's range in slist */
+        {
+            fprintf( stderr,
+                "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n",
+                 i, index, Start_Index (i, slist), End_Index(i, slist),
+                    start[i], end[i]);
+            exit( 10 );
+        }
+    }
+    fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d  reverse %d \n", count, jicount); /* NOTE(review): "MATCH" is printed unconditionally; count is the number of mismatches found above, and jicount is only incremented in commented-out code so it is always 0 */
+    /*
+       for (int i = 0; i < N; i++) 
+       {
+       index = Start_Index (i, slist);
+       end_index = End_Index (i, slist);
+       gpu_index = start[i];
+       gpu_end = end[i];
+       for (int j = index; j < end_index; j++) 
+       {
+       far_neighbor_data *cpu = &slist->select.far_nbr_list[j];
+       far_neighbor_data *gpu;
+       for (k = gpu_index; k < gpu_end; k++) {
+       gpu = &data[k];
+       if (gpu->nbr == cpu->nbr) break;
+       }
+       if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); }
+       if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) ||
+       ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) ||
+       ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) {
+       fprintf (stderr, "Far neighbors does not match atom: %d \n", i );
+       fprintf (stderr, "neighbor %d ,  %d \n",  cpu->nbr, gpu->nbr);
+       fprintf (stderr, "d %d ,  %d \n", cpu->d, gpu->d);
+       fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", 
+       cpu->dvec[0], cpu->dvec[1], cpu->dvec[2],
+       gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] );
+       fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", 
+       cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2],
+       gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] );
+       fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end );
+       exit (1);
+       }
+       }
+       }
+     */
+void Init_Lists( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control ) /* Creates the interaction lists stored at (*lists)+FAR_NBRS, HBONDS, BONDS, THREE_BODIES, DDELTA and DBO and allocates the sparse matrix workspace->H, using sizes returned by Estimate_NumNeighbors and Estimate_Storage_Sizes. Exits with INIT_ERR when a Make_List call reports failure. */
+    int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
+    int *hb_top, *bond_top;
+    real t_start, t_elapsed;
+    num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
+    if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS ) ) {
+        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+        exit( INIT_ERR );
+    }
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
+            num_nbrs * sizeof(far_neighbor_data) / (1024*1024) ); /* NOTE(review): the argument has type size_t while the format is %ld; the same pattern recurs in the other "memory allocated" messages below */
+    t_start = Get_Time ();
+    Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control);
+    t_elapsed = Get_Timing_Info ( t_start );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed );
+    Htop = 0;
+    hb_top = (int*) calloc( system->N, sizeof(int) ); /* NOTE(review): calloc results for hb_top and bond_top are not checked for NULL */
+    bond_top = (int*) calloc( system->N, sizeof(int) );
+    num_3body = 0;
+    Estimate_Storage_Sizes( system, control, lists, 
+            &Htop, hb_top, bond_top, &num_3body ); /* fills Htop, hb_top, bond_top and num_3body through the pointers passed here */
+    Allocate_Matrix( &(workspace->H), system->N, Htop );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
+    fprintf( stderr, "memory allocated: H = %ldMB\n", 
+            Htop * sizeof(sparse_matrix_entry) / (1024*1024) );
+    workspace->num_H = 0;
+    if( control->hb_cut > 0 ) { /* the hydrogen-bond list is built only when a positive cutoff is configured */
+        /* init H indexes */
+        for( i = 0; i < system->N; ++i )
+            if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
+                workspace->hbond_index[i] = workspace->num_H++; /* atoms flagged p_hbond == 1 get consecutive indices; all others get -1 */
+            else workspace->hbond_index[i] = -1;
+        Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, 
+                hb_top, (*lists)+HBONDS );
+        num_hbonds = hb_top[system->N-1]; /* NOTE(review): indexes N-1, so this assumes system->N >= 1; presumably hb_top holds a running total after Allocate_HBond_List -- confirm */
+#ifdef __DEBUG_CUDA__
+        fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds );
+#if defined(DEBUG_FOCUS)
+        fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
+        fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
+                num_hbonds * sizeof(hbond_data) / (1024*1024) );
+    }
+    /* bonds list */
+    Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
+    num_bonds = bond_top[system->N-1]; /* NOTE(review): same N-1 indexing assumption as for hb_top above */
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
+            num_bonds * sizeof(bond_data) / (1024*1024) );
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " host num_3body : %d \n", num_3body);
+    fprintf (stderr, " host num_bonds : %d \n", num_bonds);
+    /* 3bodies list */
+    if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES )) {
+        fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+        exit( INIT_ERR );
+    }
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
+    fprintf( stderr, "memory allocated: 3-body = %ldMB\n", 
+            num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
+    if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) { /* capacity is a fixed multiple (8) of the bond count */
+        fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
+        exit( INIT_ERR );
+    }
+    if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
+        fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
+        exit( INIT_ERR );
+    }
+    free( hb_top ); /* the two estimate arrays are only needed while the lists are being sized */
+    free( bond_top );
+void Init_Out_Controls(reax_system *system, control_params *control, 
+        static_storage *workspace, output_controls *out_control)
+    char temp[1000];
+    /* Init trajectory file */
+    if( out_control->write_steps > 0 ) { 
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".trj" );
+        out_control->trj = fopen( temp, "w" );
+        out_control->write_header( system, control, workspace, out_control );
+    }
+    if( out_control->energy_update_freq > 0 ) {
+        /* Init out file */
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".out" );
+        out_control->out = fopen( temp, "w" );
+        fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n",
+                "step", "total energy", "poten. energy", "kin. energy", 
+                "temp.", "target", "volume", "press.", "target" );
+        fflush( out_control->out );
+        /* Init potentials file */
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".pot" );
+        out_control->pot = fopen( temp, "w" );
+        fprintf( out_control->pot, 
+                "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
+                "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", 
+                "etor", "econj", "evdw","ecoul", "epol" );
+        fflush( out_control->pot );
+        /* Init log file */
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".log" );
+        out_control->log = fopen( temp, "w" );
+        fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", 
+                "step", "total", "neighbors", "init", "bonded", 
+                "nonbonded", "QEq", "matvec" );
+    }
+    /* Init pressure file */
+    if( control->ensemble == NPT || 
+            control->ensemble == iNPT || 
+            control->ensemble == sNPT ) {
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".prs" );
+        out_control->prs = fopen( temp, "w" );
+        fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n",
+                "step", "norm_x", "norm_y", "norm_z", 
+                "press_x", "press_y", "press_z", "target_p", "volume" );
+        fflush( out_control->prs );
+    }
+    /* Init molecular analysis file */
+    if( control->molec_anal ) {
+        sprintf( temp, "%s.mol", control->sim_name );
+        out_control->mol = fopen( temp, "w" );
+        if( control->num_ignored ) {
+            sprintf( temp, "%s.ign", control->sim_name );
+            out_control->ign = fopen( temp, "w" );
+        } 
+    }
+    /* Init electric dipole moment analysis file */
+    if( control->dipole_anal ) {
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".dpl" );
+        out_control->dpl = fopen( temp, "w" );
+        fprintf( out_control->dpl, 
+                "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
+        fflush( out_control->dpl );
+    }
+    /* Init diffusion coef analysis file */
+    if( control->diffusion_coef ) {
+        strcpy( temp, control->sim_name );
+        strcat( temp, ".drft" );
+        out_control->drft = fopen( temp, "w" );
+        fprintf( out_control->drft, "Step     Type Count   Avg Squared Disp\n" );
+        fflush( out_control->drft );
+    }
+    /* open bond energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".ebond" );
+    out_control->ebond = fopen( temp, "w" );
+    /* open lone-pair energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".elp" );
+    out_control->elp = fopen( temp, "w" );
+    /* open overcoordination energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".eov" );
+    out_control->eov = fopen( temp, "w" );
+    /* open undercoordination energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".eun" );
+    out_control->eun = fopen( temp, "w" );
+    /* open angle energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".eval" );
+    out_control->eval = fopen( temp, "w" );
+    /* open penalty energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".epen" );
+    out_control->epen = fopen( temp, "w" );
+    /* open coalition energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".ecoa" );
+    out_control->ecoa = fopen( temp, "w" );
+    /* open hydrogen bond energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".ehb" );
+    out_control->ehb = fopen( temp, "w" );
+    /* open torsion energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".etor" );
+    out_control->etor = fopen( temp, "w" );
+    /* open conjugation energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".econ" );
+    out_control->econ = fopen( temp, "w" );
+    /* open vdWaals energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".evdw" );
+    out_control->evdw = fopen( temp, "w" );
+    /* open coulomb energy file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".ecou" );
+    out_control->ecou = fopen( temp, "w" );
+    /* open bond orders file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".fbo" );
+    out_control->fbo = fopen( temp, "w" );
+    /* open bond orders derivatives file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".fdbo" );
+    out_control->fdbo = fopen( temp, "w" );
+    /* open bond forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".fbond" );
+    out_control->fbond = fopen( temp, "w" );
+    /* open lone-pair forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".flp" );
+    out_control->flp = fopen( temp, "w" );
+    /* open overcoordination forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".fatom" );
+    out_control->fatom = fopen( temp, "w" );
+    /* open angle forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".f3body" );
+    out_control->f3body = fopen( temp, "w" );
+    /* open hydrogen bond forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".fhb" );
+    out_control->fhb = fopen( temp, "w" );
+    /* open torsion forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".f4body" );
+    out_control->f4body = fopen( temp, "w" );
+    /* open nonbonded forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".fnonb" );
+    out_control->fnonb = fopen( temp, "w" );
+    /* open total force file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".ftot" );
+    out_control->ftot = fopen( temp, "w" );
+    /* open coulomb forces file */
+    strcpy( temp, control->sim_name );
+    strcat( temp, ".ftot2" );
+    out_control->ftot2 = fopen( temp, "w" );
+    /* Error handling */
+    /* if ( out_control->out == NULL || out_control->pot == NULL || 
+       out_control->log == NULL || out_control->mol == NULL || 
+       out_control->dpl == NULL || out_control->drft == NULL ||       
+       out_control->pdb == NULL )
+       {
+       fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
+       exit( CANNOT_OPEN_OUTFILE );
+       }*/
+void Initialize(reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, list **lists, 
+        output_controls *out_control, evolve_function *Evolve)
+    Randomize();
+    Init_System( system, control, data );
+    Init_Simulation_Data( system, control, data, out_control, Evolve );
+    Init_Workspace( system, control, workspace );
+    Init_Lists( system, control, data, workspace, lists, out_control );
+    Init_Out_Controls( system, control, workspace, out_control );
+    /* These are done in forces.c, only forces.c can see all those functions */
+    Init_Bonded_Force_Functions( control );
+    Init_Force_Test_Functions( );
+    if( control->tabulate )
+        Make_LR_Lookup_Table( system, control );
+#if defined(DEBUG_FOCUS)
+    fprintf( stderr, "data structures have been initialized...\n" ); 
diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu
deleted file mode 100644
index e1912d3c2bd139e2dfcda43b23c77adfcd325782..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/init_md.cu
+++ /dev/null
@@ -1,1361 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "init_md.h"
-#include "allocate.h"
-#include "box.h"
-#include "forces.h"
-#include "grid.h"
-#include "GMRES.h"
-#include "integrate.h"
-#include "neighbors.h"
-#include "list.h"
-#include "lookup.h"
-#include "print_utils.h"
-#include "reset_utils.h"
-#include "system_props.h"
-#include "traj.h"
-#include "vector.h"
-#include "cuda_init.h"
-#include "cuda_copy.h"
-#include "cuda_utils.h"
-#include "helpers.h"
-#include "reduction.h"
-#include     "index_utils.h"
-#include "validation.h"
-void Generate_Initial_Velocities(reax_system *system, real T )
-    int i;
-    real scale, norm;
-    if( T <= 0.1 ) {
-        for (i=0; i < system->N; i++)
-            rvec_MakeZero( system->atoms[i].v );
-#if defined(DEBUG)
-        fprintf( stderr, "no random velocities...\n" );
-    }
-    else {
-        for( i = 0; i < system->N; i++ ) {
-            rvec_Random( system->atoms[i].v );
-            norm = rvec_Norm_Sqr( system->atoms[i].v );
-            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * 
-                    norm / (3.0 * K_B * T) );
-            rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v );
-            /*
-               fprintf( stderr, "v = %f %f %f\n", 
-               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-               fprintf( stderr, "scale = %f\n", scale );
-               fprintf( stderr, "v = %f %f %f\n",
-               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-             */
-        }
-    }
-void Init_System( reax_system *system, control_params *control, 
-        simulation_data *data )
-    int i;
-    rvec dx;
-    if( !control->restart )
-        Reset_Atoms( system );
-    Compute_Total_Mass( system, data );
-    Compute_Center_of_Mass( system, data, stderr );
-    /* reposition atoms */
-    // just fit the atoms to the periodic box
-    if( control->reposition_atoms == 0 ) {
-        rvec_MakeZero( dx );
-    }
-    // put the center of mass to the center of the box
-    else if( control->reposition_atoms == 1 ) {
-        rvec_Scale( dx, 0.5, system->box.box_norms );
-        rvec_ScaledAdd( dx, -1., data->xcm );
-    }
-    // put the center of mass to the origin
-    else if( control->reposition_atoms == 2 ) {
-        rvec_Scale( dx, -1., data->xcm );
-    }
-    else {
-        fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
-        exit( UNKNOWN_OPTION );
-    }
-    for( i = 0; i < system->N; ++i ) {
-        Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", 
-          i, system->atoms[i].type, 
-          system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/
-    }
-    /* Initialize velocities so that desired init T can be attained */
-    if( !control->restart || (control->restart && control->random_vel) )  {
-        Generate_Initial_Velocities( system, control->T_init );
-    }
-    Setup_Grid( system );
-void Cuda_Init_System( reax_system *system, control_params *control, 
-        simulation_data *data )
-    int i;
-    rvec dx;
-    if( !control->restart )
-        Cuda_Reset_Atoms( system );
-    Cuda_Compute_Total_Mass( system, data );
-    Cuda_Compute_Center_of_Mass( system, data, stderr );
-    /* reposition atoms */
-    // just fit the atoms to the periodic box
-    if( control->reposition_atoms == 0 ) {
-        rvec_MakeZero( dx );
-    }
-    // put the center of mass to the center of the box
-    else if( control->reposition_atoms == 1 ) {
-        rvec_Scale( dx, 0.5, system->box.box_norms );
-        rvec_ScaledAdd( dx, -1., data->xcm );
-    }
-    // put the center of mass to the origin
-    else if( control->reposition_atoms == 2 ) {
-        rvec_Scale( dx, -1., data->xcm );
-    }
-    else {
-        fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
-        exit( UNKNOWN_OPTION );
-    }
-    compute_Inc_on_T3 <<<BLOCKS_POW_2, BLOCK_SIZE>>>
-        (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    //copy back the atoms from device to the host
-    copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
-            cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
-    /* Initialize velocities so that desired init T can be attained */
-    if( !control->restart || (control->restart && control->random_vel) )  {
-        Generate_Initial_Velocities( system, control->T_init );
-    }
-    Setup_Grid( system );
-void Init_Simulation_Data( reax_system *system, control_params *control, 
-        simulation_data *data, output_controls *out_control, 
-        evolve_function *Evolve )
-    Reset_Simulation_Data( data );
-    if( !control->restart )  
-        data->step = data->prev_steps = 0;
-    switch( control->ensemble ) {
-        case NVE:
-            data->N_f = 3 * system->N;
-            *Evolve = Velocity_Verlet_NVE;
-            break;
-        case NVT:
-            data->N_f = 3 * system->N + 1;
-            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-            if( !control->restart || (control->restart && control->random_vel) ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->therm.v_xi_old = 0;
-                data->therm.xi = 0;
-#if defined(DEBUG_FOCUS)
-                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
-                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
-                        data->N_f, data->therm.v_xi );
-            }
-            *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-            break;
-        case NPT: // Anisotropic NPT
-            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-            exit( UNKNOWN_OPTION );
-            data->N_f = 3 * system->N + 9;
-            if( !control->restart ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->iso_bar.eps = 0.33333 * log(system->box.volume);
-                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
-                //Compute_Pressure( system, data, workspace );
-            }
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
-        case sNPT: // Semi-Isotropic NPT
-            data->N_f = 3 * system->N + 4;
-            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
-            break;
-        case iNPT: // Isotropic NPT
-            data->N_f = 3 * system->N + 2;
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
-        case bNVT: //berendensen NVT
-            data->N_f = 3 * system->N + 1; 
-            *Evolve = Velocity_Verlet_Berendsen_NVT;
-            break;
-        default:
-            break;
-    }
-    Compute_Kinetic_Energy( system, data );
-    /* init timing info for the host*/
-    data->timing.start = Get_Time( );
-    data->timing.total = data->timing.start;
-    data->timing.nbrs = 0;
-    data->timing.init_forces = 0;
-    data->timing.bonded = 0;
-    data->timing.nonb = 0;
-    data->timing.QEq = 0;
-    data->timing.matvecs = 0;
-void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, 
-        simulation_data *data, output_controls *out_control, 
-        evolve_function *Evolve )
-    Reset_Simulation_Data( data );
-    if( !control->restart )  
-        data->step = data->prev_steps = 0;
-    switch( control->ensemble ) {
-        case NVE:
-            data->N_f = 3 * system->N;
-            *Evolve = Cuda_Velocity_Verlet_NVE;
-            break;
-        case NVT:
-            data->N_f = 3 * system->N + 1;
-            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-            if( !control->restart || (control->restart && control->random_vel) ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->therm.v_xi_old = 0;
-                data->therm.xi = 0;
-#if defined(DEBUG_FOCUS)
-                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
-                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
-                        data->N_f, data->therm.v_xi );
-            }
-            *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
-            break;
-        case NPT: // Anisotropic NPT
-            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-            exit( UNKNOWN_OPTION );
-            data->N_f = 3 * system->N + 9;
-            if( !control->restart ) {
-                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-                        data->N_f * K_B * control->T );
-                data->therm.v_xi = data->therm.G_xi * control->dt;
-                data->iso_bar.eps = 0.33333 * log(system->box.volume);
-                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
-                //Compute_Pressure( system, data, workspace );
-            }
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
-        case sNPT: // Semi-Isotropic NPT
-            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-            exit( UNKNOWN_OPTION );
-            data->N_f = 3 * system->N + 4;
-            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
-            break;
-        case iNPT: // Isotropic NPT
-            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-            exit( UNKNOWN_OPTION );
-            data->N_f = 3 * system->N + 2;
-            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-            break;
-        case bNVT: //berendensen NVT
-            data->N_f = 3 * system->N + 1; 
-            *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
-            break;
-        default:
-            break;
-    }
-    Cuda_Compute_Kinetic_Energy (system, data);
-#ifdef __BUILD_DEBUG__
-    real t_E_Kin = 0;
-    t_E_Kin = data->E_Kin;
-    copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
-            REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
-    if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! 
-        data->therm.T = ALMOST_ZERO;
-#ifdef __BUILD_DEBUG__
-    if (check_zero (t_E_Kin, data->E_Kin)){
-        fprintf (stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin );
-        exit (1);
-    }
-    //validate_data ( system, data );
-    /* init timing info for the host*/
-    data->timing.start = Get_Time( );
-    data->timing.total = data->timing.start;
-    data->timing.nbrs = 0;
-    data->timing.init_forces = 0;
-    data->timing.bonded = 0;
-    data->timing.nonb = 0;
-    data->timing.QEq = 0;
-    data->timing.matvecs = 0;
-    /* init timing info for the device */
-    d_timing.start = Get_Time( );
-    d_timing.total = data->timing.start;
-    d_timing.nbrs = 0;
-    d_timing.init_forces = 0;
-    d_timing.bonded = 0;
-    d_timing.nonb = 0;
-    d_timing.QEq = 0;
-    d_timing.matvecs = 0;
-void Init_Workspace( reax_system *system, control_params *control, 
-        static_storage *workspace )
-    int i;
-    /* Allocate space for hydrogen bond list */
-    workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) );
-    /* bond order related storage  */
-    workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Deltap           = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->Delta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_lp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDelta_lp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->nlp_temp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->Clp          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
-    workspace->vlpex          = (real *) malloc( system->N * sizeof( real ) );
-    /* QEq storage */
-    //workspace->H        = NULL;
-    //workspace->L        = NULL;
-    //workspace->U        = NULL;
-    //
-    workspace->H.start        = NULL;
-    workspace->L.start        = NULL;
-    workspace->U.start        = NULL;
-    workspace->H.entries         = NULL;
-    workspace->L.entries         = NULL;
-    workspace->U.entries        = NULL;
-    workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
-    workspace->w        = (real *) calloc( system->N, sizeof( real ) );
-    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
-    workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
-    workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
-    workspace->b_prc    = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->b_prm    = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->s_t      = (real *) calloc( system->N * 2, sizeof( real ) );
-    workspace->s        = (real *) calloc( 5 * system->N, sizeof( real ) );
-    workspace->t        = (real *) calloc( 5 * system->N, sizeof( real ) );
-    // workspace->s_old    = (real *) calloc( system->N, sizeof( real ) );
-    // workspace->t_old    = (real *) calloc( system->N, sizeof( real ) );
-    // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
-    // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
-    for( i = 0; i < system->N; ++i ) {
-        workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta;
-        workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-        workspace->b_t[i] = -1.0;
-        workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-        workspace->b[i+system->N] = -1.0;
-    }
-    /* GMRES storage */
-    workspace->y  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->z  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->g  = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->hs = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->hc = (real *)  calloc( RESTART+1, sizeof( real ) );
-    workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) );
-    workspace->v  = (real *) calloc( (RESTART+1)*system->N, sizeof( real) );
-    workspace->h  = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) );
-    /* CG storage */
-    workspace->r = (real *) calloc( system->N, sizeof( real ) );
-    workspace->d = (real *) calloc( system->N, sizeof( real ) );
-    workspace->q = (real *) calloc( system->N, sizeof( real ) );
-    workspace->p = (real *) calloc( system->N, sizeof( real ) );
-    /* integrator storage */
-    workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
-    /* storage for analysis */
-    if( control->molec_anal || control->diffusion_coef )
-    {
-        workspace->mark = (int *) calloc( system->N, sizeof(int) );
-        workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
-    }
-    else 
-        workspace->mark = workspace->old_mark = NULL;
-    if( control->diffusion_coef )
-        workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
-    else workspace->x_old = NULL;
-    workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) );
-    workspace->realloc.num_far = -1;
-    workspace->realloc.Htop = -1;
-    workspace->realloc.hbonds = -1;
-    workspace->realloc.bonds = -1;
-    workspace->realloc.num_3body = -1;
-    workspace->realloc.gcell_atoms = -1;
-    Reset_Workspace( system, workspace );
-void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N)
-    int index = 0;
-    int count = 0;
-    int jicount = 0;
-    int end_index, gpu_index, gpu_end, k;
-    far_neighbor_data gpu, cpu;
-    /*
-       for (int i = 0; i < N ; i++ )
-       {
-       if (test[i] != start[i]) {
-       fprintf (stderr, "start index does not match \n");
-       exit (0);
-       }
-       if (test[i+1] != (end[i]) ){
-       fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]);
-       exit (0);
-       }
-       }
-     */
-    for (int i = 0; i < N; i++){
-        index = Start_Index (i, slist);
-        //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
-        for (int j = start[i]; j < end[i]; j++){
-            gpu = data[j];
-            if (i < data[j].nbr) continue;
-            /*
-               if (i < data[j].nbr) {
-            //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j);
-            int src = data[j].nbr;
-            int dest = i;
-            int x;
-            for (x = start[src]; x < end[src]; x++) {
-            if (data[x].nbr != dest) continue;
-            gpu = data[x];
-            cpu = data[j];
-            if (  (gpu.d != cpu.d) ||
-            (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-            (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-            fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, 
-            data[j].d, 
-            data[j].rel_box[0],
-            data[j].rel_box[1],
-            data[j].rel_box[2],
-            data[j].dvec[0], 
-            data[j].dvec[1], 
-            data[j].dvec[2] 
-            );
-            fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr,
-            data[x].d,
-            data[x].rel_box[0],
-            data[x].rel_box[1],
-            data[x].rel_box[2],
-            data[x].dvec[0],
-            data[x].dvec[1],
-            data[x].dvec[2]
-            );
-            jicount++;
-            }
-            break;
-            }
-            if (x >= end[src]) {
-            fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
-            exit (0);
-            }
-            continue;
-            }
-             */
-            cpu = slist->select.far_nbr_list[index];
-            //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){
-            //if ( (gpu->d != cpu->d) ){
-            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ||
-                    (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-                    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-                //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) ||
-                //        (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) {
-                //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){
-                fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
-                fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index);
-                /*
-                   fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
-                   fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
-                   fprintf (stdout, "d %f ,  %f \n", slist->select.far_nbr_list[index].d, data[j].d);
-                   fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", 
-                   cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
-                   gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
-                   fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", 
-                   cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
-                   gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
-                 */
-                count ++;
-            }
-            //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
-            index ++;
-            }
-            if (index != End_Index (i, slist))
-            {
-                fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, slist), End_Index(i, slist),
-                        start[i], end[i]);
-                exit (10);
-            }
-            }
-            fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d  reverse %d \n", count, jicount);
-            /*
-               for (int i = 0; i < N; i++) 
-               {
-               index = Start_Index (i, slist);
-               end_index = End_Index (i, slist);
-               gpu_index = start[i];
-               gpu_end = end[i];
-               for (int j = index; j < end_index; j++) 
-               {
-               far_neighbor_data *cpu = &slist->select.far_nbr_list[j];
-               far_neighbor_data *gpu;
-               for (k = gpu_index; k < gpu_end; k++) {
-               gpu = &data[k];
-               if (gpu->nbr == cpu->nbr) break;
-               }
-               if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); }
-               if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) ||
-               ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) ||
-               ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) {
-               fprintf (stderr, "Far neighbors does not match atom: %d \n", i );
-               fprintf (stderr, "neighbor %d ,  %d \n",  cpu->nbr, gpu->nbr);
-               fprintf (stderr, "d %d ,  %d \n", cpu->d, gpu->d);
-               fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", 
-               cpu->dvec[0], cpu->dvec[1], cpu->dvec[2],
-               gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] );
-               fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", 
-               cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2],
-               gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] );
-               fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end );
-               exit (1);
-               }
-               }
-               }
-             */
-        }
-        int Estimate_Device_Matrix (reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace, 
-                list **lists, output_controls *out_control )
-        {
-            int *indices, *Htop;
-            list *far_nbrs = dev_lists + FAR_NBRS;
-            int max_sparse_entries = 0;
-            real t1, t2;
-            indices = (int *) scratch;
-            cuda_memset ( indices, 0, INT_SIZE * system->N, RES_SCRATCH );
-            t1 = Get_Time ();
-            Estimate_Sparse_Matrix_Entries <<<BLOCKS, BLOCK_SIZE>>>
-                ( system->d_atoms, (control_params *)control->d_control, 
-                  (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, 
-                  *far_nbrs, system->N, indices );
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            t2 = Get_Timing_Info ( t1 );
-            //fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 );
-            Htop = (int *) malloc (INT_SIZE * (system->N + 1));
-            memset (Htop, 0, INT_SIZE * (system->N + 1));
-            copy_host_device (Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            for (int i = 0; i < system->N; i++) 
-            {
-                if (max_sparse_entries < Htop[i]) {
-                    max_sparse_entries = Htop[i];
-                }    
-            }
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, " Max sparse entries for this run are ---> %d \n", max_sparse_entries );
-            return max_sparse_entries * SAFE_ZONE;
-            //return max_sparse_entries;
-        }
-        void Allocate_Device_Matrix (reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace, 
-                list **lists, output_controls *out_control )
-        {
-            //Allocate space for the sparse Matrix entries here. 
-            system->max_sparse_matrix_entries = 
-                Estimate_Device_Matrix (system, control, data, workspace, lists, out_control );
-            dev_workspace->H.n = system->N ;
-            dev_workspace->H.m = system->N * system->max_sparse_matrix_entries;
-            Cuda_Init_Sparse_Matrix (&dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N );
-#ifdef __CUDA_MEM__
-            fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", 
-                    system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) );
-        }
-        void Cuda_Init_Lists( reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace, 
-                list **lists, output_controls *out_control )
-        {
-            int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
-            int *hb_top, *bond_top;
-            real t_start, t_elapsed;
-            grid *g = &( system->g );
-            int *d_indices = (int *) scratch;
-            int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
-            cuda_memset ( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
-#ifdef __BUILD_DEBUG__
-            for (int i = 0; i < g->max_nbrs; i ++) {
-                if ((g->nbrs[i][0] >= g->ncell[0]) ||
-                        (g->nbrs[i][1] >= g->ncell[1]) ||
-                        (g->nbrs[i][2] >= g->ncell[2]) ) {
-                    fprintf (stderr, " Grid Incorrectly built.... \n");
-                    exit (1);
-                }
-            }
-            dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
-            dim3 threadsperblock (system->g.max_atoms);
-#ifdef __BUILD_DEBUG__
-            fprintf (stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
-            fprintf (stderr, "Estimate Num  Neighbors with threads per block as %d \n", system->d_g.max_atoms);
-            fprintf (stderr, "Max nbrs %d \n", system->d_g.max_nbrs);
-            //First Bin atoms and they sync the host and the device for the grid.
-            //This will copy the atoms from host to device.
-            Cuda_Bin_Atoms (system, workspace);
-            Sync_Host_Device (&system->g, &system->d_g, cudaMemcpyHostToDevice );
-            Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
-                (system->d_atoms, system->d_g, system->d_box, 
-                 (control_params *)control->d_control, d_indices);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
-            memset (nbrs_indices , 0, INT_SIZE * (system->N + 1));
-            nbrs_indices [0] = 0;
-            copy_host_device (&nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
-            for (int i = 1; i <= system->N; i++)
-                nbrs_indices [i] += nbrs_indices [i-1];
-            num_nbrs = nbrs_indices [system->N] ;
-            system->num_nbrs = num_nbrs;
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]);
-            fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs);
-            list *far_nbrs = (dev_lists + FAR_NBRS);
-            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE) ) {
-                fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-                exit( INIT_ERR );
-            }
-#ifdef __CUDA_MEM__
-            fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", 
-                    num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
-            copy_host_device (nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
-            copy_host_device (nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
-            Cuda_Generate_Neighbor_Lists (system, workspace, control, false);
-#ifdef __BUILD_DEBUG__
-            int *end = (int *)malloc (sizeof (int) * system->N);
-            int *start = (int *) malloc (sizeof (int) * system->N );
-            copy_host_device (start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0);
-            copy_host_device (end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0);
-            far_neighbor_data *far_data = (far_neighbor_data *) 
-                malloc (FAR_NEIGHBOR_SIZE * num_nbrs);
-            copy_host_device (far_data, far_nbrs->select.far_nbr_list, 
-                    FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0);
-            compare_far_neighbors (nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N);
-            free (start);
-            free (end);
-            int *output, size;
-            size = INT_SIZE * 2 * system->N + 2;
-            output = (int *) malloc (size);
-            Cuda_Estimate_Storage_Sizes (system, control, output);
-            Htop = output[0];
-            num_3body  = output[1];
-            hb_top = &output[ 2 ]; 
-            bond_top = &output[ 2 + system->N ];
-#ifdef __DEBUG_CUDA__
-            int max_hbonds = 0;
-            int min_hbonds = 1000;
-            int max_bonds = 0;
-            int min_bonds = 1000;
-            for (int i = 0; i < system->N; i++) {
-                if ( max_hbonds < hb_top[i])
-                    max_hbonds = hb_top[i];
-                if (min_hbonds > hb_top[i])
-                    min_hbonds = hb_top[i];
-                if (max_bonds < bond_top [i])
-                    max_bonds = bond_top[i];
-                if (min_bonds > bond_top[i])
-                    min_bonds = bond_top[i];
-            }
-            fprintf (stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds );
-            fprintf (stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds );
-            fprintf (stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body );
-            Allocate_Device_Matrix (system, control, data, workspace, lists, out_control );
-            dev_workspace->num_H = 0;
-            if( control->hb_cut > 0 ) {
-                int *hbond_index = (int *) malloc ( INT_SIZE * system->N );
-                // init H indexes 
-                num_hbonds = 0;
-                for( i = 0; i < system->N; ++i )
-                    if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || 
-                            system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2  ) // H atom
-                        //hbond_index[i] = workspace->num_H++;
-                        hbond_index[i] = num_hbonds ++;
-                    else 
-                        hbond_index[i] = -1;
-                copy_host_device (hbond_index, dev_workspace->hbond_index, 
-                        system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX );
-                dev_workspace->num_H = num_hbonds;
-#ifdef __DEBUG_CUDA__
-                fprintf (stderr, "Device num_H --> %d \n", dev_workspace->num_H );
-                Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index, 
-                        hb_top, (dev_lists+HBONDS) );
-                num_hbonds = hb_top[system->N-1];
-                system->num_hbonds = num_hbonds;
-#ifdef __CUDA_MEM__
-                fprintf (stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", 
-                        sizeof (hbond_data) * num_hbonds / (1024*1024));
-#ifdef __DEBUG_CUDA__
-                fprintf (stderr, "Device Total number of HBonds --> %d \n", num_hbonds );
-                free (hbond_index);
-            }
-            // bonds list 
-            Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS );
-            num_bonds = bond_top[system->N-1];
-            system->num_bonds = num_bonds;
-#ifdef __CUDA_MEM__
-            fprintf (stderr, "Device memory allocated: Bonds list: %ld (MB) \n", 
-                    sizeof (bond_data) * num_bonds / (1024*1024));
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, "Device Total Bonds --> %d \n", num_bonds );
-            //    system->max_thb_intrs = num_3body;
-            // 3bodies list 
-            //if(!Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
-            //  fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-            //  exit( INIT_ERR );
-            //}
-            //fprintf( stderr, "***memory allocated: three_body = %ldMB\n", 
-            //   num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) );
-            //fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data));
-            //Free local resources
-            free (output);
-            free (nbrs_indices);
-        }
-        void Init_Lists( reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace, 
-                list **lists, output_controls *out_control )
-        {
-            int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
-            int *hb_top, *bond_top;
-            real t_start, t_elapsed;
-            num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
-            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) {
-                fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-                exit( INIT_ERR );
-            }
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-                    num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
-            t_start = Get_Time ();
-            Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control);
-            t_elapsed = Get_Timing_Info ( t_start );
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed );
-            Htop = 0;
-            hb_top = (int*) calloc( system->N, sizeof(int) );
-            bond_top = (int*) calloc( system->N, sizeof(int) );
-            num_3body = 0;
-            Estimate_Storage_Sizes( system, control, lists, 
-                    &Htop, hb_top, bond_top, &num_3body );
-            Allocate_Matrix( &(workspace->H), system->N, Htop );
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
-            fprintf( stderr, "memory allocated: H = %ldMB\n", 
-                    Htop * sizeof(sparse_matrix_entry) / (1024*1024) );
-            workspace->num_H = 0;
-            if( control->hb_cut > 0 ) {
-                /* init H indexes */
-                for( i = 0; i < system->N; ++i )
-                    if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
-                        workspace->hbond_index[i] = workspace->num_H++;
-                    else workspace->hbond_index[i] = -1;
-                Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, 
-                        hb_top, (*lists)+HBONDS );
-                num_hbonds = hb_top[system->N-1];
-#ifdef __DEBUG_CUDA__
-                fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds );
-#if defined(DEBUG_FOCUS)
-                fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
-                fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-                        num_hbonds * sizeof(hbond_data) / (1024*1024) );
-            }
-            /* bonds list */
-            Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
-            num_bonds = bond_top[system->N-1];
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
-            fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-                    num_bonds * sizeof(bond_data) / (1024*1024) );
-#ifdef __DEBUG_CUDA__
-            fprintf (stderr, " host num_3body : %d \n", num_3body);
-            fprintf (stderr, " host num_bonds : %d \n", num_bonds);
-            /* 3bodies list */
-            if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) {
-                fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-                exit( INIT_ERR );
-            }
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
-            fprintf( stderr, "memory allocated: 3-body = %ldMB\n", 
-                    num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
-            if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) {
-                fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
-                exit( INIT_ERR );
-            }
-            if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
-                fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
-                exit( INIT_ERR );
-            }
-            free( hb_top );
-            free( bond_top );
-        }
-        void Init_Out_Controls(reax_system *system, control_params *control, 
-                static_storage *workspace, output_controls *out_control)
-        {
-            char temp[1000];
-            /* Init trajectory file */
-            if( out_control->write_steps > 0 ) { 
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".trj" );
-                out_control->trj = fopen( temp, "w" );
-                out_control->write_header( system, control, workspace, out_control );
-            }
-            if( out_control->energy_update_freq > 0 ) {
-                /* Init out file */
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".out" );
-                out_control->out = fopen( temp, "w" );
-                fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n",
-                        "step", "total energy", "poten. energy", "kin. energy", 
-                        "temp.", "target", "volume", "press.", "target" );
-                fflush( out_control->out );
-                /* Init potentials file */
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".pot" );
-                out_control->pot = fopen( temp, "w" );
-                fprintf( out_control->pot, 
-                        "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-                        "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", 
-                        "etor", "econj", "evdw","ecoul", "epol" );
-                fflush( out_control->pot );
-                /* Init log file */
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".log" );
-                out_control->log = fopen( temp, "w" );
-                fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", 
-                        "step", "total", "neighbors", "init", "bonded", 
-                        "nonbonded", "QEq", "matvec" );
-            }
-            /* Init pressure file */
-            if( control->ensemble == NPT || 
-                    control->ensemble == iNPT || 
-                    control->ensemble == sNPT ) {
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".prs" );
-                out_control->prs = fopen( temp, "w" );
-                fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-                        "step", "norm_x", "norm_y", "norm_z", 
-                        "press_x", "press_y", "press_z", "target_p", "volume" );
-                fflush( out_control->prs );
-            }
-            /* Init molecular analysis file */
-            if( control->molec_anal ) {
-                sprintf( temp, "%s.mol", control->sim_name );
-                out_control->mol = fopen( temp, "w" );
-                if( control->num_ignored ) {
-                    sprintf( temp, "%s.ign", control->sim_name );
-                    out_control->ign = fopen( temp, "w" );
-                } 
-            }
-            /* Init electric dipole moment analysis file */
-            if( control->dipole_anal ) {
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".dpl" );
-                out_control->dpl = fopen( temp, "w" );
-                fprintf( out_control->dpl, 
-                        "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
-                fflush( out_control->dpl );
-            }
-            /* Init diffusion coef analysis file */
-            if( control->diffusion_coef ) {
-                strcpy( temp, control->sim_name );
-                strcat( temp, ".drft" );
-                out_control->drft = fopen( temp, "w" );
-                fprintf( out_control->drft, "Step     Type Count   Avg Squared Disp\n" );
-                fflush( out_control->drft );
-            }
-            /* open bond energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".ebond" );
-            out_control->ebond = fopen( temp, "w" );
-            /* open lone-pair energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".elp" );
-            out_control->elp = fopen( temp, "w" );
-            /* open overcoordination energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".eov" );
-            out_control->eov = fopen( temp, "w" );
-            /* open undercoordination energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".eun" );
-            out_control->eun = fopen( temp, "w" );
-            /* open angle energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".eval" );
-            out_control->eval = fopen( temp, "w" );
-            /* open penalty energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".epen" );
-            out_control->epen = fopen( temp, "w" );
-            /* open coalition energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".ecoa" );
-            out_control->ecoa = fopen( temp, "w" );
-            /* open hydrogen bond energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".ehb" );
-            out_control->ehb = fopen( temp, "w" );
-            /* open torsion energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".etor" );
-            out_control->etor = fopen( temp, "w" );
-            /* open conjugation energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".econ" );
-            out_control->econ = fopen( temp, "w" );
-            /* open vdWaals energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".evdw" );
-            out_control->evdw = fopen( temp, "w" );
-            /* open coulomb energy file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".ecou" );
-            out_control->ecou = fopen( temp, "w" );
-            /* open bond orders file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".fbo" );
-            out_control->fbo = fopen( temp, "w" );
-            /* open bond orders derivatives file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".fdbo" );
-            out_control->fdbo = fopen( temp, "w" );
-            /* open bond forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".fbond" );
-            out_control->fbond = fopen( temp, "w" );
-            /* open lone-pair forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".flp" );
-            out_control->flp = fopen( temp, "w" );
-            /* open overcoordination forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".fatom" );
-            out_control->fatom = fopen( temp, "w" );
-            /* open angle forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".f3body" );
-            out_control->f3body = fopen( temp, "w" );
-            /* open hydrogen bond forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".fhb" );
-            out_control->fhb = fopen( temp, "w" );
-            /* open torsion forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".f4body" );
-            out_control->f4body = fopen( temp, "w" );
-            /* open nonbonded forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".fnonb" );
-            out_control->fnonb = fopen( temp, "w" );
-            /* open total force file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".ftot" );
-            out_control->ftot = fopen( temp, "w" );
-            /* open coulomb forces file */
-            strcpy( temp, control->sim_name );
-            strcat( temp, ".ftot2" );
-            out_control->ftot2 = fopen( temp, "w" );
-            /* Error handling */
-            /* if ( out_control->out == NULL || out_control->pot == NULL || 
-               out_control->log == NULL || out_control->mol == NULL || 
-               out_control->dpl == NULL || out_control->drft == NULL ||       
-               out_control->pdb == NULL )
-               {
-               fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
-               exit( CANNOT_OPEN_OUTFILE );
-               }*/
-        }
-        void Initialize(reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace, list **lists, 
-                output_controls *out_control, evolve_function *Evolve)
-        {
-            Randomize();
-            Init_System( system, control, data );
-            Init_Simulation_Data( system, control, data, out_control, Evolve );
-            Init_Workspace( system, control, workspace );
-            Init_Lists( system, control, data, workspace, lists, out_control );
-            Init_Out_Controls( system, control, workspace, out_control );
-            /* These are done in forces.c, only forces.c can see all those functions */
-            Init_Bonded_Force_Functions( control );
-            Init_Force_Test_Functions( );
-            if( control->tabulate )
-                Make_LR_Lookup_Table( system, control );
-#if defined(DEBUG_FOCUS)
-            fprintf( stderr, "data structures have been initialized...\n" ); 
-        }
-        void Cuda_Initialize(reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace, list **lists, 
-                output_controls *out_control, evolve_function *Evolve)
-        {
-            Randomize ();
-            Cuda_Init_Scratch ();
-            //System
-            Cuda_Init_System (system);
-            Sync_Host_Device ( system, cudaMemcpyHostToDevice );
-            Cuda_Init_System (system, control, data );
-            //Simulation Data
-            copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
-                    cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
-            Cuda_Init_Simulation_Data (data);
-            //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
-            Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
-            Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
-            //static storage
-            Cuda_Init_Workspace_System ( system, dev_workspace );
-            Cuda_Init_Workspace ( system, control, dev_workspace );
-            Cuda_Init_Workspace_Device (workspace);
-            //control
-            Cuda_Init_Control (control);
-            //Grid
-            Cuda_Init_Grid (&system->g, &system->d_g );
-            //lists
-            Cuda_Init_Lists (system, control, data, workspace, lists, out_control );
-            Init_Out_Controls( system, control, workspace, out_control );
-            if( control->tabulate ) {
-                real start, end;
-                start = Get_Time ();
-                Make_LR_Lookup_Table( system, control );
-                copy_LR_table_to_device (system, control );
-                end = Get_Timing_Info ( start );
-#ifdef __DEBUG_CUDA__
-                fprintf (stderr, "Done copying the LR table to the device ---> %f \n", end );
-            }
-        }
diff --git a/PuReMD-GPU/src/init_md.h b/PuReMD-GPU/src/init_md.h
index 65e12348eef4830a6eede11c881b9f46ffa283e8..8c23806594a8f2b107ddb884efbf68e7b5fe27ff 100644
--- a/PuReMD-GPU/src/init_md.h
+++ b/PuReMD-GPU/src/init_md.h
@@ -23,10 +23,22 @@
 #include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
 void Initialize( reax_system*, control_params*, simulation_data*,
-                 static_storage*, list**, output_controls*, evolve_function* );
+        static_storage*, list**, output_controls*, evolve_function* );
+void Generate_Initial_Velocities(reax_system *, real );
+void Init_Out_Controls(reax_system *, control_params *, static_storage *,
+        output_controls *);
+#ifdef __cplusplus
-void Cuda_Initialize( reax_system*, control_params*, simulation_data*,
-                      static_storage*, list**, output_controls*, evolve_function* );
diff --git a/PuReMD-GPU/src/integrate.cu b/PuReMD-GPU/src/integrate.c
similarity index 65%
rename from PuReMD-GPU/src/integrate.cu
rename to PuReMD-GPU/src/integrate.c
index d079028653d79dfeb5117fa6d95f33b700dee5b1..482a9c89a302c052e9ac44ae2de446c61b1c6a3e 100644
--- a/PuReMD-GPU/src/integrate.cu
+++ b/PuReMD-GPU/src/integrate.c
@@ -19,6 +19,7 @@
 #include "integrate.h"
 #include "allocate.h"
 #include "box.h"
 #include "forces.h"
@@ -32,10 +33,6 @@
 #include "vector.h"
 #include "list.h"
-#include "cuda_utils.h"
-#include "reduction.h"
-#include "validation.h"
 void Velocity_Verlet_NVE(reax_system* system, control_params* control, 
         simulation_data *data, static_storage *workspace, 
@@ -49,6 +46,7 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
     dt_sqr = SQR(dt);
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)  
     fprintf( stderr, "step%d: ", data->step );
@@ -63,6 +61,7 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
         rvec_ScaledAdd( system->atoms[i].v, 
                 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
 #if defined(DEBUG_FOCUS)  
     fprintf( stderr, "verlet1 - ");
@@ -70,106 +69,25 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
     Reallocate( system, workspace, lists, renbr );
     Reset( system, control, data, workspace, lists );
     if( renbr )
+    {
         Generate_Neighbor_Lists( system, control, data, workspace, 
                 lists, out_control );  
+    }
     Compute_Forces( system, control, data, workspace, lists, out_control );
-    for( i = 0; i < system->N; i++ ) {
+    for( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         rvec_ScaledAdd( system->atoms[i].v, 
                 0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "verlet2\n");
-//Cuda Function -- Velocity Verlet NVE
-GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, 
-        single_body_parameters *sbp, 
-        simulation_box *box,
-        int N, real dt)
-    real inv_m, dt_sqr;
-    rvec dx;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    dt_sqr = SQR(dt);
-    //for( i = 0; i < system->N; i++ ) {
-    inv_m = 1.0 / sbp[atoms[i].type].mass;
-    rvec_ScaledSum( dx, dt, atoms[i].v, 
-            0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f );
-    Inc_on_T3( atoms[i].x, dx, box );
-    rvec_ScaledAdd( atoms[i].v, 
-            0.5 * dt * -F_CONV * inv_m, atoms[i].f );
-    //}
-GLOBAL void Cuda_Velocity_Verlet_NVE_atoms2 (reax_atom *atoms, single_body_parameters *sbp, int N, real dt)
-    real inv_m;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    //for( i = 0; i < system->N; i++ ) {
-    inv_m = 1.0 / sbp[atoms[i].type].mass;
-    rvec_ScaledAdd( atoms[i].v, 
-            0.5 * dt * -F_CONV * inv_m, atoms[i].f );
-    //}
-void Cuda_Velocity_Verlet_NVE(reax_system* system, control_params* control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
-    int i, steps, renbr;
-    real inv_m, dt, dt_sqr;
-    rvec dx;
-    int blocks, block_size;
-    dt = control->dt;
-    dt_sqr = SQR(dt);
-    steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "step%d: ", data->step );
-    compute_blocks (&blocks, &block_size, system->N);
-    Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>>
-        (system->d_atoms, system->reaxprm.d_sbp, 
-         (simulation_box *)system->d_box, system->N, dt);
-    cudaThreadSynchronize ();
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "verlet1 - ");
-    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
-    Cuda_Reset( system, control, data, workspace, lists );
-    if( renbr ) {
-        Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true);
-    }
-    Cuda_Compute_Forces( system, control, data, workspace, lists, out_control );
-    Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>>
-        (system->d_atoms, system->reaxprm.d_sbp, system->N, dt);
-    cudaThreadSynchronize ();
 #if defined(DEBUG_FOCUS)  
     fprintf( stderr, "verlet2\n");
 void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
         control_params* control, 
         simulation_data *data, 
@@ -188,6 +106,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     therm = &( data->therm );
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d: ", data->step );
@@ -197,7 +116,8 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     /* Compute x(t + dt) and copy old forces */
-    for (i=0; i < system->N; i++) {
+    for (i=0; i < system->N; i++)
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v,
@@ -209,6 +129,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     /* Compute xi(t + dt) */
     therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
@@ -217,14 +138,17 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     Reset( system, control, data, workspace, lists );
     if( renbr )
+    {
         Generate_Neighbor_Lists( system, control, data, workspace, 
                 lists, out_control );
+    }
     /* Calculate Forces at time (t + dt) */
     Compute_Forces( system,control,data, workspace, lists, out_control );
     /* Compute iteration constants for each atom's velocity */
-    for( i = 0; i < system->N; ++i ) {
+    for( i = 0; i < system->N; ++i )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         rvec_Scale( workspace->v_const[i], 
@@ -241,7 +165,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
     E_kin_new = G_xi_new = v_xi_old = 0;
     itr = 0;
@@ -258,7 +181,8 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
         //print_sys_atoms (system);
-        for( i = 0; i < system->N; ++i ) {
+        for( i = 0; i < system->N; ++i )
+        {
             rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
             E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * 
@@ -272,6 +196,7 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
         G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
                 data->N_f * K_B * control->T );
         v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
 #if defined(DEBUG)
         fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
                 itr, G_xi_new, v_xi_new, v_xi_old );
@@ -283,7 +208,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
     fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
 #ifndef __BUILD_DEBUG__
     therm->v_xi_old = therm->v_xi;
     therm->v_xi = v_xi_new;
@@ -296,215 +220,6 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
-//Cuda Function -- Velocity_Verlet_Nose_Hoover_NVT_Klein
-GLOBAL void Compute_X_t_dt (real dt, real dt_sqr, thermostat p_therm,
-        reax_atom *atoms, single_body_parameters *sbp, 
-        simulation_box *box,
-        static_storage p_workspace, int N)
-    real inv_m;
-    rvec dx;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    static_storage *workspace = &p_workspace;
-    thermostat *therm = &p_therm;
-    /* Compute x(t + dt) and copy old forces */
-    //for (i=0; i < system->N; i++) {
-    inv_m = 1.0 / sbp[atoms[i].type].mass;
-    rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v,
-            0.5 * dt_sqr * inv_m * -F_CONV, atoms[i].f );
-    Inc_on_T3( atoms[i].x, dx, box );
-    rvec_Copy( workspace->f_old[i], atoms[i].f );
-    //}
-GLOBAL void Update_Velocity (reax_atom *atoms, single_body_parameters *sbp, 
-        static_storage p_workspace, real dt, thermostat p_therm, 
-        int N)
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    real inv_m;
-    static_storage *workspace = &p_workspace;
-    thermostat *therm = &p_therm;
-    //for( i = 0; i < system->N; ++i ) {
-    inv_m = 1.0 / sbp[atoms[i].type].mass;
-    rvec_Scale( workspace->v_const[i], 
-            1.0 - 0.5 * dt * therm->v_xi, atoms[i].v );
-    rvec_ScaledAdd( workspace->v_const[i], 
-            0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
-    rvec_ScaledAdd( workspace->v_const[i], 
-            0.5 * dt * inv_m * -F_CONV, atoms[i].f );
-    //}
-GLOBAL void E_Kin_Reduction (reax_atom *atoms, static_storage p_workspace,
-        single_body_parameters *sbp, 
-        real *per_block_results, real coef_v, const size_t n)
-    extern __shared__ real sdata[];
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    real x = 0;
-    static_storage *workspace = &p_workspace;
-    if(i < n)
-    {
-        rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] );
-        x = ( 0.5 * sbp[atoms[i].type].mass * 
-                rvec_Dot( atoms[i].v, atoms[i].v ) );
-    }
-    sdata[threadIdx.x] = x;
-    __syncthreads();
-    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-    {
-        if(threadIdx.x < offset)
-        {   
-            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-        }   
-        __syncthreads();
-    }
-    if(threadIdx.x == 0)
-    {
-        per_block_results[blockIdx.x] = sdata[0];
-    }
-void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
-        control_params* control, 
-        simulation_data *data, 
-        static_storage *workspace, 
-        list **lists, 
-        output_controls *out_control )
-    int i, itr, steps, renbr;
-    real inv_m, coef_v, dt, dt_sqr;
-    real E_kin_new, G_xi_new, v_xi_new, v_xi_old;
-    rvec dx;
-    thermostat *therm;
-    real *results = (real *)scratch;
-    dt = control->dt;
-    dt_sqr = SQR( dt );
-    therm = &( data->therm );
-    steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old);
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d: ", data->step );
-    Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>>
-        (dt, dt_sqr, data->therm, system->d_atoms, 
-         system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    /* Compute xi(t + dt) */
-    therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "verlet1 - " );
-    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
-    Cuda_Reset( system, control, data, workspace, lists );
-    if( renbr ) {
-        //generate_neighbor_lists here
-        Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true);
-    }
-    /* Calculate Forces at time (t + dt) */
-    Cuda_Compute_Forces( system,control,data, workspace, lists, out_control );
-    /* Compute iteration constants for each atom's velocity */
-    Update_Velocity <<< BLOCKS, BLOCK_SIZE >>>
-        (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace,
-         dt, *therm, system->N );
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-    v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
-    E_kin_new = G_xi_new = v_xi_old = 0;
-    itr = 0;
-    do {
-        itr++;      
-        /* new values become old in this iteration */
-        v_xi_old = v_xi_new;
-        coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-        E_kin_new = 0;
-        /*reduction for the E_Kin_new here*/
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
-        cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH );
-        E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
-            (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, 
-             results, coef_v, system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
-            (results, results + BLOCKS_POW_2, BLOCKS_POW_2);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); 
-        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
-                data->N_f * K_B * control->T );
-        v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
-#if defined(DEBUG)
-        fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
-                itr, G_xi_new, v_xi_new, v_xi_old );
-    }
-    while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
-    therm->v_xi_old = therm->v_xi;
-    therm->v_xi = v_xi_new;
-    therm->G_xi = G_xi_new;  
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr,"vel scale\n" );
-//Cuda Function -- Velocity_Verlet_Nose_Hoover_NVT_Klein
 /* uses Berendsen-type coupling for both T and P. 
    All box dimensions are scaled by the same amount, 
    there is no change in the angles between axes. */
@@ -522,6 +237,7 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)
     //fprintf( out_control->prs, 
     //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
@@ -530,7 +246,8 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
     /* velocity verlet, 1st part */
-    for( i = 0; i < system->N; i++ ) {
+    for( i = 0; i < system->N; i++ )
+    {
         inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
         /* Compute x(t + dt) */
         rvec_ScaledSum( dx, dt, system->atoms[i].v, 
@@ -546,6 +263,7 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
 #if defined(DEBUG_FOCUS)  
     fprintf( stderr, "verlet1 - " );
@@ -574,6 +292,7 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
     //Compute_Kinetic_Energy( system, data );   
     Compute_Pressure_Isotropic( system, control, data, out_control );
 #if defined(DEBUG_FOCUS)  
     fprintf( stderr, "verlet2 - " );
@@ -633,6 +352,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)
     //fprintf( out_control->prs, 
     //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
@@ -657,6 +377,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
           0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "verlet1 - " );
@@ -685,6 +406,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
     //Compute_Kinetic_Energy( system, data );   
     Compute_Pressure_Isotropic( system, control, data, out_control );
 #if defined(DEBUG_FOCUS)  
     fprintf( stderr, "verlet2 - " );
@@ -730,14 +452,12 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
 /*                                              */
 void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, 
@@ -810,7 +530,6 @@ void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system,
 void Velocity_Verlet_Isotropic_NPT( reax_system* system, 
         control_params* control, 
         simulation_data *data,
@@ -983,8 +702,6 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
 /* uses Berendsen-type coupling for both T and P. 
    All box dimensions are scaled by the same amount, 
    there is no change in the angles between axes. */
@@ -1006,6 +723,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
 #if defined(DEBUG_FOCUS)
     fprintf( stderr, "step%d\n", data->step );
     dt = control->dt;
     steps = data->step - data->prev_steps;
     renbr = (steps % control->reneighbor == 0);
@@ -1020,6 +738,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
         /* Compute v(t + dt/2) */
         rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
 #if defined(DEBUG_FOCUS)
     fprintf(stderr, "step%d: verlet1 done\n", data->step);
@@ -1040,6 +759,7 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
         /* Compute v(t + dt) */
         rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
 #if defined(DEBUG_FOCUS)  
     fprintf(stderr, "step%d: verlet2 done\n", data->step);
@@ -1065,182 +785,3 @@ void Velocity_Verlet_Berendsen_NVT( reax_system* system,
             data->step );
-GLOBAL void ker_update_velocity_1 (reax_atom *atoms,
-        single_body_parameters *sbp,
-        real dt,
-        simulation_box *box,
-        int N)
-    real inv_m;
-    rvec dx;
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= N ) return;
-    /* velocity verlet, 1st part */
-    //for( i = 0; i < system->n; i++ ) { 
-    atom = &(atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute x(t + dt) */
-    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-    rvec_Add( atom->x, dx );
-    /* Metin's suggestion to rebox the atoms */
-    /* bNVT fix */
-    Inc_on_T3( atoms[i].x, dx, box );
-    /* bNVT fix */
-    /* Compute v(t + dt/2) */
-    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-    //}
-void bNVT_update_velocity_part1 (reax_system *system, simulation_box *box, real dt)
-    ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>>
-        (system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-GLOBAL void ker_update_velocity_2 (reax_atom *atoms,
-        single_body_parameters *sbp,
-        real dt,
-        int N)
-    reax_atom *atom;
-    real inv_m;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= N ) return;
-    /* velocity verlet, 2nd part */
-    //for( i = 0; i < system->n; i++ ) { 
-    atom = &(atoms[i]);
-    inv_m = 1.0 / sbp[atom->type].mass;
-    /* Compute v(t + dt) */
-    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    //}
-void bNVT_update_velocity_part2 (reax_system *system, real dt)
-    ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>>
-        (system->d_atoms, system->reaxprm.d_sbp, dt, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-GLOBAL void ker_scale_velocities (reax_atom *atoms, real lambda, int N)
-    reax_atom *atom;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= N ) return;
-    /* Scale velocities and positions at t+dt */
-    //for( i = 0; i < system->n; ++i ) {
-    atom = &(atoms[i]);
-    rvec_Scale( atom->v, lambda, atom->v );
-    //}
-void bNVT_scale_velocities (reax_system *system, real lambda)
-    ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>>
-        (system->d_atoms, lambda, system->N);
-    cudaThreadSynchronize ();
-    cudaCheckError ();
-void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system,
-        control_params* control,
-        simulation_data *data,
-        static_storage *workspace,
-        list **lists,
-        output_controls *out_control
-        )
-    int i, steps, renbr;
-    real inv_m, dt, lambda;
-    rvec dx;
-    reax_atom *atom;
-#if defined(DEBUG_FOCUS)
-    fprintf( stderr, "step%d\n", data->step );
-    dt = control->dt;
-    steps = data->step - data->prev_steps;
-    renbr = (steps % control->reneighbor == 0);
-    /* velocity verlet, 1st part 
-       for( i = 0; i < system->N; i++ ) { 
-       atom = &(system->atoms[i]);
-       inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
-    // Compute x(t + dt) 
-    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-    rvec_Add( atom->x, dx );
-    // Compute v(t + dt/2) 
-    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-    }
-     */
-    bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt);
-#if defined(DEBUG_FOCUS)
-    fprintf(stderr, "step%d: verlet1 done\n", data->step);
-    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
-    Cuda_Reset( system, control, data, workspace, lists );
-    if( renbr ) {
-        Cuda_Generate_Neighbor_Lists( system, workspace, control, true);
-    }
-    Cuda_Compute_Forces( system, control, data, workspace,
-            lists, out_control );
-    /* velocity verlet, 2nd part 
-       for( i = 0; i < system->N; i++ ) {
-       atom = &(system->atoms[i]);
-       inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
-    // Compute v(t + dt) 
-    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-    }
-     */
-    bNVT_update_velocity_part2 (system, dt);
-#if defined(DEBUG_FOCUS)  
-    fprintf(stderr, "step%d: verlet2 done\n", data->step);
-    /* temperature scaler */
-    Cuda_Compute_Kinetic_Energy( system, data );
-    //get the latest temperature from the device to the host.
-    copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm,
-            sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-    if( lambda < MIN_dT )
-        lambda = MIN_dT;
-    else if (lambda > MAX_dT )
-        lambda = MAX_dT;
-    lambda = SQRT( lambda );
-    //fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda);
-    /* Scale velocities and positions at t+dt 
-       for( i = 0; i < system->N; ++i ) {
-       atom = &(system->atoms[i]);
-       rvec_Scale( atom->v, lambda, atom->v );
-       }
-     */
-    bNVT_scale_velocities (system, lambda);
-    Cuda_Compute_Kinetic_Energy( system, data );
-#if defined(DEBUG_FOCUS)  
-    fprintf( stderr, "step%d: scaled velocities\n",
-            data->step );
diff --git a/PuReMD-GPU/src/integrate.h b/PuReMD-GPU/src/integrate.h
index 945b29fa09fb59e8e801b55a7527ca88814e7bc2..6f5848f0de84e8a50ef2c5090194618b61f185fc 100644
--- a/PuReMD-GPU/src/integrate.h
+++ b/PuReMD-GPU/src/integrate.h
@@ -24,24 +24,19 @@
 #include "mytypes.h"
 void Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
-                          static_storage*, list**, output_controls* );
-void Cuda_Velocity_Verlet_NVE( reax_system*, control_params*, simulation_data*,
-                               static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Velocity_Verlet_Nose_Hoover_NVT( reax_system*, control_params*,
-                                      simulation_data*, static_storage*,
-                                      list**, output_controls* );
-void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
         simulation_data*, static_storage*,
         list**, output_controls* );
-void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
+void Velocity_Verlet_Nose_Hoover_NVT_Klein( reax_system*, control_params*,
         simulation_data*, static_storage*,
         list**, output_controls* );
 void Velocity_Verlet_Flexible_NPT( reax_system*, control_params*,
-                                   simulation_data*, static_storage*,
-                                   list**, output_controls* );
+        simulation_data*, static_storage*,
+        list**, output_controls* );
 void Velocity_Verlet_Isotropic_NPT( reax_system*, control_params*,
-                                    simulation_data*, static_storage*,
-                                    list**, output_controls* );
+        simulation_data*, static_storage*,
+        list**, output_controls* );
 void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system*, control_params*,
         simulation_data*, static_storage*,
         list**, output_controls* );
@@ -50,9 +45,7 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system*, control_params*,
         static_storage*, list**,
         output_controls* );
 void Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* ,
-                                    simulation_data *, static_storage *,
-                                    list **, output_controls * );
-void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* , control_params* ,
         simulation_data *, static_storage *,
         list **, output_controls * );
diff --git a/PuReMD-GPU/src/lin_alg.c b/PuReMD-GPU/src/lin_alg.c
new file mode 100644
index 0000000000000000000000000000000000000000..cb141d475b0e2cf702901ed551287e0e238cdcd6
--- /dev/null
+++ b/PuReMD-GPU/src/lin_alg.c
@@ -0,0 +1,676 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "lin_alg.h"
+#include "list.h"
+#include "vector.h"
+#include "index_utils.h"
+/* Sparse matrix-vector product: b = A * x.
+ *
+ * Every entry of a row except its last one is applied twice: val * x[i] is
+ * added to b[j] and val * x[j] is added to b[i].  The last entry of the row
+ * is applied once, to b[i] only (presumably A stores one triangle of a
+ * symmetric matrix with the diagonal last in each row -- TODO confirm).
+ *
+ * A: sparse matrix; row i spans entries[ start[i] .. start[i+1]-1 ]
+ * x: input vector, read at indices 0 .. A->n-1
+ * b: output vector, overwritten at indices 0 .. A->n-1 */
+void Sparse_MatVec( sparse_matrix *A, real *x, real *b )
+    int i, j, k, n, si, ei;
+    real H;
+    n = A->n;
+    /* clear the whole result first: the b[j] updates below are not limited
+       to the row currently being processed */
+    for( i = 0; i < n; ++i )
+        b[i] = 0;
+    for( i = 0; i < n; ++i ) {
+        si = A->start[i];
+        /* stop one entry short of the row end; that entry is handled after the loop */
+        ei = A->start[i+1]-1;
+        for( k = si; k < ei; ++k ) {
+            j = A->entries[k].j;
+            H = A->entries[k].val;
+            b[j] += H * x[i]; 
+            b[i] += H * x[j];
+        }
+        // the diagonal entry is the last one in the row; k == ei at this
+        // point, which relies on every row storing at least one entry
+        b[i] += A->entries[k].val * x[i]; 
+    }
+/* Forward substitution over the rows of L, from row 0 up to row L->n-1.
+ *
+ * For each row i: y[i] starts as b[i], val * y[j] is subtracted for every
+ * entry of the row except the last, and the result is divided by the value
+ * of the last entry.  This solves L * y = b provided L is lower triangular
+ * with the diagonal stored last in each row (presumably the layout produced
+ * by the factorization code -- TODO confirm).
+ *
+ * L: sparse factor; row i spans entries[ start[i] .. start[i+1]-1 ]
+ * b: right-hand side, b[i] is read once at the start of row i
+ * y: solution, y[i] is final once row i has been processed
+ *
+ * b[i] is read before y[i] is written, so b and y may be the same buffer;
+ * PGMRES in this file calls it that way. */
+void Forward_Subs( sparse_matrix *L, real *b, real *y )
+    int i, pj, j, si, ei;
+    real val;
+    for( i = 0; i < L->n; ++i ) {
+        y[i] = b[i];
+        si = L->start[i];
+        ei = L->start[i+1];
+        /* all entries but the last one of the row */
+        for( pj = si; pj < ei-1; ++pj ){
+            j = L->entries[pj].j;
+            val = L->entries[pj].val;
+            y[i] -= val * y[j];
+        }
+        /* pj == ei-1 here for a non-empty row: the last entry is the divisor */
+        y[i] /= L->entries[pj].val;
+    }
+/* Backward substitution over the rows of U, from row U->n-1 down to row 0.
+ *
+ * For each row i: x[i] starts as y[i], val * x[j] is subtracted for every
+ * entry of the row except the first, and the result is divided by the value
+ * of the first entry.  This solves U * x = y provided U is upper triangular
+ * with the diagonal stored first in each row (presumably the layout produced
+ * by the factorization code -- TODO confirm).
+ *
+ * U: sparse factor; row i spans entries[ start[i] .. start[i+1]-1 ]
+ * y: right-hand side, y[i] is read once at the start of row i
+ * x: solution, x[i] is final once row i has been processed
+ *
+ * y[i] is read before x[i] is written, so y and x may be the same buffer;
+ * PGMRES in this file calls it that way. */
+void Backward_Subs( sparse_matrix *U, real *y, real *x )
+    int i, pj, j, si, ei;
+    real val;
+    for( i = U->n-1; i >= 0; --i ) {
+        x[i] = y[i];
+        si = U->start[i];
+        ei = U->start[i+1];
+        /* skip the first entry of the row; it is used as the divisor below */
+        for( pj = si+1; pj < ei; ++pj ){
+            j = U->entries[pj].j;
+            val = U->entries[pj].val;
+            x[i] -= val * x[j];
+        }
+        x[i] /= U->entries[si].val;
+    }
+/* Restarted GMRES with a diagonal pre-conditioner.
+ *
+ * Updates x in place so that H * x approaches b, with both sides scaled
+ * entry-wise by workspace->Hdia_inv.  Each outer iteration recomputes the
+ * residual from the current x, runs at most RESTART inner steps and adds the
+ * resulting correction to x.  At most MAX_ITR outer iterations are run.
+ *
+ * workspace: scratch storage; b_prc, b_prm, v, h, hc, hs, g and y are
+ *            overwritten, Hdia_inv is only read
+ * H:         system matrix, applied through Sparse_MatVec
+ * b:         right-hand side, N = H->n entries
+ * tol:       threshold compared against fabs(g[j]) / Norm(b)
+ * x:         initial guess on entry, updated in place
+ * fout:      only referenced by the commented-out diagnostics near the end
+ * system:    system->N is passed to index_wkspace_sys when addressing v
+ *
+ * Returns itr * (RESTART+1) + j + 1.  The same expression is returned
+ * whether or not the MAX_ITR limit was reached; the failure case only adds
+ * a message on stderr.
+ *
+ * NOTE(review): bnorm and g[0] are used as divisors without a zero check, so
+ * an all-zero b or an x that already solves the system looks like it would
+ * divide by zero -- confirm callers never pass those. */
+int GMRES( static_storage *workspace, sparse_matrix *H, 
+        real *b, real tol, real *x, FILE *fout, reax_system* system)
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    N = H->n;
+    /* norm of the unscaled rhs; both convergence tests below divide by it */
+    bnorm = Norm( b, N );
+    /* apply the diagonal pre-conditioner to rhs */
+    for( i = 0; i < N; ++i )
+        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        Sparse_MatVec( H, x, workspace->b_prm );      
+        for( i = 0; i < N; ++i )
+            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */    
+        /* basis vector 0 is built from b_prc and b_prm with weights 1 and -1
+           (presumably Vector_Sum forms dest = c1*a + c2*b -- TODO confirm);
+           g[0] keeps its norm and the vector is then scaled by 1/g[0] */
+        Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
+        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system->N)], N );
+        Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system->N) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system->N)], N );
+        /* GMRES inner-loop */
+        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
+            for( k = 0; k < N; ++k )  
+                workspace->v[ index_wkspace_sys (j+1,k,system->N)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ 
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i <= j; i++ ) {
+                workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
+                        -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+            }
+            /* h(j+1,j) holds the norm of what is left; basis vector j+1 is scaled by its inverse */
+            workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
+                    1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            for( i = 0; i <= j; i++ )    {
+                /* rotations 0 .. j-1 were stored by earlier inner steps and are
+                   re-applied to column j; rotation j is computed here, after
+                   those have already updated h(j,j) */
+                if( i == j ) {
+                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
+                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
+                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
+                }
+                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
+                    workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ];
+                tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + 
+                    workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ];
+                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
+                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
+            } 
+            /* apply Givens rotations to the rhs as well */
+            tmp1 =  workspace->hc[j] * workspace->g[j];
+            tmp2 = -workspace->hs[j] * workspace->g[j];
+            workspace->g[j] = tmp1;
+            /* g[j+1] is the quantity the inner-loop condition tests next */
+            workspace->g[j+1] = tmp2;
+            // fprintf( stderr, "h: " );
+            // for( i = 0; i <= j+1; ++i )
+            //  fprintf( stderr, "%.6f ", workspace->h[i][j] );
+            // fprintf( stderr, "\n" );
+            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
+        }
+        /* solve Hy = g.
+           H is now upper-triangular, do back-substitution */
+        /* j is the number of inner steps actually taken in this cycle */
+        for( i = j-1; i >= 0; i-- ) {
+            temp = workspace->g[i];      
+            for( k = j-1; k > i; k-- )
+                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
+        }
+        /* update x = x_0 + Vy */
+        for( i = 0; i < j; i++ )
+            Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+        /* stopping condition */
+        if( fabs(workspace->g[j]) / bnorm <= tol )
+            break;
+    }
+    // Sparse_MatVec( H, x, workspace->b_prm );
+    // for( i = 0; i < N; ++i )
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+    // for( i = 0; i < N; ++i )
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
+    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+    //          itr, j, fabs( workspace->g[j] ) / bnorm );
+    // data->timing.matvec += itr * RESTART + j;
+    /* itr only reaches MAX_ITR when the outer loop ran out without a break */
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        // return -1;
+        return itr * (RESTART+1) + j + 1;
+    }
+    return itr * (RESTART+1) + j + 1;
+/* Restarted, diagonally pre-conditioned GMRES that orthogonalizes with
+ * Householder reflections (unit vectors u[]) rather than Gram-Schmidt.
+ *
+ * Updates x in place so that H * x approaches b, with both sides scaled
+ * entry-wise by workspace->Hdia_inv.  At most MAX_ITR outer iterations of
+ * at most RESTART inner steps each are run.
+ *
+ * workspace: b_prc, b_prm, h, hc, hs and y are overwritten, Hdia_inv is read
+ * H:         system matrix, applied through Sparse_MatVec
+ * b:         right-hand side, N = H->n entries
+ * tol:       threshold compared against fabs(w[j]) / Norm(b)
+ * x:         initial guess on entry, updated in place
+ * fout:      only referenced by the commented-out diagnostics near the end
+ * system:    not referenced in this function
+ *
+ * Returns itr * (RESTART+1) + j + 1.  The same expression is returned
+ * whether or not the MAX_ITR limit was reached; the failure case only adds
+ * a message on stderr.
+ *
+ * NOTE(review): v, z and u are local arrays with room for 10000 reals per
+ * vector and N = H->n is never checked against that bound; z and u together
+ * take 2 * (RESTART+2) * 10000 reals of automatic storage.  Confirm that N
+ * cannot exceed 10000 and that the stack is large enough.
+ * NOTE(review): bnorm and Norm( u[0], N ) are used as divisors without a
+ * zero check -- confirm callers never pass an all-zero b. */
+int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, 
+        real *b, real tol, real *x, FILE *fout, reax_system *system)
+    int  i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    real v[10000], z[RESTART+2][10000], w[RESTART+2];
+    real u[RESTART+2][10000];
+    N = H->n;
+    /* norm of the unscaled rhs; both convergence tests below divide by it */
+    bnorm = Norm( b, N );
+    /* apply the diagonal pre-conditioner to rhs */
+    for( i = 0; i < N; ++i )
+        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
+    // memset( x, 0, sizeof(real) * N );
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* compute z = r0 */
+        Sparse_MatVec( H, x, workspace->b_prm );      
+        for( i = 0; i < N; ++i )
+            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
+        Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
+        /* w is the rotated right-hand side; only w[0] starts non-zero */
+        Vector_MakeZero( w, RESTART+1 );
+        w[0] = Norm( z[0], N );
+        /* u[0]: copy of the residual with its first component pushed away
+           from zero by the residual norm, then scaled to unit length */
+        Vector_Copy( u[0], z[0], N );
+        u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
+        Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
+        /* w[0] takes the sign opposite to u[0][0] */
+        w[0]    *= ( u[0][0] < 0.0 ?  1 :-1 );
+        // fprintf( stderr, "\n\n%12.6f\n", w[0] );
+        /* GMRES inner-loop */
+        for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) {
+            /* compute v_j */
+            /* z[j] = e_j - 2 * u[j][j] * u[j], then reflections j-1 .. 0 are
+               applied to it, each one only from its own index i onwards */
+            Vector_Scale( z[j], -2 * u[j][j], u[j], N );
+            z[j][j] += 1.; /* due to e_j */
+            for( i = j-1; i >= 0; --i )
+                Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i );
+            /* matvec */
+            Sparse_MatVec( H, z[j], v );
+            for( k = 0; k < N; ++k )
+                v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
+            /* apply reflections 0 .. j to the new vector, in increasing order */
+            for( i = 0; i <= j; ++i )
+                Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i );
+            if( !Vector_isZero( v + (j+1), N - (j+1) ) ) {
+                /* compute the HouseHolder unit vector u_j+1 */
+                for( i = 0; i <= j; ++i )  
+                    u[j+1][i] = 0;
+                Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) );
+                u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) );
+                Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N );
+                /* overwrite v with P_m+1 * v */
+                /* only component j+1 is computed; everything after it is set to zero */
+                v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1];
+                Vector_MakeZero( v + (j+2), N - (j+2) );
+                // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
+            }
+            /* prev Givens rots on the upper-Hessenberg matrix to make it U */
+            for( i = 0; i < j; i++ ) {
+                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1];
+                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1];
+                v[i]   = tmp1;
+                v[i+1] = tmp2;
+            }
+            /* apply the new Givens rotation to H and right-hand side */
+            /* skipped when v[j+1] is already below ALMOST_ZERO; w[j+1] then
+               keeps the zero written by Vector_MakeZero, which ends the
+               inner loop for any non-negative tol */
+            if( fabs(v[j+1]) >= ALMOST_ZERO )    {
+                cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) );
+                workspace->hc[j] = v[j] / cc;
+                workspace->hs[j] = v[j+1] / cc;
+                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1];
+                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1];
+                v[j]   = tmp1;
+                v[j+1] = tmp2;
+                /* Givens rotations to rhs */
+                tmp1 =  workspace->hc[j] * w[j];
+                tmp2 = -workspace->hs[j] * w[j];
+                w[j]   = tmp1;
+                w[j+1] = tmp2;
+            }
+            /* extend R */
+            for( i = 0; i <= j; ++i )
+                workspace->h[ index_wkspace_res (i,j) ] = v[i];
+            // fprintf( stderr, "h:" );
+            // for( i = 0; i <= j+1 ; ++i )
+            // fprintf( stderr, "%.6f ", h[i][j] );
+            // fprintf( stderr, "\n" );
+            // fprintf( stderr, "%12.6f\n", w[j+1] );
+        }
+        /* solve Hy = w.
+           H is now upper-triangular, do back-substitution */
+        /* j is the number of inner steps actually taken in this cycle */
+        for( i = j-1; i >= 0; i-- ) {
+            temp = w[i];      
+            for( k = j-1; k > i; k-- )
+                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
+        }
+        // fprintf( stderr, "y: " );
+        // for( i = 0; i < RESTART+1; ++i )
+        //   fprintf( stderr, "%8.3f ", workspace->y[i] );
+        /* update x = x_0 + Vy */
+        // memset( z, 0, sizeof(real) * N );
+        // for( i = j-1; i >= 0; i-- )
+        //   {
+        //     Vector_Copy( v, z, N );
+        //     v[i] += workspace->y[i];
+        //    
+        //     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
+        //   }      
+        //
+        // fprintf( stderr, "\nz: " );
+        // for( k = 0; k < N; ++k )
+        // fprintf( stderr, "%6.2f ", z[k] );
+        // fprintf( stderr, "\nx_bef: " );
+        // for( i = 0; i < N; ++i )
+        //   fprintf( stderr, "%6.2f ", x[i] );
+        // Vector_Add( x, 1, z, N );
+        /* the z[i] built in the inner loop serve as the basis vectors here */
+        for( i = j-1; i >= 0; i-- )
+            Vector_Add( x, workspace->y[i], z[i], N );
+        // fprintf( stderr, "\nx_aft: " );
+        // for( i = 0; i < N; ++i )
+        //   fprintf( stderr, "%6.2f ", x[i] );
+        /* stopping condition */
+        if( fabs( w[j] ) / bnorm <= tol )
+            break;
+    }
+    // Sparse_MatVec( H, x, workspace->b_prm );
+    // for( i = 0; i < N; ++i )
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
+    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+    // for( i = 0; i < N; ++i )
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // workspace->b_prc[i], workspace->b_prm[i], x[i] );
+    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", 
+    //         itr, j, fabs( workspace->g[j] ) / bnorm );
+    /* itr only reaches MAX_ITR when the outer loop ran out without a break */
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        // return -1;
+        return itr * (RESTART+1) + j + 1;
+    }
+    return itr * (RESTART+1) + j + 1;
+int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, 
+        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system ) /*
+ * Restarted GMRES-style solver for H x = b in which the initial residual and
+ * every new basis vector are passed, in place, through Forward_Subs( L, ... )
+ * and Backward_Subs( U, ... ) (presumably an incomplete-LU preconditioner --
+ * confirm against the caller).
+ *
+ * workspace : supplies the scratch arrays b_prm, v, g, h, hc, hs, y and p
+ * H         : system matrix; H->n is the vector length N used below
+ * b         : right-hand side
+ * tol       : threshold compared against fabs(g[j]) / Norm( b, N )
+ * L, U      : factors handed to Forward_Subs / Backward_Subs
+ * x         : starting guess on entry, updated in place after each cycle
+ * fout      : only referenced by commented-out diagnostics
+ * system    : system->N is forwarded to index_wkspace_sys when addressing v
+ *
+ * Returns itr * (RESTART+1) + j + 1. The same value is returned whether or
+ * not the outer loop ran out of iterations; in that case a message is
+ * additionally written to stderr.
+ */
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    N = H->n; /* length passed to every vector operation below */
+    bnorm = Norm( b, N ); /* denominator of the relative stopping test */
+    /* GMRES outer-loop: one pass per restart cycle, at most MAX_ITR passes */
+    for( itr = 0; itr < MAX_ITR; ++itr )
+    {
+        /* calculate r0: b_prm receives Sparse_MatVec( H, x ), v[0] is then built
+           from b and b_prm (presumably b - H*x) and run through L and U in place */
+        Sparse_MatVec( H, x, workspace->b_prm );      
+        Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system->N)], 1., b, -1., workspace->b_prm, N );
+        Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] );
+        Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system->N)], &workspace->v[index_wkspace_sys(0,0,system->N)] );
+        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system->N)], N ); /* g[0] = norm of v[0] */
+        Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system->N)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system->N)], N ); /* v[0] scaled by 1/g[0]; NOTE(review): no guard for g[0] == 0 */
+        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
+        /* GMRES inner-loop: continues while j < RESTART and fabs(g[j]) / bnorm is above tol */
+        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ )
+        {
+            /* matvec: Sparse_MatVec writes into v[j+1] using v[j] as input, then
+               v[j+1] is passed through Forward_Subs( L ) and Backward_Subs( U ) in place */
+            Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system->N)], &workspace->v[index_wkspace_sys (j+1,0,system->N)] );
+            Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
+            Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)] );
+            /* apply modified Gram-Schmidt to orthogonalize the new residual.
+               Only v[MAX(j-1,0)] .. v[j] are projected out; the entries h(i,j)
+               for i < j-1 are simply set to zero first.
+               NOTE(review): this is a truncated orthogonalization -- confirm it is intended */
+            for( i = 0; i < j-1; i++ )
+            {
+                workspace->h[ index_wkspace_res (i,j)] = 0;
+            }
+            // full-range loop header, currently disabled: for( i = 0; i <= j; i++ ) {
+            for( i = MAX(j-1,0); i <= j; i++ ) {
+                workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system->N)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system->N)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+            }
+            workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system->N)], N ); /* h(j+1,j) = norm of the remaining vector */
+            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system->N)], 
+                    1. / workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system->N)], N );
+            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+            /* Givens rotations on the upper-Hessenberg matrix to make it U.
+               Column j is rotated with the stored (hc[i], hs[i]) for i = MAX(j-1,0) .. j;
+               the pair for i == j is computed here from h(j,j) and h(j+1,j) */
+            for( i = MAX(j-1,0); i <= j; i++ )
+            {
+                if( i == j )
+                {
+                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
+                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
+                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
+                }
+                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
+                    workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ];
+                tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + 
+                    workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ];
+                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
+                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
+            } 
+            /* apply Givens rotations to the rhs as well: g[j] and g[j+1] are both
+               derived from the old g[j] with the newest (hc[j], hs[j]) pair */
+            tmp1 =  workspace->hc[j] * workspace->g[j];
+            tmp2 = -workspace->hs[j] * workspace->g[j];
+            workspace->g[j] = tmp1;
+            workspace->g[j+1] = tmp2;
+            //fprintf( stderr, "h: " );
+            //for( i = 0; i <= j+1; ++i )
+            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
+            //fprintf( stderr, "\n" );
+            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
+        }
+        /* solve Hy = g: H is now upper-triangular, do back-substitution
+           over the j columns produced by the inner loop */
+        for( i = j-1; i >= 0; i-- )
+        {
+            temp = workspace->g[i];      
+            for( k = j-1; k > i; k-- )
+            {
+                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+            }
+            workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
+        }
+        /* update x = x_0 + Vy: p accumulates y[i] * v[i] for i < j and is then added to x */
+        Vector_MakeZero( workspace->p, N );
+        for( i = 0; i < j; i++ )
+            Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system->N)], N );
+        //Backward_Subs( U, workspace->p, workspace->p );
+        //Forward_Subs( L, workspace->p, workspace->p );
+        Vector_Add( x, 1., workspace->p, N );
+        /* stopping condition: leave the restart loop once fabs(g[j]) / bnorm <= tol */
+        if( fabs(workspace->g[j]) / bnorm <= tol )
+        {
+            break;
+        }
+    }
+    // Sparse_MatVec( H, x, workspace->b_prm );
+    // for( i = 0; i < N; ++i )
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+    // for( i = 0; i < N; ++i )
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
+    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+    //          itr, j, fabs( workspace->g[j] ) / bnorm );
+    // data->timing.matvec += itr * RESTART + j;
+    if( itr >= MAX_ITR ) { /* outer loop ended without hitting the break above */
+        fprintf( stderr, "GMRES convergence failed\n" );
+        // return -1;  (disabled: failure is only reported on stderr)
+        return itr * (RESTART+1) + j + 1;
+    }
+    return itr * (RESTART+1) + j + 1; /* same expression as the failure path */
+/* Preconditioned conjugate-gradient iteration for A x = b.
+ *
+ * workspace    : supplies the work vectors q, r, d and p
+ * A            : system matrix; A->n is the vector length used throughout
+ * b            : right-hand side
+ * tol          : the loop runs while Norm( r ) / Norm( b ) exceeds this value
+ * L, U         : factors handed to Forward_Subs / Backward_Subs (presumably
+ *                an incomplete factorization acting as preconditioner)
+ * x            : starting guess on entry, updated in place
+ * fout, system : not referenced by this routine
+ *
+ * Returns the number of iterations performed (at most 200). Reaching the cap
+ * additionally writes a message to stderr. */
+int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
+        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
+    int  iter, len;
+    real curvature, step, ratio, rhs_norm, res_norm;
+    real rz_prev, rz_cur;
+    len = A->n;
+    rhs_norm = Norm( b, len );
+    /* initial residual: r is built from b and q = A x */
+    Sparse_MatVec( A, x, workspace->q );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, len );
+    res_norm = Norm( workspace->r, len );
+    /* first search direction: r pushed through L (into d) and U (into p) */
+    Forward_Subs( L, workspace->r, workspace->d );
+    Backward_Subs( U, workspace->d, workspace->p );
+    rz_cur = Dot( workspace->r, workspace->p, len );
+    iter = 0;
+    while( iter < 200 && res_norm / rhs_norm > tol )
+    {
+        /* step length along p */
+        Sparse_MatVec( A, workspace->p, workspace->q );
+        curvature = Dot( workspace->q, workspace->p, len );
+        step = rz_cur / curvature;
+        /* advance the iterate and the residual */
+        Vector_Add( x, step, workspace->p, len );
+        Vector_Add( workspace->r, -step, workspace->q, len );
+        res_norm = Norm( workspace->r, len );
+        /* residual through L and U again; d holds the result */
+        Forward_Subs( L, workspace->r, workspace->d );
+        Backward_Subs( U, workspace->d, workspace->d );
+        /* new direction: p = d + ratio * p */
+        rz_prev = rz_cur;
+        rz_cur = Dot( workspace->r, workspace->d, len );
+        ratio = rz_cur / rz_prev;
+        Vector_Sum( workspace->p, 1., workspace->d, ratio, workspace->p, len );
+        ++iter;
+    }
+    if( iter >= 200 )
+        fprintf( stderr, "CG convergence failed!\n" );
+    return iter;
+/* Conjugate-gradient iteration for H x = b in which the residual is scaled
+ * element-wise by workspace->Hdia_inv (presumably the inverse diagonal of H).
+ *
+ * workspace    : supplies q, r, d, p and Hdia_inv
+ * H            : system matrix; H->n is the vector length used throughout
+ * b            : right-hand side
+ * tol          : the loop runs while SQRT( r . scaled r ) / Norm( b ) exceeds it
+ * x            : starting guess on entry, updated in place
+ * fout, system : not referenced by this routine
+ *
+ * Always reports the iteration count on stderr. Returns that count (at most
+ * 300); reaching the cap writes an extra failure message to stderr. */
+int CG( static_storage *workspace, sparse_matrix *H, 
+        real *b, real tol, real *x, FILE *fout, reax_system *system)
+    int  iter, k, len;
+    real curvature, step, ratio, rhs_norm;
+    real rz_prev, rz_cur;
+    len = H->n;
+    rhs_norm = Norm( b, len );
+    /* initial residual: r is built from b and q = H x */
+    Sparse_MatVec( H, x, workspace->q );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, len );
+    /* first direction d: residual scaled entry by entry */
+    k = 0;
+    while( k < len )
+    {
+        workspace->d[k] = workspace->r[k] * workspace->Hdia_inv[k];
+        ++k;
+    }
+    rz_cur = Dot( workspace->r, workspace->d, len );
+    iter = 0;
+    while( iter < 300 && SQRT(rz_cur) / rhs_norm > tol )
+    {
+        /* step length along d */
+        Sparse_MatVec( H, workspace->d, workspace->q );
+        curvature = Dot( workspace->d, workspace->q, len );
+        step = rz_cur / curvature;
+        /* advance the iterate and the residual */
+        Vector_Add( x, step, workspace->d, len );
+        Vector_Add( workspace->r, -step, workspace->q, len );
+        /* p = scaled residual */
+        for( k = 0; k < len; k++ )
+        {
+            workspace->p[k] = workspace->r[k] * workspace->Hdia_inv[k];
+        }
+        /* new direction: d = p + ratio * d */
+        rz_prev = rz_cur;
+        rz_cur = Dot( workspace->r, workspace->p, len );
+        ratio = rz_cur / rz_prev;
+        Vector_Sum( workspace->d, 1., workspace->p, ratio, workspace->d, len );
+        ++iter;
+    }
+    fprintf( stderr, "CG took %d iterations\n", iter );
+    if( iter >= 300 )
+        fprintf( stderr, "CG convergence failed!\n" );
+    return iter;
+/* Steepest Descent iteration for H x = b. The search direction is always the
+ * current residual scaled element-wise by workspace->Hdia_inv.
+ *
+ * workspace : supplies q, r, d and Hdia_inv
+ * H         : system matrix; H->n is the vector length used throughout
+ * b         : right-hand side
+ * tol       : the loop runs while SQRT( sig ) / Norm( b ) exceeds this value,
+ *             where sig is the r . d product evaluated at the top of the
+ *             previous pass (or before the loop on the first test)
+ * x         : starting guess on entry, updated in place
+ * fout      : not referenced by this routine
+ *
+ * Always reports the iteration count on stderr. Returns that count (at most
+ * 300); reaching the cap writes an extra failure message to stderr. */
+int SDM( static_storage *workspace, sparse_matrix *H, 
+        real *b, real tol, real *x, FILE *fout )
+    int  iter, k, len;
+    real curvature, step, rhs_norm;
+    real rd;
+    len = H->n;
+    rhs_norm = Norm( b, len );
+    /* initial residual: r is built from b and q = H x */
+    Sparse_MatVec( H, x, workspace->q );
+    Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, len );
+    k = 0;
+    while( k < len )
+    {
+        workspace->d[k] = workspace->r[k] * workspace->Hdia_inv[k];
+        ++k;
+    }
+    rd = Dot( workspace->r, workspace->d, len );
+    iter = 0;
+    while( iter < 300 && SQRT(rd) / rhs_norm > tol )
+    {
+        /* step length along d */
+        Sparse_MatVec( H, workspace->d, workspace->q );
+        rd = Dot( workspace->r, workspace->d, len );
+        curvature = Dot( workspace->d, workspace->q, len );
+        step = rd / curvature;
+        /* advance the iterate and the residual */
+        Vector_Add( x, step, workspace->d, len );
+        Vector_Add( workspace->r, -step, workspace->q, len );
+        /* next direction: freshly scaled residual */
+        for( k = 0; k < len; k++ )
+        {
+            workspace->d[k] = workspace->r[k] * workspace->Hdia_inv[k];
+        }
+        ++iter;
+    }
+    fprintf( stderr, "SDM took %d iterations\n", iter );
+    if( iter >= 300 )
+        fprintf( stderr, "SDM convergence failed!\n" );
+    return iter;
diff --git a/PuReMD-GPU/src/GMRES.h b/PuReMD-GPU/src/lin_alg.h
similarity index 89%
rename from PuReMD-GPU/src/GMRES.h
rename to PuReMD-GPU/src/lin_alg.h
index 5f9dc46bcd853ec1b2036c995b9bfc0675fd635c..a515a959494a6eca40fe9f338d2a08118ff3e39a 100644
--- a/PuReMD-GPU/src/GMRES.h
+++ b/PuReMD-GPU/src/lin_alg.h
@@ -18,19 +18,17 @@
-#ifndef __GMRES_H_
-#define __GMRES_H_
+#ifndef __LIN_ALG_H_
+#define __LIN_ALG_H_
 #define SIGN(x) (x < 0.0 ? -1 : 1);
 #include "mytypes.h"
 int GMRES( static_storage*, sparse_matrix*,
            real*, real, real*, FILE* , reax_system* );
-int Cuda_GMRES( static_storage *, real *b, real tol, real *x );
-int Cublas_GMRES( reax_system *, static_storage *, real *b, real tol, real *x );
 int GMRES_HouseHolder( static_storage*, sparse_matrix*,
                        real*, real, real*, FILE* , reax_system*  );
@@ -46,4 +44,5 @@ int CG( static_storage*, sparse_matrix*,
 int uyduruk_GMRES( static_storage*, sparse_matrix*,
                    real*, real, real*, int, FILE*, reax_system* );
diff --git a/PuReMD-GPU/src/list.c b/PuReMD-GPU/src/list.c
new file mode 100644
index 0000000000000000000000000000000000000000..c6f0e55ebad4fc59c07f253a1d216d3242115aff
--- /dev/null
+++ b/PuReMD-GPU/src/list.c
@@ -0,0 +1,146 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "list.h"
+/* Allocate the host storage of interaction list l.
+ *
+ * n         : number of entries in the index / end_index arrays
+ * num_intrs : number of interaction records to reserve
+ * type      : one of the TYP_* tags; selects which member of l->select is
+ *             allocated and the record size used. Unrecognised tags are
+ *             stored as TYP_VOID.
+ * l         : list to fill in; n, num_intrs, type and all pointers are
+ *             overwritten
+ *
+ * Returns 1 when every allocation succeeded and 0 when at least one malloc
+ * returned NULL. Nothing is released on failure, so the caller is still
+ * expected to clean up (e.g. via Delete_List). */
+char Make_List(int n, int num_intrs, int type, list* l)
+    char success=1;
+    l->n = n;
+    l->num_intrs = num_intrs;
+    l->index = (int*) malloc( n * sizeof(int) );
+    l->end_index = (int*) malloc( n * sizeof(int) );
+    if (l->index == NULL) success = 0;
+    if (l->end_index == NULL) success = 0;
+    l->type = type;
+    switch(type)
+    {
+        case TYP_THREE_BODY:
+            l->select.three_body_list = (three_body_interaction_data*) 
+                malloc(l->num_intrs*sizeof(three_body_interaction_data));
+            if (l->select.three_body_list == NULL) success = 0;
+            break;
+        case TYP_BOND:
+            l->select.bond_list = (bond_data*) 
+                malloc(l->num_intrs * sizeof(bond_data));
+            if (l->select.bond_list == NULL) success = 0;
+            break;
+        case TYP_DBO:
+            l->select.dbo_list = (dbond_data*) 
+                malloc(l->num_intrs * sizeof(dbond_data));
+            if (l->select.dbo_list == NULL) success = 0;
+            break;
+        case TYP_DDELTA:
+            l->select.dDelta_list = (dDelta_data*) 
+                malloc(l->num_intrs*sizeof(dDelta_data));
+            if (l->select.dDelta_list == NULL) success = 0;
+            break;
+        case TYP_FAR_NEIGHBOR:
+            l->select.far_nbr_list = (far_neighbor_data*) 
+                malloc(l->num_intrs*sizeof(far_neighbor_data));
+            if (l->select.far_nbr_list == NULL) success = 0;
+            break;
+        case TYP_NEAR_NEIGHBOR:
+            l->select.near_nbr_list = (near_neighbor_data*) 
+                malloc(l->num_intrs*sizeof(near_neighbor_data));
+            if (l->select.near_nbr_list == NULL) success = 0;
+            break;
+        case TYP_HBOND:
+            l->select.hbond_list = (hbond_data*)
+                malloc( l->num_intrs * sizeof(hbond_data) );
+            if (l->select.hbond_list == NULL) success = 0;
+            break;            
+        case TYP_VOID:
+        default:
+            /* Untyped lists get one byte per record. sizeof(void) is not
+               valid ISO C (it only evaluates to 1 as a GNU extension), so
+               the byte size is spelled with sizeof(char) instead. */
+            l->select.v = (void *) malloc(l->num_intrs*sizeof(char));
+            if (l->select.v == NULL) success = 0;
+            /* unknown tags are recorded as TYP_VOID so Delete_List frees them */
+            l->type = TYP_VOID;
+            break;      
+    }
+    return success;
+/* Release the host buffers owned by list l: the two index arrays and the
+ * member of l->select that matches l->type. Pointers are not reset after
+ * being freed. An unrecognised l->type leaves l->select untouched.
+ * free() ignores NULL pointers, so no explicit NULL tests are needed. */
+void Delete_List(list* l)
+    free( l->index );
+    free( l->end_index );
+    switch( l->type )
+    {
+        case TYP_VOID:
+            free( l->select.v );
+            break;
+        case TYP_THREE_BODY:
+            free( l->select.three_body_list );
+            break;
+        case TYP_BOND:
+            free( l->select.bond_list );
+            break;
+        case TYP_DBO:
+            free( l->select.dbo_list );
+            break;
+        case TYP_DDELTA:
+            free( l->select.dDelta_list );
+            break;
+        case TYP_FAR_NEIGHBOR:
+            free( l->select.far_nbr_list );
+            break;
+        case TYP_NEAR_NEIGHBOR:
+            free( l->select.near_nbr_list );
+            break;
+        case TYP_HBOND:
+            free( l->select.hbond_list );
+            break;
+        default:
+            /* unknown tag: nothing is reported or released here */
+            break;
+    }
diff --git a/PuReMD-GPU/src/list.cu b/PuReMD-GPU/src/list.cu
deleted file mode 100644
index 095409aa4fdc96102a5e321d1511737e37371eef..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/list.cu
+++ /dev/null
@@ -1,235 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "list.h"
-#include "cuda_utils.h"
-HOST char Make_List(int n, int num_intrs, int type, list* l, int proc)
-    char success=1;
-    if (proc == TYP_HOST) {
-        l->n = n;
-        l->num_intrs = num_intrs;
-        l->index = (int*) malloc( n * sizeof(int) );
-        l->end_index = (int*) malloc( n * sizeof(int) );
-        if (l->index == NULL) success = 0;
-        if (l->end_index == NULL) success = 0;
-        l->type = type;
-        switch(type)
-        {
-            case TYP_VOID:
-                l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-                if (l->select.v == NULL) success = 0;
-                break;
-            case TYP_THREE_BODY:
-                l->select.three_body_list = (three_body_interaction_data*) 
-                    malloc(l->num_intrs*sizeof(three_body_interaction_data));
-                if (l->select.three_body_list == NULL) success = 0;
-                break;
-            case TYP_BOND:
-                l->select.bond_list = (bond_data*) 
-                    malloc(l->num_intrs * sizeof(bond_data));
-                if (l->select.bond_list == NULL) success = 0;
-                break;
-            case TYP_DBO:
-                l->select.dbo_list = (dbond_data*) 
-                    malloc(l->num_intrs * sizeof(dbond_data));
-                if (l->select.dbo_list == NULL) success = 0;
-                break;
-            case TYP_DDELTA:
-                l->select.dDelta_list = (dDelta_data*) 
-                    malloc(l->num_intrs*sizeof(dDelta_data));
-                if (l->select.dDelta_list == NULL) success = 0;
-                break;
-            case TYP_FAR_NEIGHBOR:
-                l->select.far_nbr_list = (far_neighbor_data*) 
-                    malloc(l->num_intrs*sizeof(far_neighbor_data));
-                if (l->select.far_nbr_list == NULL) success = 0;
-                break;
-            case TYP_NEAR_NEIGHBOR:
-                l->select.near_nbr_list = (near_neighbor_data*) 
-                    malloc(l->num_intrs*sizeof(near_neighbor_data));
-                if (l->select.near_nbr_list == NULL) success = 0;
-                break;
-            case TYP_HBOND:
-                l->select.hbond_list = (hbond_data*)
-                    malloc( l->num_intrs * sizeof(hbond_data) );
-                if (l->select.hbond_list == NULL) success = 0;
-                break;            
-            default:
-                l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-                if (l->select.v == NULL) success = 0;
-                l->type = TYP_VOID;
-                break;      
-        }
-    }
-    else 
-    {
-        l->n = n;
-        l->num_intrs = num_intrs;
-        cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX );
-        cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX );
-        switch(type)
-        {
-            case TYP_FAR_NEIGHBOR:
-                cuda_malloc ((void **) &l->select.far_nbr_list, 
-                        l->num_intrs*sizeof(far_neighbor_data), 
-                        1, LIST_FAR_NEIGHBOR_DATA);
-                /*
-                   cudaHostAlloc ((void **) &l->select.far_nbr_list, 
-                   l->num_intrs*sizeof(far_neighbor_data),
-                   cudaHostAllocMapped);
-                   cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, 
-                   (void *)l->select.far_nbr_list, 0);
-                 */
-                break;
-            case TYP_HBOND:
-                cuda_malloc ((void **) &l->select.hbond_list,
-                        l->num_intrs * sizeof(hbond_data),
-                        1, LIST_HBOND_DATA );
-                break;            
-            case TYP_BOND:
-                cuda_malloc ((void **) &l->select.bond_list,
-                        l->num_intrs * sizeof(bond_data),
-                        1, LIST_BOND_DATA );
-                break;            
-            case TYP_THREE_BODY:
-                cuda_malloc ( (void **) &l->select.three_body_list, 
-                        l->num_intrs * sizeof(three_body_interaction_data), 
-                        1, LIST_THREE_BODY_DATA );
-                break;
-            default: 
-                fprintf (stderr, "Unknown list creation \n" );
-                exit (1);
-        }
-    }
-    return success;
-HOST void Delete_List(list* l, int type)
-    if (type == TYP_HOST )
-    {
-        if( l->index != NULL )
-            free(l->index);
-        if( l->end_index != NULL )
-            free(l->end_index);
-        switch(l->type)
-        {
-            case TYP_VOID:
-                if( l->select.v != NULL )
-                    free(l->select.v);
-                break;
-            case TYP_THREE_BODY:
-                if( l->select.three_body_list != NULL )
-                    free(l->select.three_body_list);
-                break;
-            case TYP_BOND:
-                if( l->select.bond_list != NULL )
-                    free(l->select.bond_list);
-                break;
-            case TYP_DBO:
-                if( l->select.dbo_list != NULL )
-                    free(l->select.dbo_list);
-                break;
-            case TYP_DDELTA:
-                if( l->select.dDelta_list != NULL )
-                    free(l->select.dDelta_list);
-                break;
-            case TYP_FAR_NEIGHBOR:
-                if( l->select.far_nbr_list != NULL )
-                    free(l->select.far_nbr_list);
-                break;
-            case TYP_NEAR_NEIGHBOR:
-                if( l->select.near_nbr_list != NULL )
-                    free(l->select.near_nbr_list);
-                break;
-            case TYP_HBOND:
-                if( l->select.hbond_list != NULL )
-                    free(l->select.hbond_list);
-                break;
-            default:
-                // Report fatal error
-                break;
-        }
-    }
-    else
-    {
-        if (l->index != NULL)
-            cuda_free (l->index, LIST_INDEX );    
-        if (l->end_index != NULL)
-            cuda_free (l->end_index, LIST_END_INDEX );
-        switch(type)
-        {
-            case TYP_FAR_NEIGHBOR:
-                if (l->select.far_nbr_list != NULL)
-                    cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA);
-                break;
-            case TYP_HBOND:
-                if (l->select.hbond_list != NULL)
-                    cuda_free (l->select.hbond_list, LIST_HBOND_DATA );
-                break;            
-            case TYP_BOND:
-                if (l->select.bond_list != NULL)
-                    cuda_free (l->select.bond_list, LIST_BOND_DATA );
-                break;            
-            case TYP_THREE_BODY:
-                if (l->select.three_body_list != NULL) 
-                    cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA );
-                break;
-            default: 
-                fprintf (stderr, "Unknown list deletion \n" );
-                exit (1);
-        }
-    }
diff --git a/PuReMD-GPU/src/list.h b/PuReMD-GPU/src/list.h
index f341c2e270912e57beee4867bca9927a5c905633..b90c41419271ca6b859be08ea4005fbe9107c029 100644
--- a/PuReMD-GPU/src/list.h
+++ b/PuReMD-GPU/src/list.h
@@ -23,31 +23,36 @@
 #include "mytypes.h"
-HOST char Make_List( int, int, int, list* , int proc = TYP_HOST);
-HOST void Delete_List( list* , int proc = TYP_HOST);
+char Make_List( int, int, int, list* );
+void Delete_List( list* );
-inline HOST_DEVICE int Num_Entries(int i, list* l)
+static inline HOST_DEVICE int Num_Entries(int i, list* l)
     return l->end_index[i] - l->index[i];
-inline HOST_DEVICE int Start_Index(int i, list *l )
+static inline HOST_DEVICE int Start_Index(int i, list *l )
     return l->index[i];
-inline HOST_DEVICE int End_Index( int i, list *l )
+static inline HOST_DEVICE int End_Index( int i, list *l )
     return l->end_index[i];
-inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l)
+static inline HOST_DEVICE void Set_Start_Index(int i, int val, list *l)
     l->index[i] = val;
-inline HOST_DEVICE void Set_End_Index(int i, int val, list *l)
+static inline HOST_DEVICE void Set_End_Index(int i, int val, list *l)
     l->end_index[i] = val;
diff --git a/PuReMD-GPU/src/lookup.c b/PuReMD-GPU/src/lookup.c
new file mode 100644
index 0000000000000000000000000000000000000000..c439709dc09c77775ed716a39db797fa8c831585
--- /dev/null
+++ b/PuReMD-GPU/src/lookup.c
@@ -0,0 +1,406 @@
+/*----------------------------------------------------------------------
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "lookup.h"
+#include "two_body_interactions.h"
+#include "index_utils.h"
+/* Samples f at n equally spaced abscissae, starting at xmin with spacing
+ * (xmax - xmin) / (n - 1), and records the grid description together with
+ * the sampled values in t.  t->y is allocated here with malloc and holds
+ * n entries. */
+void Make_Lookup_Table(real xmin, real xmax, int n,
+        lookup_function f, lookup_table* t)
+{
+    int k;
+    real *samples;
+
+    /* grid description */
+    t->xmin = xmin;
+    t->xmax = xmax;
+    t->n = n;
+    t->dx = (xmax - xmin) / (n - 1);
+    t->inv_dx = 1.0 / t->dx;
+    t->a = (n - 1) / (xmax - xmin);
+
+    /* tabulated function values */
+    samples = (real*) malloc( n * sizeof(real) );
+    t->y = samples;
+
+    for ( k = 0; k < n; ++k )
+    {
+        samples[k] = f( k * t->dx + t->xmin );
+    }
+}
+/* Solves the n x n tridiagonal system whose sub-, main- and super-diagonal
+ * are a, b and c and whose right-hand side is d; the solution is written
+ * to x.  a and b are only read.  Warning: c and d are overwritten with the
+ * eliminated coefficients.
+ *
+ * No pivoting is performed: a zero b[0] or a zero pivot
+ * (b[i] - c[i-1] * a[i]) results in a division by zero, which would imply
+ * a singular matrix.
+ *
+ * For n == 0 the function returns without touching any array. */
+void Tridiagonal_Solve( const real *a, const real *b,
+        real *c, real *d, real *x, unsigned int n )
+{
+    int i;
+    /* signed copy so the index arithmetic below cannot wrap around */
+    const int rows = (int) n;
+    real id;
+
+    if ( rows <= 0 )
+    {
+        return;
+    }
+
+    /* forward sweep: eliminate the sub-diagonal */
+    c[0] /= b[0];
+    d[0] /= b[0];
+    for ( i = 1; i < rows; i++ )
+    {
+        id = (b[i] - c[i - 1] * a[i]);
+        c[i] /= id;    /* the value computed for the last row is never used */
+        d[i] = (d[i] - d[i - 1] * a[i]) / id;
+    }
+
+    /* back substitution */
+    x[rows - 1] = d[rows - 1];
+    for ( i = rows - 2; i >= 0; i-- )
+    {
+        x[i] = d[i] - c[i] * x[i + 1];
+    }
+}
+/* Computes the coefficients of a cubic spline through the n samples f[0..n-1]
+ * with the second-derivative values at both ends fixed to zero.
+ *
+ * h    : interval widths; entries h[0..n-2] are read
+ * f    : sampled values; entries f[0..n-1] are read
+ * coef : output; entries coef[0..n-2] are written, where coef[k].a is set to
+ *        f[k+1] and b, c, d are derived from the solved second derivatives
+ * n    : number of samples; nothing is written when n < 2
+ *
+ * The work arrays for the linear system are allocated and released inside
+ * this function. */
+void Natural_Cubic_Spline( const real *h, const real *f, 
+        cubic_spline_coef *coef, unsigned int n )
+{
+    int i;
+    /* signed index of the last sample; avoids mixed signed/unsigned compares */
+    const int last = (int) n - 1;
+    real *a, *b, *c, *d, *v;
+
+    /* fewer than two samples define no interval */
+    if ( n < 2 )
+    {
+        return;
+    }
+
+    /* allocate space for the linear system */
+    a = (real*) malloc( n * sizeof(real) );
+    b = (real*) malloc( n * sizeof(real) );
+    c = (real*) malloc( n * sizeof(real) );
+    d = (real*) malloc( n * sizeof(real) );
+    v = (real*) malloc( n * sizeof(real) );
+
+    /* build the linear system */
+    a[0] = a[1] = a[last] = 0;
+    for ( i = 2; i < last; ++i )
+    {
+        a[i] = h[i - 1];
+    }
+
+    b[0] = b[last] = 0;
+    for ( i = 1; i < last; ++i )
+    {
+        b[i] = 2 * (h[i - 1] + h[i]);
+    }
+
+    c[0] = c[last - 1] = c[last] = 0;
+    for ( i = 1; i < last - 1; ++i )
+    {
+        c[i] = h[i];
+    }
+
+    d[0] = d[last] = 0;
+    for ( i = 1; i < last; ++i )
+    {
+        d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]);
+    }
+
+    /* end values are fixed to zero; only rows 1..n-2 are solved for */
+    v[0] = 0;
+    v[last] = 0;
+    if ( n > 2 )
+    {
+        Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n - 2 );
+    }
+
+    for ( i = 1; i <= last; ++i )
+    {
+        coef[i - 1].d = (v[i] - v[i - 1]) / (6 * h[i - 1]);
+        coef[i - 1].c = v[i] / 2;
+        coef[i - 1].b = (f[i] - f[i - 1]) / h[i - 1] + h[i - 1] * (2 * v[i] + v[i - 1]) / 6;
+        coef[i - 1].a = f[i];
+    }
+
+    /* release the work arrays (previously leaked on every call) */
+    free( a );
+    free( b );
+    free( c );
+    free( d );
+    free( v );
+}
+/* Computes the coefficients of a cubic spline through the n samples f[0..n-1]
+ * with prescribed first derivatives v0 at the first sample and vlast at the
+ * last sample.
+ *
+ * h     : interval widths; entries h[0..n-1] are read
+ * f     : sampled values; entries f[0..n-1] are read
+ * v0    : first derivative imposed at f[0]
+ * vlast : first derivative imposed at f[n-1]
+ * coef  : output; entries coef[0..n-2] are written, where coef[k].a is set
+ *         to f[k+1]
+ * n     : number of samples; nothing is written when n < 2
+ *
+ * The work arrays for the linear system are allocated and released inside
+ * this function. */
+void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
+        cubic_spline_coef *coef, unsigned int n )
+{
+    int i;
+    /* signed index of the last sample; avoids mixed signed/unsigned compares */
+    const int last = (int) n - 1;
+    real *a, *b, *c, *d, *v;
+
+    /* fewer than two samples define no interval */
+    if ( n < 2 )
+    {
+        return;
+    }
+
+    /* allocate space for the linear system */
+    a = (real*) malloc( n * sizeof(real) );
+    b = (real*) malloc( n * sizeof(real) );
+    c = (real*) malloc( n * sizeof(real) );
+    d = (real*) malloc( n * sizeof(real) );
+    v = (real*) malloc( n * sizeof(real) );
+
+    /* build the linear system */
+    a[0] = 0;
+    for ( i = 1; i <= last; ++i )
+    {
+        a[i] = h[i - 1];
+    }
+
+    b[0] = 2 * h[0];
+    /* NOTE(review): the last row uses h[n-1] as well, i.e. it is built like
+     * an interior row -- confirm this is intended for the end condition */
+    for ( i = 1; i <= last; ++i )
+    {
+        b[i] = 2 * (h[i - 1] + h[i]);
+    }
+
+    c[last] = 0;
+    for ( i = 0; i < last; ++i )
+    {
+        c[i] = h[i];
+    }
+
+    d[0] = 6 * (f[1] - f[0]) / h[0] - 6 * v0;
+    /* the slope of the last interval is (f[n-1] - f[n-2]) / h[n-2]; the
+     * division previously applied to f[n-2] only */
+    d[last] = 6 * vlast - 6 * (f[last] - f[last - 1]) / h[last - 1];
+    for ( i = 1; i < last; ++i )
+    {
+        d[i] = 6 * ((f[i + 1] - f[i]) / h[i] - (f[i] - f[i - 1]) / h[i - 1]);
+    }
+
+    Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
+
+    for ( i = 1; i <= last; ++i )
+    {
+        coef[i - 1].d = (v[i] - v[i - 1]) / (6 * h[i - 1]);
+        coef[i - 1].c = v[i] / 2;
+        coef[i - 1].b = (f[i] - f[i - 1]) / h[i - 1] + h[i - 1] * (2 * v[i] + v[i - 1]) / 6;
+        coef[i - 1].a = f[i];
+    }
+
+    /* release the work arrays (previously leaked on every call) */
+    free( a );
+    free( b );
+    free( c );
+    free( d );
+    free( v );
+}
+/* Evaluates the cubic ((d * dif + c) * dif + b) * dif + a for one
+ * coefficient set. */
+static real Eval_LR_Spline( const cubic_spline_coef *p, real dif )
+{
+    return ((p->d * dif + p->c) * dif + p->b) * dif + p->a;
+}
+
+
+/* Evaluates the tabulated long-range quantities of table t at distance r
+ * and stores them in y.  The bin is the truncated value of r * t->inv_dx,
+ * with bin 0 redirected to bin 1, and each cubic is evaluated at the offset
+ * of r from (bin + 1) * t->dx.  y->H is derived from y->e_ele rather than
+ * from a spline of its own. */
+void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
+{
+    int bin;
+    real offset;
+
+    bin = (int)(r * t->inv_dx);
+    if ( bin == 0 )
+    {
+        bin = 1;
+    }
+
+    /* distance of r from the upper end of the selected bin */
+    offset = r - (real)(bin + 1) * t->dx;
+
+    y->e_vdW = Eval_LR_Spline( &(t->vdW[bin]), offset );
+    y->CEvd = Eval_LR_Spline( &(t->CEvd[bin]), offset );
+    y->e_ele = Eval_LR_Spline( &(t->ele[bin]), offset );
+    y->CEclmb = Eval_LR_Spline( &(t->CEclmb[bin]), offset );
+
+    y->H = y->e_ele * EV_to_KCALpMOL / C_ele;
+}
+/* Builds the global long-range lookup tables LR.
+ *
+ * LR is allocated as a flat array of num_atom_types * num_atom_types tables
+ * addressed through index_lr(i, j, num_atom_types).  Only the entries with
+ * j >= i whose two atom types both occur in system->atoms are filled in.
+ * For each such pair the interactions are sampled with LR_vdW_Coulomb at
+ * r * dr for r = 1..control->tabulate (dr = control->r_cut /
+ * control->tabulate) and cubic splines are fitted for H, vdW, CEvd, ele and
+ * CEclmb; the coefficients are stored starting at index 1 of each array.
+ *
+ * The spline routines are called on the scratch arrays offset by one element
+ * and with control->tabulate + 1 samples, so they read scratch index
+ * control->tabulate + 1.  The scratch arrays therefore hold
+ * control->tabulate + 2 entries, and that extra sample is padded with the
+ * step width and the last sampled value (it used to be read past the end of
+ * the allocation). */
+void Make_LR_Lookup_Table( reax_system *system, control_params *control )
+{
+    int i, j, r;
+    int num_atom_types;
+    int existing_types[MAX_ATOM_TYPES];
+    real dr;
+    real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
+    real v0_vdw, v0_ele, vlast_vdw, vlast_ele;
+    LR_lookup_table *t;
+    /* real rand_dist;
+       real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr;
+       real eele_abserr, eele_relerr, fele_abserr, fele_relerr;
+       real evdw_maxerr, eele_maxerr;
+       LR_data y, y_spline; */
+
+    /* initializations */
+    vlast_ele = 0;
+    vlast_vdw = 0;
+    v0_ele = 0;
+    v0_vdw = 0;
+
+    num_atom_types = system->reaxprm.num_atom_types;
+    dr = control->r_cut / control->tabulate;
+
+    /* scratch arrays: index 0 is unused, 1..tabulate hold the samples and
+       tabulate + 1 holds the padding sample read by the spline routines */
+    h = (real*) malloc( (control->tabulate + 2) * sizeof(real) );
+    fh = (real*) malloc( (control->tabulate + 2) * sizeof(real) );
+    fvdw = (real*) malloc( (control->tabulate + 2) * sizeof(real) );
+    fCEvd = (real*) malloc( (control->tabulate + 2) * sizeof(real) );
+    fele = (real*) malloc( (control->tabulate + 2) * sizeof(real) );
+    fCEclmb = (real*) malloc( (control->tabulate + 2) * sizeof(real) );
+
+    /* allocate Long-Range LookUp Table space based on 
+       number of atom types in the ffield file */
+    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
+
+    /* most atom types in ffield file will not exist in the current
+       simulation. to avoid unnecessary lookup table space, determine
+       the atom types that exist in the current simulation */
+    for ( i = 0; i < MAX_ATOM_TYPES; ++i )
+    {
+        existing_types[i] = 0;
+    }
+    for ( i = 0; i < system->N; ++i )
+    {
+        existing_types[ system->atoms[i].type ] = 1;
+    }
+
+    /* fill in the lookup table entries for existing atom types.
+       only the pairs with j >= i are filled. */
+    for ( i = 0; i < num_atom_types; ++i )
+    {
+        if ( !existing_types[i] )
+        {
+            continue;
+        }
+
+        for ( j = i; j < num_atom_types; ++j )
+        {
+            if ( !existing_types[j] )
+            {
+                continue;
+            }
+
+            t = &(LR[ index_lr (i,j,num_atom_types) ]);
+
+            t->xmin = 0;
+            t->xmax = control->r_cut;
+            t->n = control->tabulate + 1;
+            t->dx = dr;
+            t->inv_dx = control->tabulate / control->r_cut;
+            t->y = (LR_data*) malloc( t->n * sizeof(LR_data) );
+            t->H = (cubic_spline_coef*) malloc( t->n * sizeof(cubic_spline_coef) );
+            t->vdW = (cubic_spline_coef*) malloc( t->n * sizeof(cubic_spline_coef) );
+            t->CEvd = (cubic_spline_coef*) malloc( t->n * sizeof(cubic_spline_coef) );
+            t->ele = (cubic_spline_coef*) malloc( t->n * sizeof(cubic_spline_coef) );
+            t->CEclmb = (cubic_spline_coef*) malloc( t->n * sizeof(cubic_spline_coef) );
+
+            /* sample the interactions on the grid r * dr */
+            for ( r = 1; r <= control->tabulate; ++r )
+            {
+                LR_vdW_Coulomb( system, control, i, j, r * dr, &(t->y[r]) );
+                h[r] = t->dx;
+                fh[r] = t->y[r].H;
+                fvdw[r] = t->y[r].e_vdW;
+                fCEvd[r] = t->y[r].CEvd;
+                fele[r] = t->y[r].e_ele;
+                fCEclmb[r] = t->y[r].CEclmb;
+
+                /* end slopes handed to the complete splines */
+                if ( r == 1 )
+                {
+                    v0_vdw = t->y[r].CEvd;
+                    v0_ele = t->y[r].CEclmb;
+                }
+                else if ( r == control->tabulate )
+                {
+                    vlast_vdw = t->y[r].CEvd;
+                    vlast_ele = t->y[r].CEclmb;
+                }
+            }
+
+            /* r == control->tabulate + 1 here: pad the extra sample that the
+               spline routines read, repeating the step width and last value */
+            h[r] = t->dx;
+            fh[r] = fh[r - 1];
+            fvdw[r] = fvdw[r - 1];
+            fCEvd[r] = fCEvd[r - 1];
+            fele[r] = fele[r - 1];
+            fCEclmb[r] = fCEclmb[r - 1];
+
+            Natural_Cubic_Spline( &h[1], &fh[1], 
+                    &(t->H[1]), control->tabulate+1 );
+
+            Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, 
+                    &(t->vdW[1]), control->tabulate+1 );
+
+            Natural_Cubic_Spline( &h[1], &fCEvd[1], 
+                    &(t->CEvd[1]), control->tabulate+1 );
+
+            Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, 
+                    &(t->ele[1]), control->tabulate+1 );
+
+            Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
+                    &(t->CEclmb[1]), control->tabulate+1 );
+        }
+    }
+
+    /***** //test LR-Lookup table
+      evdw_maxerr = 0;
+      eele_maxerr = 0;
+      for( i = 0; i < num_atom_types; ++i )
+      if( existing_types[i] )
+      for( j = i; j < num_atom_types; ++j )
+      if( existing_types[j] ) {
+      for( r = 1; r <= 100; ++r ) {
+      rand_dist = (real)rand()/RAND_MAX * control->r_cut;
+      LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
+      LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
+      evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
+      evdw_relerr = fabs(evdw_abserr / y.e_vdW);
+      fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
+      fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
+      eele_abserr = fabs(y.e_ele - y_spline.e_ele);
+      eele_relerr = fabs(eele_abserr / y.e_ele);
+      fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
+      fele_relerr = fabs(fele_abserr / y.CEclmb);
+      if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
+    //fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
+    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+    y.H, y_spline.H, 
+    fabs(y.H-y_spline.H), fabs((y.H-y_spline.H)/y.H) );  
+    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+    y.e_vdW, y_spline.e_vdW, evdw_abserr, evdw_relerr ); 
+    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+    y.CEvd, y_spline.CEvd, fvdw_abserr, fvdw_relerr ); 
+    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+    y.e_ele, y_spline.e_ele, eele_abserr, eele_relerr ); 
+    //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
+    y.CEclmb, y_spline.CEclmb, fele_abserr, fele_relerr ); 
+    }
+    if( evdw_relerr > evdw_maxerr )
+    evdw_maxerr = evdw_relerr;
+    if( eele_relerr > eele_maxerr )
+    eele_maxerr = eele_relerr;
+    }
+    }
+    //fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
+    //fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
+         *******/
+
+    /* the scratch arrays are only needed while fitting */
+    free(h);
+    free(fh);
+    free(fvdw);
+    free(fCEvd);
+    free(fele);
+    free(fCEclmb);
+}
+/* Returns t->a * (x - t->xmin) truncated to an int, i.e. the position of x
+ * on the table grid measured from t->xmin.  No range check is applied. */
+int Lookup_Index_Of( real x, lookup_table* t )
+{
+    int idx;
+
+    idx = (int)( t->a * ( x - t->xmin ) );
+
+    return idx;
+}
+/* Linearly interpolates the tabulated function of t at x.
+ *
+ * The interval is selected with Lookup_Index_Of and then clamped to
+ * [0, t->n - 2] so that both t->y[i] and t->y[i+1] are table entries.
+ * Without the clamp, x == t->xmax (or an index rounded up to t->n - 1)
+ * read t->y[t->n], one element past the table.  For x outside
+ * [t->xmin, t->xmax] the first or last segment is extended linearly.
+ *
+ * The table must hold at least two samples. */
+real Lookup( real x, lookup_table* t )
+{
+    real x1, x2;
+    real b;
+    int i;
+
+    i = Lookup_Index_Of( x, t );
+
+    /* keep i + 1 inside the table */
+    if ( i > t->n - 2 )
+    {
+        i = t->n - 2;
+    }
+    if ( i < 0 )
+    {
+        i = 0;
+    }
+
+    /* end points of the selected interval */
+    x1 = i * t->dx + t->xmin;
+    x2 = (i+1) * t->dx + t->xmin;
+
+    /* intercept of the line through (x1, y[i]) and (x2, y[i+1]) */
+    b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx;
+
+    return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b;
+}
diff --git a/PuReMD-GPU/src/lookup.h b/PuReMD-GPU/src/lookup.h
index 9ea972f8eeaa07e307516149658c07bfc68f37f4..7dac3e4f41764caa53cf0e2313f8c566f4e0e634 100644
--- a/PuReMD-GPU/src/lookup.h
+++ b/PuReMD-GPU/src/lookup.h
@@ -23,14 +23,20 @@
 #include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
 void Make_Lookup_Table( real, real, int, lookup_function, lookup_table* );
 int  Lookup_Index_Of( real, lookup_table* );
 real Lookup( real, lookup_table* );
 void Make_LR_Lookup_Table( reax_system*, control_params* );
-//CUDA Functions
-void Cuda_Make_LR_Lookup_Table( reax_system*, control_params* );
-void copy_LR_table_to_device ( reax_system*, control_params* );
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/matvec.cu b/PuReMD-GPU/src/matvec.cu
deleted file mode 100644
index bf08cdf83dc8e14e31b48a6eaac41b5bfa8cf97e..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/matvec.cu
+++ /dev/null
@@ -1,89 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "matvec.h"
-//one thread per row
-GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows)
-    real results_row = 0;
-    int col;
-    real val;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= rows) return;
-    for (int c = H.start[i]; c < H.end[i]; c++)
-    {
-        col = H.entries [c].j;
-        val = H.entries[c].val;
-        results_row += val * vec [col];
-    }
-    results [i] = results_row;
-//32 thread warp per matrix row.
-//invoked as follows
-// <<< system->N, 32 >>>
-GLOBAL void Cuda_Matvec_csr (sparse_matrix H, real *vec, real *results, int num_rows)
-    extern __shared__ real vals [];
-    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-    int warp_id = thread_id / 32;
-    int lane = thread_id & (32 - 1);
-    int row_start;
-    int row_end;
-    // one warp per row
-    //int row = warp_id;
-    int row = warp_id;
-    //if (row < num_rows)
-    {
-        vals[threadIdx.x] = 0;
-        if (row < num_rows) {
-            row_start = H.start[row];
-            row_end = H.end[row];
-            // compute running sum per thread
-            for(int jj = row_start + lane; jj < row_end; jj += 32)
-                vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ];
-            //vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ];
-        }
-        __syncthreads ();
-        // parallel reduction in shared memory
-        //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-        if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads();
-        if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads ();
-        if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads ();
-        if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads ();
-        if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads ();
-        // first thread writes the result
-        if (lane == 0 && row < num_rows)
-            results[row] = vals[threadIdx.x];
-    }
diff --git a/PuReMD-GPU/src/mytypes.h b/PuReMD-GPU/src/mytypes.h
index c7d42ee97e386aae84f4f9d9bc51d2767d866f29..273f95976159c55461a84668c4a5f2494f1d9522 100644
--- a/PuReMD-GPU/src/mytypes.h
+++ b/PuReMD-GPU/src/mytypes.h
@@ -18,8 +18,42 @@
-#ifndef __MYTYPES_H_
-#define __MYTYPES_H_
+#if !(defined(__MYTYPES_H_) || defined(__CUDA_MYTYPES_H_))
+#ifdef __CUDACC__
+  #ifndef __CUDA_MYTYPES_H_
+    #define __CUDA_MYTYPES_H_
+    #define HOST __host__
+    #define DEVICE __device__
+    #define GLOBAL __global__
+    #define HOST_DEVICE __host__ __device__
+    #include <cuda_runtime.h>
+    #include <cuda.h>
+    #include <cuda_runtime_api.h>
+    #include <cublas_v2.h>
+    #include <cusparse_v2.h>
+    #if __CUDA_ARCH__ < 600
+      #define MYATOMICADD myAtomicAdd
+    #else
+      #define MYATOMICADD atomicAdd
+    #endif
+  #endif
+  #ifndef __MYTYPES_H_
+    #define __MYTYPES_H_
+    #define HOST
+    #define DEVICE
+    #define GLOBAL
+    #define HOST_DEVICE
+  #endif
+#if (defined(HAVE_CONFIG_H) && !defined(__CONFIG_H_))
+  #define __CONFIG_H_
+  #include "config.h"
 #include "math.h"
 //#include "random.h"
@@ -30,28 +64,16 @@
 #include "time.h"
 #include "zlib.h"
 //#define DEBUG_FOCUS
 //#define TEST_FORCES
 //#define TEST_ENERGY
 //#define REORDER_ATOMS  // turns on nbrgen opt by re-ordering atoms
 //#define LGJ
-#ifdef __USE_GPU__
-#include "cublas_v2.h"
-#include "cusparse_v2.h"
-#define HOST __host__
-#define DEVICE __device__
-#define GLOBAL __global__
-#define HOST_DEVICE __host__ __device__
-#define HOST
-#define DEVICE
-#define GLOBAL
-#define HOST_DEVICE
+#define SUCCESS  1
+#define FAILURE  0
+#define TRUE  1
+#define FALSE 0
 #define EXP    exp
 #define SQRT   sqrt
@@ -140,10 +162,10 @@
 #define RES_GRID_MARK       0x03
 #define RES_GRID_START      0x04
 #define RES_GRID_END        0x05
-#define     RES_GRID_NBRS       0x06
-#define     RES_GRID_NBRS_CP    0x07
+#define RES_GRID_NBRS       0x06
+#define RES_GRID_NBRS_CP    0x07
-#define     RES_SYSTEM_ATOMS            0x10
+#define RES_SYSTEM_ATOMS            0x10
 #define RES_REAX_INT_SBP    0x20
@@ -154,58 +176,58 @@
-#define      RES_STORAGE                        0x401
-#define      RES_STORAGE_HBOND_INDEX        0x402
-#define      RES_STORAGE_TOTAL_BOND_ORDER   0x403
-#define      RES_STORAGE_DELTAP             0x404
-#define      RES_STORAGE_DELTAP_BOC         0x404
-#define      RES_STORAGE_DDELTAP_SELF       0x405
-#define      RES_STORAGE_DELTA              0x406
-#define      RES_STORAGE_DELTA_LP           0x407
-#define      RES_STORAGE_DELTA_LP_TEMP      0x408
-#define      RES_STORAGE_DDELTA_LP          0x409
-#define      RES_STORAGE_DDELTA_LP_TEMP 0x40A
-#define      RES_STORAGE_DELTA_E                0x40B
-#define      RES_STORAGE_DELTA_BOC          0x40C
-#define      RES_STORAGE_NL                 0x40D
-#define      RES_STORAGE_NLP_TEMP           0x40E
-#define      RES_STORAGE_CLP                    0x40F
-#define      RES_STORAGE_CDDELTA                0x410
-#define      RES_STORAGE_VLPEX              0x411
-#define      RES_STORAGE_DROPTOL                0x412
-#define      RES_STORAGE_W                      0x413
-#define      RES_STORAGE_HDIA_INV           0x414
-#define      RES_STORAGE_B                      0x415
-#define      RES_STORAGE_B_S                    0x416
-#define      RES_STORAGE_B_T                    0x417
-#define      RES_STORAGE_B_PRC              0x418
-#define      RES_STORAGE_B_PRM              0x419
-#define      RES_STORAGE_S_T                    0x41A
-#define      RES_STORAGE_S                      0x41B
-#define      RES_STORAGE_T                      0x41C
-#define      RES_STORAGE_Y                      0x41D
-#define      RES_STORAGE_Z                      0x41E
-#define      RES_STORAGE_G                      0x41F
-#define      RES_STORAGE_HS                 0x420
-#define      RES_STORAGE_HC                 0x421
-#define      RES_STORAGE_RN                 0x422
-#define      RES_STORAGE_V                  0x423
-#define      RES_STORAGE_H                      0x424
-#define      RES_STORAGE_R                      0x425
-#define      RES_STORAGE_D                      0x426
-#define      RES_STORAGE_Q                      0x427
-#define      RES_STORAGE_P                      0x428
-#define      RES_STORAGE_A                      0x429
-#define      RES_STORAGE_F_OLD              0x42A
-#define      RES_STORAGE_V_CONST                0x42B
-#define      RES_STORAGE_MARK                   0x42C
-#define      RES_STORAGE_OLD_MARK           0x42D
-#define      RES_STORAGE_X_OLD              0x42E
-#define      RES_STORAGE_NLP                    0x42F
-#define      RES_STORAGE_MAP_SERIALS        0x430
-#define     RES_STORAGE_RESTRICTED          0x431
-#define      RES_STORAGE_RESTRICTED_LIST    0x432
-#define      RES_STORAGE_ORIG_ID                0x433
+#define RES_STORAGE                    0x401
+#define RES_STORAGE_HBOND_INDEX        0x402
+#define RES_STORAGE_DELTAP             0x404
+#define RES_STORAGE_DELTAP_BOC         0x404
+#define RES_STORAGE_DDELTAP_SELF       0x405
+#define RES_STORAGE_DELTA              0x406
+#define RES_STORAGE_DELTA_LP           0x407
+#define RES_STORAGE_DELTA_LP_TEMP      0x408
+#define RES_STORAGE_DDELTA_LP          0x409
+#define RES_STORAGE_DELTA_E                0x40B
+#define RES_STORAGE_DELTA_BOC          0x40C
+#define RES_STORAGE_NL                 0x40D
+#define RES_STORAGE_NLP_TEMP           0x40E
+#define RES_STORAGE_CLP                    0x40F
+#define RES_STORAGE_CDDELTA                0x410
+#define RES_STORAGE_VLPEX              0x411
+#define RES_STORAGE_DROPTOL                0x412
+#define RES_STORAGE_W                      0x413
+#define RES_STORAGE_HDIA_INV           0x414
+#define RES_STORAGE_B                      0x415
+#define RES_STORAGE_B_S                    0x416
+#define RES_STORAGE_B_T                    0x417
+#define RES_STORAGE_B_PRC              0x418
+#define RES_STORAGE_B_PRM              0x419
+#define RES_STORAGE_S_T                    0x41A
+#define RES_STORAGE_S                      0x41B
+#define RES_STORAGE_T                      0x41C
+#define RES_STORAGE_Y                      0x41D
+#define RES_STORAGE_Z                      0x41E
+#define RES_STORAGE_G                      0x41F
+#define RES_STORAGE_HS                 0x420
+#define RES_STORAGE_HC                 0x421
+#define RES_STORAGE_RN                 0x422
+#define RES_STORAGE_V                  0x423
+#define RES_STORAGE_H                      0x424
+#define RES_STORAGE_R                      0x425
+#define RES_STORAGE_D                      0x426
+#define RES_STORAGE_Q                      0x427
+#define RES_STORAGE_P                      0x428
+#define RES_STORAGE_A                      0x429
+#define RES_STORAGE_F_OLD              0x42A
+#define RES_STORAGE_V_CONST                0x42B
+#define RES_STORAGE_MARK                   0x42C
+#define RES_STORAGE_OLD_MARK           0x42D
+#define RES_STORAGE_X_OLD              0x42E
+#define RES_STORAGE_NLP                    0x42F
+#define RES_STORAGE_MAP_SERIALS        0x430
+#define RES_STORAGE_RESTRICTED          0x431
+#define RES_STORAGE_ORIG_ID                0x433
 #define RES_CONTROL_PARAMS  0x50
@@ -224,7 +246,6 @@
 #define RES_SCRATCH                     0x90
 #define LIST_INDEX                      0x00
 #define  LIST_END_INDEX                 0x01
 #define LIST_FAR_NEIGHBOR_DATA      0x10
@@ -288,9 +309,6 @@
 #define MATVEC_THREADS_PER_ROW              32
 typedef double real;
 typedef real rvec[3];
 typedef int  ivec[3];
@@ -309,7 +327,6 @@ enum {WRITE_ASCII, WRITE_BINARY, RF_N};
 /* Global params mapping */
 l[0]  = p_boc1
@@ -352,7 +369,6 @@ l[36] = N/A
 l[37] = version number
 l[38] = p_coa3
 typedef struct
     int n_global;
@@ -361,7 +377,6 @@ typedef struct
 } global_parameters;
 typedef struct
     /* Line one in field file */
@@ -405,7 +420,6 @@ typedef struct
 } single_body_parameters;
 /* Two Body Parameters */
 typedef struct
@@ -435,7 +449,6 @@ typedef struct
 } two_body_parameters;
 /* 3-body parameters */
 typedef struct
@@ -458,7 +471,6 @@ typedef struct
 } three_body_header;
 /* hydrogen-bond parameters */
 typedef struct
@@ -466,7 +478,6 @@ typedef struct
 } hbond_parameters;
 /* 4-body parameters */
 typedef struct
@@ -560,7 +571,6 @@ typedef struct
     int   *end;
     ivec  *nbrs;
     rvec  *nbrs_cp;
 } grid;
@@ -768,8 +778,6 @@ typedef struct
     reax_timing timing;
     reax_timing d_timing;
     void *d_simulation_data;
 } simulation_data;
@@ -837,6 +845,7 @@ typedef struct
     rvec dBO, dBOpi, dBOpi2;
 } dbond_data;
 typedef struct
     real BO, BO_s, BO_pi, BO_pi2;
@@ -847,6 +856,7 @@ typedef struct
     rvec dBOp, dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
 } bond_order_data;
 typedef struct
     int nbr;
@@ -886,6 +896,7 @@ typedef struct
     real val;
 } sparse_matrix_entry;
 typedef struct
     int n, m;
@@ -914,6 +925,7 @@ typedef struct
     int gcell_atoms;
 } reallocate_data;
 typedef struct
     /* bond order related storage */
@@ -999,7 +1011,6 @@ typedef struct
 } list;
 typedef struct
     FILE *trj;
@@ -1070,12 +1081,12 @@ typedef struct
 } LR_data;
 typedef struct
     real a, b, c, d;
 } cubic_spline_coef;
 typedef struct
     real xmin, xmax;
@@ -1126,8 +1137,7 @@ typedef void (*get_far_neighbors_function)(rvec, rvec, simulation_box*,
-// CUDA structures
+/* CUDA structures */
 extern list *dev_lists;
 extern static_storage *dev_workspace;
 extern LR_lookup_table *d_LR;
@@ -1138,15 +1148,5 @@ extern void *scratch;
 extern int MATVEC_BLOCKS;
-#ifdef __USE_GPU__
-extern cublasStatus_t cublasStatus;
-extern cublasHandle_t cublasHandle;
-extern cusparseHandle_t cusparseHandle;
-extern cusparseStatus_t cusparseStatus;
-extern cusparseMatDescr_t matdescriptor;
diff --git a/PuReMD-GPU/src/neighbors.c b/PuReMD-GPU/src/neighbors.c
new file mode 100644
index 0000000000000000000000000000000000000000..5f425e672080d2d4a272f7aca1859c45d8dde17d
--- /dev/null
+++ b/PuReMD-GPU/src/neighbors.c
@@ -0,0 +1,698 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "neighbors.h"
+#include "box.h"
+#include "grid.h"
+#include "index_utils.h"
+#include "list.h"
+#include "reset_utils.h"
+#include "system_props.h"
+#include "vector.h"
+int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
+        real cutoff, far_neighbor_data *data ) /* Tests whether x2 is within
+   `cutoff` of x1. Per axis, when the squared coordinate difference is at
+   least SQR(box->box_norms[i] / 2.0), the difference is shifted by one
+   box->box_norms[i] and the direction of the shift is stored in
+   data->rel_box[i] (-1, +1, or 0 when no shift was applied).
+   data->dvec and data->rel_box are always written; data->d is written only
+   when the function returns 1, i.e. when the squared length of the
+   resulting vector is <= SQR(cutoff). Returns 0 otherwise. */
+    real norm_sqr, d, tmp;
+    int i;
+    norm_sqr = 0; /* accumulates the squared length of the (shifted) separation */
+    for( i = 0; i < 3; i++ ) { 
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
+            if( x2[i] > x1[i] ) { /* x2 has the larger coordinate: shift d down by box_norms[i] */
+                d -= box->box_norms[i];
+                data->rel_box[i] = -1; 
+            }   
+            else { /* otherwise shift d up by box_norms[i] */
+                d += box->box_norms[i];
+                data->rel_box[i] = +1; 
+            }   
+            data->dvec[i] = d;
+            norm_sqr += SQR(d); /* tmp is stale after the shift, so square again */
+        }   
+        else {
+            data->dvec[i] = d;
+            norm_sqr += tmp;
+            data->rel_box[i] = 0;
+        }   
+    }
+    if( norm_sqr <= SQR(cutoff) ){ /* compare squared values; sqrt is taken only for accepted pairs */
+        data->d = sqrt(norm_sqr);
+        return 1;
+    }
+    return 0;
+void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control ) /* Fills the FAR_NBRS list
+   of *lists. After Bin_Atoms, every atom of every grid cell is visited; for
+   each entry of that cell's neighbor-cell table (g->nbrs) whose
+   DistSqr_to_CP value is within SQR(control->vlist_cut), each atom2 of the
+   neighbor cell with atom2 < atom1 is tested with Are_Far_Neighbors and, when
+   accepted, appended to far_nbrs->select.far_nbr_list. The range owned by
+   atom1 is delimited with Set_Start_Index / Set_End_Index.
+   The elapsed time measured from after Bin_Atoms is added to
+   data->timing.nbrs. The process exits with INSUFFICIENT_SPACE when more
+   entries were produced than far_nbrs->num_intrs.
+   out_control is not referenced in the visible body. */
+    int  i, j, k, l, m, itr;
+    int  x, y, z;
+    int  atom1, atom2, max;
+    int  num_far;
+    int  *nbr_atoms;
+    ivec *nbrs;
+    rvec *nbrs_cp;
+    grid *g;
+    list *far_nbrs;
+    far_neighbor_data *nbr_data;
+    real t_start, t_elapsed;
+    // fprintf( stderr, "\n\tentered nbrs - " );
+    g = &( system->g );
+    far_nbrs = (*lists) + FAR_NBRS;
+    Bin_Atoms( system, workspace );
+    t_start = Get_Time( ); /* timer starts after binning, so Bin_Atoms is not part of timing.nbrs */
+    // fprintf( stderr, "atoms sorted - " );
+    num_far = 0; /* running write position in far_nbr_list, shared by all atoms */
+    /* first pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ];
+                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+                /* pick up an atom from the current cell */
+                for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){
+                    atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ];
+                    Set_Start_Index( atom1, num_far, far_nbrs );
+                    //fprintf( stderr, "\tatom %d\n", atom1 );
+                    itr = 0;
+                    while( nbrs[itr][0] >= 0 ){ /* the table ends at the first entry with a negative x index */
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+                        //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
+                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                                SQR(control->vlist_cut) ) {     
+                            nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ];
+                            max = g->top[ index_grid_3d (x,y,z,g) ];
+                            //fprintf( stderr, "\t\tmax: %d\n", max );
+                            /* pick up another atom from the neighbor cell */
+                            for( m = 0; m < max; ++m ) {
+                                atom2 = nbr_atoms[m];
+                                if( atom1 > atom2 ) { /* strict: a pair is stored once, on the higher-index atom */
+                                    nbr_data = &(far_nbrs->select.far_nbr_list[num_far]);
+                                    /* NOTE(review): Are_Far_Neighbors writes through nbr_data
+                                       before the num_far > num_intrs check further down runs;
+                                       if num_intrs is the allocated capacity this can write
+                                       past the end of the list - confirm. */
+                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
+                                                system->atoms[atom2].x, 
+                                                &(system->box), control->vlist_cut, 
+                                                nbr_data)) {
+                                        nbr_data->nbr = atom2;
+                                        ++num_far; /* slot is kept only for accepted pairs; otherwise it is reused */
+                                    }
+                                }
+                            }
+                        }
+                        ++itr;
+                    }
+                    Set_End_Index( atom1, num_far, far_nbrs );
+                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", 
+                    //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
+                    //  itr); 
+                }
+            }
+    fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far); /* NOTE(review): printed on every call, not guarded by a debug macro */
+    if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) {
+        workspace->realloc.num_far = num_far; /* records the count; presumably consumed by reallocation logic elsewhere - confirm */
+        if( num_far > far_nbrs->num_intrs ){
+            fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
+                    data->step, num_far, far_nbrs->num_intrs );
+            exit( INSUFFICIENT_SPACE );
+        }
+    }
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.nbrs += t_elapsed;
+#if defined(DEBUG)
+    for( i = 0; i < system->N; ++i ) { /* sort each atom's range of the far list with compare_far_nbrs */
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
+                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
+                compare_far_nbrs ); 
+    }
+#if defined(DEBUG_FOCUS)  
+    //fprintf( stderr, "nbrs - ");
+    //fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
+#if defined(TEST_ENERGY)
+    //Print_Far_Neighbors( system, control, workspace, lists );
+int Estimate_NumNeighbors( reax_system *system, control_params *control, 
+        static_storage *workspace, list **lists ) /* Counts far-neighbor pairs
+   without storing them. Runs the same cell / neighbor-cell traversal and the
+   same atom1 > atom2 and Are_Far_Neighbors tests as Generate_Neighbor_Lists,
+   using a local scratch far_neighbor_data. Also sets g->max_cuda_nbrs to the
+   largest number of pairs counted for any single atom1.
+   Returns num_far * SAFE_ZONE (converted to int by the return type).
+   `lists` is not referenced in the visible body. */
+    int  i, j, k, l, m, itr;
+    int  x, y, z;
+    int  atom1, atom2, max;
+    int  num_far;
+    int  *nbr_atoms;
+    ivec *nbrs;
+    rvec *nbrs_cp;
+    grid *g;
+    far_neighbor_data nbr_data; /* scratch output for Are_Far_Neighbors; contents are discarded */
+    int start = 0, finish = 0;
+    // fprintf( stderr, "\n\tentered nbrs - " );
+    g = &( system->g );
+    Bin_Atoms( system, workspace );
+    // fprintf( stderr, "atoms sorted - " );
+    num_far = 0;
+    g->max_cuda_nbrs = 0;
+    /* first pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ];
+                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+                /* pick up an atom from the current cell */
+                for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){
+                    atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ];
+                    start = num_far; /* count before this atom, used for the per-atom total below */
+                    itr = 0;
+                    while( nbrs[itr][0] >= 0 ){ /* the table ends at the first entry with a negative x index */
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+                        //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
+                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                                SQR(control->vlist_cut) ) {     
+                            nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ];
+                            max = g->top[index_grid_3d (x,y,z,g) ];
+                            //fprintf( stderr, "\t\tmax: %d\n", max );
+                            /* pick up another atom from the neighbor cell -
+                               only atom2 < atom1 is counted (strict inequality below), so a
+                               pair is counted once and an atom is never paired with itself */
+                            for( m = 0; m < max; ++m ) {
+                                atom2 = nbr_atoms[m];
+                                //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) {
+                                if( atom1 > atom2 ) {
+                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
+                                                system->atoms[atom2].x, 
+                                                &(system->box), control->vlist_cut, 
+                                                &nbr_data))
+                                        ++num_far;
+                                }
+                            }
+                        }
+                        ++itr;
+                    }
+                    // pairs counted for atom1 = finish - start; keep the running maximum
+                    finish = num_far;
+                    if (g->max_cuda_nbrs <= (finish - start)){
+                        g->max_cuda_nbrs    = finish - start;
+                    }
+                }
+            }
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far );
+    return num_far * SAFE_ZONE; /* padded by SAFE_ZONE; presumably used to size the far-neighbor list - confirm */
+//Code not used anymore
+#if defined DONE
+void Choose_Neighbor_Finder( reax_system *system, control_params *control, 
+        get_far_neighbors_function *Get_Far_Neighbors ) /* Stores in
+   *Get_Far_Neighbors the routine matching the boundary conditions:
+   - periodic and all three box_norms > 2.0 * vlist_cut:
+     Get_Periodic_Far_Neighbors_Big_Box
+   - periodic otherwise: Get_Periodic_Far_Neighbors_Small_Box
+   - non-periodic: Get_NonPeriodic_Far_Neighbors
+   NOTE(review): this definition follows the "#if defined DONE" line marked
+   "Code not used anymore"; it is presumably excluded from normal builds -
+   confirm. */
+    if( control->periodic_boundaries )
+    {
+        if( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
+                system->box.box_norms[1] > 2.0 * control->vlist_cut &&
+                system->box.box_norms[2] > 2.0 * control->vlist_cut )
+            (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box;
+        else  (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
+    }
+    else
+        (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors;
+int compare_near_nbrs(const void *v1, const void *v2) /* qsort comparator for near_neighbor_data: orders by ascending nbr field */
+    return ((*(near_neighbor_data *)v1).nbr - (*(near_neighbor_data *)v2).nbr); /* negative, zero or positive as v1.nbr is below, equal to or above v2.nbr */
+int compare_far_nbrs(const void *v1, const void *v2) /* qsort comparator for far_neighbor_data: orders by ascending nbr field */
+    return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr); /* negative, zero or positive as v1.nbr is below, equal to or above v2.nbr */
+inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C,
+        rvec dvec, ivec rel_box/*, rvec ext_factor*/ ) /* Fills *dest with the
+   neighbor index nbr, the distance d, dvec passed through rvec_Scale with
+   factor C, and rel_box passed through ivec_Copy. Unlike Set_Near_Neighbor,
+   rel_box is copied here rather than passed through ivec_Scale. */
+    dest->nbr = nbr;
+    dest->d = d;
+    rvec_Scale( dest->dvec, C, dvec );
+    ivec_Copy( dest->rel_box, rel_box );
+    // rvec_Scale( dest->ext_factor, C, ext_factor );
+inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C,
+        rvec dvec, ivec rel_box/*, rvec ext_factor*/) /* Fills *dest with the
+   neighbor index nbr, the distance d, and both dvec and rel_box passed
+   through rvec_Scale / ivec_Scale with factor C. Callers visible in this
+   file pass C = 1.0 for one atom of a pair and C = -1.0 for the other. */
+    dest->nbr = nbr;
+    dest->d = d;
+    rvec_Scale( dest->dvec, C, dvec );
+    ivec_Scale( dest->rel_box, C, rel_box );
+    // rvec_Scale( dest->ext_factor, C, ext_factor );
+/* In case bond restrictions are applied, this method checks if
+   atom1 and atom2 are allowed to bond with each other.
+   Returns 1 when neither atom has any restriction, or when either atom
+   appears in the other's workspace->restricted_list; returns 0 otherwise.
+   workspace->restricted[a] is used both as the "has restrictions" flag and
+   as the number of entries in workspace->restricted_list[a]. */
+inline int can_Bond( static_storage *workspace, int atom1, int atom2 )
+    int i;
+    // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 );
+    if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
+        return 1;
+    for( i = 0; i < workspace->restricted[ atom1 ]; ++i ) /* is atom2 listed for atom1? */
+        if( workspace->restricted_list[ atom1 ][i] == atom2 )
+            return 1;
+    for( i = 0; i < workspace->restricted[ atom2 ]; ++i ) /* is atom1 listed for atom2? */
+        if( workspace->restricted_list[ atom2 ][i] == atom1 )
+            return 1;
+    return 0;
+/* check if atom2 is on atom1's near neighbor list:
+   linearly scans entries Start_Index(atom1) .. End_Index(atom1)-1 of
+   near_nbrs and returns 1 on the first entry whose nbr equals atom2,
+   0 if there is none. Only atom1's list is searched. */
+inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 )
+    int i;
+    for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i )
+        if( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
+        {
+            // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 );
+            return 1;
+        }
+    return 0;
+void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control ) /* Older variant that
+   builds the far-neighbor list through a Get_Far_Neighbors function pointer
+   (chosen by Choose_Neighbor_Finder), then adds restricted bonds to the
+   near-neighbor list, checks both lists for overlapping per-atom ranges,
+   sorts the near lists and prints per-atom averages.
+   NOTE(review): this definition follows the "#if defined DONE" line marked
+   "Code not used anymore" and uses the g->nbrs[i][j][k] style of grid
+   indexing, unlike the index_grid_* helpers used by the first definition in
+   this file. Several names used below (nbrs, nbrs_cp, itr, num_near,
+   max_atoms, top_far1, near_nbrs, top_near1, top_near2) have no visible
+   declaration in this function - presumably it no longer compiles; confirm
+   before enabling. */
+    int  i, j, k;
+    int  x, y, z;
+    int  *nbr_atoms;
+    int  atom1, atom2, max;
+    int   num_far;
+    int   c, count;
+    int   grid_top;
+    grid *g = &( system->g );  
+    list *far_nbrs = (*lists) + FAR_NBRS;
+    //int   hb_type1, hb_type2;
+    //list *hbonds = (*lists) + HBOND;
+    //int   top_hbond1, top_hbond2;
+    get_far_neighbors_function Get_Far_Neighbors;
+    far_neighbor_data new_nbrs[125]; /* output buffer handed to Get_Far_Neighbors; `count` reports how many entries it filled */
+    int   l, m;
+    // fprintf( stderr, "\n\tentered nbrs - " );
+    if( control->ensemble == iNPT || control->ensemble == sNPT || 
+            control->ensemble == NPT )
+        Update_Grid( system );
+    // fprintf( stderr, "grid updated - " );
+    Bin_Atoms( system, out_control );
+    // fprintf( stderr, "atoms sorted - " );
+    Cluster_Atoms( system, workspace );
+    // fprintf( stderr, "atoms clustered - " );
+    Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
+    // fprintf( stderr, "function chosen - " );  
+    Reset_Neighbor_Lists( system, workspace, lists );  
+    // fprintf( stderr, "lists cleared - " );
+    num_far = 0;
+    num_near = 0;
+    c = 0;
+    /* first pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs = g->nbrs[i][j][k];
+                nbrs_cp = g->nbrs_cp[i][j][k];
+                /* pick up an atom from the current cell */
+                //#ifdef REORDER_ATOMS
+                //  for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++)
+                //#else
+                for(l = 0; l < g->top[i][j][k]; ++l ){
+                    atom1 = g->atoms[i][j][k][l];
+                    Set_End_Index( atom1, num_far, far_nbrs ); /* NOTE(review): the other definitions in this file call Set_Start_Index at this point - confirm */
+                    // fprintf( stderr, "atom %d:\n", atom1 );
+                    itr = 0;
+                    while( nbrs[itr][0] > 0 ){ /* NOTE(review): stops at x <= 0, whereas the first definition in this file uses >= 0; no ++itr is visible inside this loop */
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                        //     SQR(control->r_cut))     
+                        nbr_atoms = g->atoms[x][y][z];
+                        max_atoms = g->top[x][y][z];
+                        /* pick up another atom from the neighbor cell -
+                           we have to compare atom1 with its own periodic images as well, 
+                           that's why there is also equality in the if stmt below */
+                        //#ifdef REORDER_ATOMS
+                        //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++)
+                        //#else
+                        for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) /* NOTE(review): `max` is never assigned here (max_atoms is); the update step also reads nbr_atoms[m] before m < max is re-checked */
+                            if( atom1 >= atom2 ) {
+                                //fprintf( stderr, "\tatom2 %d", atom2 );
+                                //top_near1 = End_Index( atom1, near_nbrs );
+                                //Set_Start_Index( atom1, num_far, far_nbrs );
+                                //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond;
+                                Get_Far_Neighbors( system->atoms[atom1].x,
+                                        system->atoms[atom2].x, 
+                                        &(system->box), control, new_nbrs, &count );
+                                fprintf( stderr, "\t%d count:%d\n", atom2, count );
+                                for( c = 0; c < count; ++c )
+                                    if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ /* for atom1 == atom2 keep only entries with d >= 0.1 */
+                                        Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
+                                                atom2, new_nbrs[c].d, 1.0, 
+                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                        ++num_far;
+                                        /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
+                                          atom1, atom2, new_nbrs[c].d, 
+                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                          new_nbrs[c].dvec[2] ); */
+                                        /* hydrogen bond lists */ 
+                                        /*if( control->hb_cut > 0.1 && 
+                                          new_nbrs[c].d <= control->hb_cut ) {
+                                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                                        hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
+                                        if( hb_type1 == 1 && hb_type2 == 2 ) {
+                                        top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
+                                        Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
+                                        atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
+                                        new_nbrs[c].rel_box );
+                                        Set_End_Index( workspace->hbond_index[atom1], 
+                                        top_hbond1 + 1, hbonds );
+                                        }
+                                        else if( hb_type1 == 2 && hb_type2 == 1 ) {
+                                        top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
+                                        Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
+                                        atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, 
+                                        new_nbrs[c].rel_box );
+                                        Set_End_Index( workspace->hbond_index[atom2], 
+                                        top_hbond2 + 1, hbonds );
+                                        }*/
+                                    }
+                                }
+                        }
+                    Set_End_Index( atom1, top_far1, far_nbrs );
+                }
+            }
+    fprintf( stderr, "nbrs done-" );
+    /* apply restrictions on near neighbors only */
+    if( (data->step - data->prev_steps) < control->restrict_bonds ) {
+        for( atom1 = 0; atom1 < system->N; ++atom1 )
+            if( workspace->restricted[ atom1 ] ) {
+                // fprintf( stderr, "atom1: %d\n", atom1 );
+                top_near1 = End_Index( atom1, near_nbrs );
+                for( j = 0; j < workspace->restricted[ atom1 ]; ++j )
+                    if(!is_Near_Neighbor(near_nbrs, atom1, 
+                                atom2 = workspace->restricted_list[atom1][j])) {
+                        fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n",
+                                atom1, atom2 );
+                        top_near2 = End_Index( atom2, near_nbrs );          
+                        /* we just would like to get the nearest image, so a call to 
+                           Get_Periodic_Far_Neighbors_Big_Box is good enough. */
+                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, 
+                                system->atoms[ atom2 ].x, 
+                                &(system->box), control, 
+                                new_nbrs, &count );
+                        Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]),
+                                atom2, new_nbrs[c].d, 1.0, /* NOTE(review): c still holds whatever the loops above left in it - confirm the intended index */
+                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                        ++top_near1;
+                        Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]),
+                                atom1, new_nbrs[c].d, -1.0, 
+                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                        Set_End_Index( atom2, top_near2+1, near_nbrs );
+                    }
+                Set_End_Index( atom1, top_near1, near_nbrs );
+            }
+    }
+    // fprintf( stderr, "restrictions applied-" );
+    /* verify nbrlists, count num_intrs, sort nearnbrs */
+    near_nbrs->num_intrs = 0;
+    far_nbrs->num_intrs = 0;
+    for( i = 0; i < system->N-1; ++i ) { /* stops at N-2 because atom i is compared with atom i+1; the last atom's entries are therefore not added to num_intrs */
+        if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) {
+            fprintf( stderr, 
+                    "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
+                    data->step, i+1, i );
+            exit( 1 );
+        }
+        near_nbrs->num_intrs += Num_Entries(i, near_nbrs);
+        if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) {
+            fprintf( stderr, 
+                    "step%3d: farnbr list of atom%d is overwritten by atom%d\n", 
+                    data->step, i+1, i );
+            exit( 1 );
+        }
+        far_nbrs->num_intrs += Num_Entries(i, far_nbrs);
+    }
+    for( i = 0; i < system->N; ++i ) {
+        qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]),
+                Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), 
+                compare_near_nbrs );
+    }
+    // fprintf( stderr, "near nbrs sorted\n" );
+    /* for( i = 0; i < system->N; ++i ) {
+       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
+       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
+       compare_far_nbrs ); 
+       } */
+    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", 
+            num_near / system->N );
+    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", 
+            num_far / system->N, control->max_far_nbrs );
+    //fprintf( stderr, "step%d: num of nearnbrs = %6d   num of farnbrs: %6d\n",
+    //       data->step, num_near, num_far );
+    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n", 
+    //   system->N * near_nbrs->intrs_per_unit, 
+    //   system->N * far_nbrs->intrs_per_unit );
+void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control ) /* Older far-list-only
+   variant: chooses a Get_Far_Neighbors routine via Choose_Neighbor_Finder,
+   walks every atom of every grid cell and its neighbor-cell table, and
+   appends each entry returned for atom2 <= atom1 to the FAR_NBRS list with
+   Set_Far_Neighbor, then stores the total in far_nbrs->num_intrs.
+   Prints a trace line to stderr for every cell, atom, neighbor cell and pair.
+   NOTE(review): this definition follows the "#if defined DONE" line marked
+   "Code not used anymore" and uses the g->nbrs[i][j][k] style of grid
+   indexing, unlike the index_grid_* helpers used by the first definition in
+   this file - presumably it is not compiled; confirm. */
+    int  i, j, k, l, m, itr;
+    int  x, y, z;
+    int  atom1, atom2, max;
+    int  num_far, c, count;
+    int  *nbr_atoms;
+    ivec *nbrs;
+    rvec *nbrs_cp;
+    grid *g;
+    list *far_nbrs;
+    get_far_neighbors_function Get_Far_Neighbors;
+    far_neighbor_data new_nbrs[125]; /* output buffer handed to Get_Far_Neighbors; `count` reports how many entries it filled */
+    g = &( system->g );
+    far_nbrs = (*lists) + FAR_NBRS;
+    // fprintf( stderr, "\n\tentered nbrs - " );
+    if( control->ensemble == iNPT || 
+            control->ensemble == sNPT || 
+            control->ensemble == NPT )
+        Update_Grid( system );
+    // fprintf( stderr, "grid updated - " );
+    Bin_Atoms( system, out_control );
+    // fprintf( stderr, "atoms sorted - " );
+    Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
+    // fprintf( stderr, "function chosen - " );  
+    Reset_Neighbor_Lists( system, workspace, lists );  
+    // fprintf( stderr, "lists cleared - " );
+    num_far = 0;
+    c = 0;
+    /* first pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs = g->nbrs[i][j][k];
+                nbrs_cp = g->nbrs_cp[i][j][k]; /* assigned but only referenced in the commented-out distance test below */
+                fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+                /* pick up an atom from the current cell */
+                for(l = 0; l < g->top[i][j][k]; ++l ){
+                    atom1 = g->atoms[i][j][k][l];
+                    Set_Start_Index( atom1, num_far, far_nbrs );
+                    fprintf( stderr, "\tatom %d\n", atom1 );
+                    itr = 0;
+                    while( nbrs[itr][0] > 0 ){ /* NOTE(review): stops at x <= 0, whereas the first definition in this file uses >= 0 - confirm */
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+                        fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
+                        // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                        //     SQR(control->r_cut))     
+                        nbr_atoms = g->atoms[x][y][z];
+                        max = g->top[x][y][z];
+                        fprintf( stderr, "\t\tmax: %d\n", max );
+                        /* pick up another atom from the neighbor cell -
+                           we have to compare atom1 with its own periodic images as well, 
+                           that's why there is also equality in the if stmt below */
+                        for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] ) /* NOTE(review): nbr_atoms[m] is read in the init and update steps before m < max is checked, so nbr_atoms[max] is read once */
+                            if( atom1 >= atom2 ) {
+                                Get_Far_Neighbors( system->atoms[atom1].x,
+                                        system->atoms[atom2].x, 
+                                        &(system->box), control, new_nbrs, &count );
+                                fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count );
+                                for( c = 0; c < count; ++c )
+                                    if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){ /* for atom1 == atom2 keep only entries with d >= 0.1 */
+                                        Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
+                                                atom2, new_nbrs[c].d, 1.0, 
+                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                        ++num_far;
+                                        /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
+                                          atom1, atom2, new_nbrs[c].d, 
+                                          new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                          new_nbrs[c].dvec[2] ); */
+                                    }
+                            }
+                        ++itr;
+                    }
+                    Set_End_Index( atom1, num_far, far_nbrs );
+                }
+            }
+    far_nbrs->num_intrs = num_far;  
+    fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
+#if defined(DEBUG)
+    for( i = 0; i < system->N; ++i ) { /* sort each atom's range of the far list with compare_far_nbrs */
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
+                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
+                compare_far_nbrs ); 
+    }
+    fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far );
+    fprintf( stderr, "\tallocated farnbrs: %6d\n", 
+            system->N * far_nbrs->intrs_per_unit );
diff --git a/PuReMD-GPU/src/neighbors.cu b/PuReMD-GPU/src/neighbors.cu
deleted file mode 100644
index 90779538353a05eca7bb04215ebd33b3ffd81e35..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/neighbors.cu
+++ /dev/null
@@ -1,1413 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "neighbors.h"
-#include "box.h"
-#include "grid.h"
-#include "list.h"
-#include "reset_utils.h"
-#include "system_props.h"
-#include "vector.h"
-#include "index_utils.h"
-#include "cuda_utils.h"
-extern inline DEVICE int index_grid (int blocksize) /* Linearises the 3D block
- * index (blockIdx.x, blockIdx.y, blockIdx.z), using gridDim.y and gridDim.z as
- * the strides, and multiplies the result by blocksize. The value is therefore
- * the offset of the calling block's first entry in an array that stores
- * `blocksize` entries per block. */
-    return blockIdx.x * gridDim.y * gridDim.z * blocksize +  
-        blockIdx.y * gridDim.z * blocksize +  
-        blockIdx.z * blocksize ;
-extern inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize) /* Same
- * linearisation as a 3D cell index scaled by blocksize, but taking the cell
- * coordinates (x, y, z) as arguments and using a hard-coded extent of 8 for
- * the y and z dimensions. NOTE(review): the literal 8 presumably matches a
- * fixed debug grid size -- confirm before reusing. */
-    return x * 8 * 8 * blocksize +  
-        y * 8 * blocksize +  
-        z * blocksize ;
-inline HOST_DEVICE real DistSqr_to_CP( rvec cp, rvec x ) /* Returns the sum of
- * SQR(cp[i] - x[i]) over the three axes, counting only the axes whose cp[i]
- * is greater than NEG_INF. Axes with cp[i] <= NEG_INF contribute nothing
- * (presumably NEG_INF marks "no constraint along this axis" -- TODO confirm
- * against the code that fills the cp arrays). */
-    int  i;
-    real d_sqr = 0;
-    for( i = 0; i < 3; ++i )
-        if( cp[i] > NEG_INF ) // skip axes flagged with the NEG_INF sentinel
-            d_sqr += SQR( cp[i] - x[i] );
-    return d_sqr;
-HOST_DEVICE int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-        real cutoff, far_neighbor_data *data ) /* Tests whether x1 and x2 are
- * within `cutoff` of each other, shifting the separation by one box length
- * on every axis where the raw separation is at least half the box length.
- *
- * On every call data->dvec receives the (possibly shifted) displacement
- * x2 - x1 and data->rel_box the per-axis shift flag (-1, 0 or +1).
- * data->d is written only when the pair is accepted; data->nbr is never
- * written here and is left to the caller.
- *
- * Returns 1 when the squared length is <= SQR(cutoff), otherwise 0. */
-    real norm_sqr, d, tmp;
-    int i;
-    norm_sqr = 0;
-    for( i = 0; i < 3; i++ ) { 
-        d = x2[i] - x1[i];
-        tmp = SQR(d);
-        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {     // |d| is at least half the box length on this axis
-            if( x2[i] > x1[i] ) {  // x2 is ahead of x1: pull it back by one box length
-                d -= box->box_norms[i];
-                data->rel_box[i] = -1; 
-            }   
-            else { // x2 is behind (or equal): push it forward by one box length
-                d += box->box_norms[i];
-                data->rel_box[i] = +1; 
-            }   
-            data->dvec[i] = d;
-            norm_sqr += SQR(d);
-        }   
-        else { // no shift needed on this axis; reuse the square computed above
-            data->dvec[i] = d;
-            norm_sqr += tmp;
-            data->rel_box[i] = 0;
-        }   
-    }
-    if( norm_sqr <= SQR(cutoff) ){ // the square root is taken only for accepted pairs
-        data->d = sqrt(norm_sqr);
-        return 1;
-    }
-    return 0;
-void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control ) /* Host-side construction
- * of the far-neighbor list.
- *
- * After Bin_Atoms() the function visits every grid cell (i,j,k) and every
- * atom stored in it (atom1). For each atom it walks the cell's neighbor-cell
- * table (terminated by an entry whose first component is negative), skips
- * neighbor cells for which DistSqr_to_CP() exceeds SQR(vlist_cut), and tests
- * each atom2 of the remaining cells with Are_Far_Neighbors(). Only pairs with
- * atom1 > atom2 are tested, so each pair is stored once, in the list of the
- * higher-numbered atom.
- *
- * Outputs: entries of far_nbrs (= (*lists) + FAR_NBRS) and its per-atom
- * start/end indices; workspace->realloc.num_far when usage passes the
- * DANGER_ZONE fraction; data->timing.nbrs is incremented by the elapsed
- * time. The process exits with INSUFFICIENT_SPACE when num_far exceeds
- * far_nbrs->num_intrs. out_control is not referenced in the lines shown.
- *
- * NOTE(review): the space check runs only after the whole list has been
- * written, so if num_intrs is the allocated capacity (as the error message
- * suggests) an overflow has already written past the buffer -- confirm. */
-    int  i, j, k, l, m, itr;
-    int  x, y, z;
-    int  atom1, atom2, max;
-    int  num_far;
-    int  *nbr_atoms;
-    ivec *nbrs;
-    rvec *nbrs_cp;
-    grid *g;
-    list *far_nbrs;
-    far_neighbor_data *nbr_data;
-    real t_start, t_elapsed;
-    // fprintf( stderr, "\n\tentered nbrs - " );
-    g = &( system->g );
-    far_nbrs = (*lists) + FAR_NBRS;
-    Bin_Atoms( system, workspace );
-    t_start = Get_Time( ); // timing starts after binning, so Bin_Atoms is not included
-    // fprintf( stderr, "atoms sorted - " );
-    num_far = 0;
-    /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ];
-                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ];
-                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-                /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){
-                    atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ];
-                    Set_Start_Index( atom1, num_far, far_nbrs );
-                    //fprintf( stderr, "\tatom %d\n", atom1 );
-                    itr = 0;
-                    while( nbrs[itr][0] >= 0 ){ // a negative first component ends the neighbor-cell table
-                        x = nbrs[itr][0];
-                        y = nbrs[itr][1];
-                        z = nbrs[itr][2];
-                        //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
-                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                SQR(control->vlist_cut) ) {     
-                            nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ];
-                            max = g->top[ index_grid_3d (x,y,z,g) ];
-                            //fprintf( stderr, "\t\tmax: %d\n", max );
-                            /* pick up another atom from the neighbor cell */
-                            for( m = 0; m < max; ++m ) {
-                                atom2 = nbr_atoms[m];
-                                if( atom1 > atom2 ) { // each pair is stored once, under the larger atom id
-                                    nbr_data = &(far_nbrs->select.far_nbr_list[num_far]); // candidate slot; filled in place by Are_Far_Neighbors
-                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control->vlist_cut, 
-                                                nbr_data)) {
-                                        nbr_data->nbr = atom2;
-                                        ++num_far; // keep the slot; a rejected pair leaves it to be overwritten
-                                    }
-                                }
-                            }
-                        }
-                        ++itr;
-                    }
-                    Set_End_Index( atom1, num_far, far_nbrs );
-                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", 
-                    //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
-                    //  itr); 
-                }
-            }
-    fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far);
-    if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) {
-        workspace->realloc.num_far = num_far; // record the observed usage in the realloc bookkeeping
-        if( num_far > far_nbrs->num_intrs ){
-            fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
-                    data->step, num_far, far_nbrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
-        }
-    }
-    t_elapsed = Get_Timing_Info( t_start );
-    data->timing.nbrs += t_elapsed;
-#if defined(DEBUG)
-    for( i = 0; i < system->N; ++i ) {
-        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                compare_far_nbrs ); 
-    }
-#if defined(DEBUG_FOCUS)  
-    //fprintf( stderr, "nbrs - ");
-    //fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
-#if defined(TEST_ENERGY)
-    //Print_Far_Neighbors( system, control, workspace, lists );
-int Estimate_NumNeighbors( reax_system *system, control_params *control, 
-        static_storage *workspace, list **lists ) /* Host-side dry run of the
- * far-neighbor search: it performs the same cell / neighbor-cell traversal
- * as the list builder but only counts accepted pairs (atom1 > atom2) and
- * stores nothing.
- *
- * Side effects: calls Bin_Atoms(); resets g->max_cuda_nbrs and raises it to
- * the largest per-atom count seen.
- * Returns num_far * SAFE_ZONE, converted to int by the return type.
- * `lists` is not referenced in the lines shown. */
-    int  i, j, k, l, m, itr;
-    int  x, y, z;
-    int  atom1, atom2, max;
-    int  num_far;
-    int  *nbr_atoms;
-    ivec *nbrs;
-    rvec *nbrs_cp;
-    grid *g;
-    far_neighbor_data nbr_data; // scratch record; Are_Far_Neighbors needs somewhere to write
-    int     start = 0, finish = 0;
-    // fprintf( stderr, "\n\tentered nbrs - " );
-    g = &( system->g );
-    Bin_Atoms( system, workspace );
-    // fprintf( stderr, "atoms sorted - " );
-    num_far = 0;
-    g->max_cuda_nbrs = 0;
-    /* first pick up a cell in the grid */
-    for( i = 0; i < g->ncell[0]; i++ )
-        for( j = 0; j < g->ncell[1]; j++ )
-            for( k = 0; k < g->ncell[2]; k++ ) {
-                nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ];
-                nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ];
-                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-                /* pick up an atom from the current cell */
-                for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){
-                    atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ];
-                    start = num_far; // running total before this atom, used for the per-atom count below
-                    itr = 0;
-                    while( nbrs[itr][0] >= 0 ){ // a negative first component ends the neighbor-cell table
-                        x = nbrs[itr][0];
-                        y = nbrs[itr][1];
-                        z = nbrs[itr][2];
-                        //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
-                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                SQR(control->vlist_cut) ) {     
-                            nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ];
-                            max = g->top[index_grid_3d (x,y,z,g) ];
-                            //fprintf( stderr, "\t\tmax: %d\n", max );
-                            /* pick up another atom from the neighbor cell -
-                               we have to compare atom1 with its own periodic images as well, 
-                               that's why there is also equality in the if stmt below.
-                               NOTE(review): the test below is strict (atom1 > atom2), so an
-                               atom is never compared with itself here; this remark looks
-                               stale. */
-                            for( m = 0; m < max; ++m ) {
-                                atom2 = nbr_atoms[m];
-                                //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) {
-                                if( atom1 > atom2 ) {
-                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control->vlist_cut, 
-                                                &nbr_data))
-                                        ++num_far;
-                                }
-                            }
-                            }
-                            ++itr;
-                        }
-                        // finish note
-                        finish = num_far;
-                        if (g->max_cuda_nbrs <= (finish - start)){ // track the largest per-atom count
-                            g->max_cuda_nbrs    = finish - start;
-                        }
-                    }
-                }
-#if defined(DEBUG_FOCUS)  
-                fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far );
-                return num_far * SAFE_ZONE;
-            }
-    GLOBAL void Estimate_NumNeighbors ( reax_atom *sys_atoms,
-            grid g,
-            simulation_box *box,
-            control_params *control,
-            int *indices) /* Per-cell neighbor-count kernel.
-     *
-     * The block index selects a grid cell (through index_grid) and
-     * threadIdx.x selects one atom slot inside that cell; threads whose slot
-     * is beyond the cell's atom count exit immediately. The thread walks the
-     * cell's neighbor-cell table and counts every other atom (atom2 != atom1)
-     * accepted by Are_Far_Neighbors within control->vlist_cut, i.e. both
-     * directions of each pair are counted.
-     *
-     * Output: indices[atom1] = num_far * SAFE_ZONE (stored as int).
-     *
-     * NOTE(review): the only launch visible in this file is commented out
-     * and uses a grid of (ncell[0], ncell[1], ncell[2]) blocks with
-     * g.max_atoms threads each -- confirm before re-enabling. */
-    {
-        int *atoms = g.atoms;
-        int *top = g.top;
-        ivec *nbrs = g.nbrs; 
-        rvec *nbrs_cp = g.nbrs_cp;
-        int *nbr_atoms;
-        int atom1, atom2, l, iter, max, m, num_far;
-        far_neighbor_data nbr_data; // scratch record; only the return value of Are_Far_Neighbors is used
-        int x, y, z, i;
-        if (threadIdx.x >= *(top + index_grid(1))){ // top[cell] = number of atoms in this block's cell
-            return;
-        } 
-        nbrs = nbrs + index_grid (g.max_nbrs); // start of this cell's neighbor-cell table
-        nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
-        atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
-        num_far = 0;
-        iter = 0;
-        while (nbrs[iter][0] >= 0) { // a negative first component ends the table
-            x = nbrs[iter][0];
-            y = nbrs[iter][1];
-            z = nbrs[iter][2];
-            //condition check for cutoff here
-            if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
-                    SQR (control->vlist_cut)) 
-            {
-                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-                max = top [index_grid_3d(x, y, z, &g)];
-                for (m = 0; m < max; m++) {
-                    atom2 = nbr_atoms[m];
-                    //CHANGE ORIGINAL
-                    /*
-                       if (atom1 > atom2) {
-                       if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-                       control->vlist_cut, &nbr_data)){
-                       ++num_far;
-                       }
-                       }
-                     */
-                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-                                    control->vlist_cut, &nbr_data)){
-                            ++num_far;
-                        }
-                    }
-                    else if (atom1 < atom2) { // same test with the larger atom id passed first
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
-                                    control->vlist_cut, &nbr_data)){
-                            ++num_far;
-                        }
-                    }
-                    //CHANGE ORIGINAL
-                }
-            }
-            ++iter;
-        }
-        //indices[ atom1 ] = num_far;// * SAFE_ZONE;
-        indices[ atom1 ] = num_far * SAFE_ZONE;
-    }
-    /* One thread per atom implementation.
-     *
-     * Thread `index` (= blockIdx.x * blockDim.x + threadIdx.x) handles atom
-     * `index`. The atom's grid cell is derived from its coordinates and
-     * g.inv_len; the cell's neighbor-cell table is then walked and every
-     * other atom (atom2 != atom1) accepted by Are_Far_Neighbors within
-     * control->vlist_cut is counted, so both directions of a pair are counted.
-     *
-     * sys_atoms : atom array indexed by atom id; entries 0..N-1 are read
-     * g         : grid descriptor, passed by value
-     * box       : simulation box forwarded to Are_Far_Neighbors
-     * control   : supplies vlist_cut
-     * N         : number of atoms
-     * indices   : output; indices[atom] = count * SAFE_ZONE (stored as int)
-     */
-    GLOBAL void New_Estimate_NumNeighbors (     reax_atom *sys_atoms,
-            grid g,
-            simulation_box *box,
-            control_params* control, 
-            int N, int *indices)
-    {
-        int *atoms = g.atoms;
-        int *top = g.top;
-        ivec *nbrs = g.nbrs; 
-        rvec *nbrs_cp = g.nbrs_cp;
-        int     *nbr_atoms;
-        int   atom1, atom2, iter, max, m, num_far;
-        int     x, y, z, i;
-        int atom_x, atom_y, atom_z;
-        far_neighbor_data temp;
-        rvec atom1_x;
-        int index = blockIdx.x * blockDim.x + threadIdx.x;
-        /* Valid atom ids are 0..N-1. When the grid is rounded up to whole
-         * blocks, surplus threads -- including index == N -- must leave
-         * before touching sys_atoms[] or indices[]. */
-        if (index >= N) return;
-        /* grid cell that contains this atom */
-        atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
-        atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
-        atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
-#ifdef __BNVT_FIX__
-        if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
-        if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
-        if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
-        nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-        nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-        atom1 = index;
-        /* local copy of the position, reused for every distance test */
-        rvec_Copy (atom1_x, sys_atoms [atom1].x );
-        num_far = 0;
-        iter = 0;
-        /* a negative first component ends the neighbor-cell table */
-        while (nbrs[iter][0] >= 0) {
-            x = nbrs[iter][0];
-            y = nbrs[iter][1];
-            z = nbrs[iter][2];
-            if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
-                    SQR (control->vlist_cut)) 
-            {
-                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-                max = top [index_grid_3d(x, y, z, &g)];
-                for (m = 0; m < max; m++) 
-                {
-                    atom2 = nbr_atoms[m];
-                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
-                                    control->vlist_cut, &temp)){
-                            num_far++;
-                        }
-                    }
-                    /* same test with the larger atom id passed first */
-                    else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
-                                    control->vlist_cut, &temp)){
-                            num_far ++;
-                        }
-                    }
-                }
-            }
-            ++iter;
-        }
-        indices [atom1] = num_far * SAFE_ZONE;
-    }
-    /* One thread per entry in the gcell implementation.
-     *
-     * The block index selects a grid cell (through index_grid) and
-     * threadIdx.x selects one atom slot inside it; threads whose slot is
-     * beyond the cell's atom count exit immediately. Each remaining thread
-     * writes the far-neighbor entries of its atom, starting at the position
-     * already stored in the list's start index for that atom, and finally
-     * records the end index.
-     *
-     * Both directions of a pair are stored (atom2 != atom1). In either branch
-     * the position of the larger atom id is passed first to
-     * Are_Far_Neighbors, so dvec/rel_box have the same orientation in both
-     * atoms' lists.
-     *
-     * NOTE(review): nothing in this kernel checks the write position against
-     * the space reserved for the atom; it presumably relies on the start
-     * indices having been sized from a prior estimate -- confirm.
-     */
-    GLOBAL void Generate_Neighbor_Lists (     reax_atom *sys_atoms,
-            grid g,
-            simulation_box *box,
-            control_params* control, 
-            list far_nbrs)
-    {
-        int *atoms = g.atoms;
-        int *top = g.top;
-        ivec *nbrs = g.nbrs; 
-        rvec *nbrs_cp = g.nbrs_cp;
-        int     *nbr_atoms;
-        int   atom1, atom2, l, iter, max, m, num_far;
-        int     x, y, z, i;
-        far_neighbor_data *nbr_data;
-        far_neighbor_data temp; // result is staged here and copied out only when accepted
-        if (threadIdx.x >= *(top + index_grid(1))){ // top[cell] = number of atoms in this block's cell
-            return;
-        } 
-        nbrs = nbrs + index_grid (g.max_nbrs);
-        nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
-        atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
-        num_far = Start_Index (atom1, &far_nbrs); // absolute write position inside far_nbr_list
-        //Set_Start_Index (atom1, 0, &far_nbrs);
-        //num_far =  0;
-        iter = 0;
-        while (nbrs[iter][0] >= 0) { // a negative first component ends the table
-            x = nbrs[iter][0];
-            y = nbrs[iter][1];
-            z = nbrs[iter][2];
-            //condition check for cutoff here
-            if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
-                    SQR (control->vlist_cut)) 
-            {
-                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-                max = top [index_grid_3d(x, y, z, &g)];
-                for (m = 0; m < max; m++) {
-                    atom2 = nbr_atoms[m];
-                    //nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] );
-                    //CHANGE ORIGINAL
-                    /*
-                       if (atom1 > atom2) {
-                       if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-                       control->vlist_cut, &temp)){
-                       nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-                       nbr_data->nbr = atom2;
-                       nbr_data->rel_box[0] = temp.rel_box[0];
-                       nbr_data->rel_box[1] = temp.rel_box[1];
-                       nbr_data->rel_box[2] = temp.rel_box[2];
-                       nbr_data->d = temp.d;
-                       nbr_data->dvec[0] = temp.dvec[0];
-                       nbr_data->dvec[1] = temp.dvec[1];
-                       nbr_data->dvec[2] = temp.dvec[2];
-                       ++num_far;
-                       }
-                       }
-                     */
-                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-                                    control->vlist_cut, &temp)){
-                            nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-                            nbr_data->nbr = atom2;
-                            nbr_data->rel_box[0] = temp.rel_box[0];
-                            nbr_data->rel_box[1] = temp.rel_box[1];
-                            nbr_data->rel_box[2] = temp.rel_box[2];
-                            nbr_data->d = temp.d;
-                            nbr_data->dvec[0] = temp.dvec[0];
-                            nbr_data->dvec[1] = temp.dvec[1];
-                            nbr_data->dvec[2] = temp.dvec[2];
-                            ++num_far;
-                        }
-                    }
-                    else if (atom1 < atom2) { // mirrored entry: the larger atom id is still passed first
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
-                                    control->vlist_cut, &temp)){
-                            nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-                            nbr_data->nbr = atom2;
-                            nbr_data->rel_box[0] = temp.rel_box[0];
-                            nbr_data->rel_box[1] = temp.rel_box[1];
-                            nbr_data->rel_box[2] = temp.rel_box[2];
-                            nbr_data->d = temp.d;
-                            nbr_data->dvec[0] = temp.dvec[0];
-                            nbr_data->dvec[1] = temp.dvec[1];
-                            nbr_data->dvec[2] = temp.dvec[2];
-                            ++num_far;
-                        }
-                    }
-                    //CHANGE ORIGINAL
-                }
-            }
-            ++iter;
-        }
-        //end the far_neighbor list here
-        Set_End_Index (atom1, num_far, &far_nbrs);
-    }
-    /* One thread per atom implementation.
-     *
-     * Thread `index` (= blockIdx.x * blockDim.x + threadIdx.x) handles atom
-     * `index`. The atom's grid cell is derived from its coordinates and
-     * g.inv_len. The thread then walks that cell's neighbor-cell table and
-     * appends one far-neighbor entry for every other atom (atom2 != atom1)
-     * accepted by Are_Far_Neighbors within control->vlist_cut. Writing starts
-     * at the position already stored in the list's start index for the atom;
-     * the end index is recorded when the walk finishes.
-     *
-     * In both branches the position of the larger atom id is passed first to
-     * Are_Far_Neighbors, so dvec/rel_box have the same orientation in both
-     * atoms' lists.
-     *
-     * sys_atoms : atom array indexed by atom id; entries 0..N-1 are read
-     * g         : grid descriptor, passed by value
-     * box       : simulation box forwarded to Are_Far_Neighbors
-     * control   : supplies vlist_cut
-     * far_nbrs  : list descriptor, passed by value; entries and end index
-     *             are written through its pointers
-     * N         : number of atoms
-     *
-     * NOTE(review): the write position is never checked against the space
-     * reserved for the atom; presumably the start indices were sized from a
-     * prior estimate -- confirm.
-     */
-    GLOBAL void New_Generate_Neighbor_Lists (     reax_atom *sys_atoms,
-            grid g,
-            simulation_box *box,
-            control_params* control, 
-            list far_nbrs, int N)
-    {
-        int *atoms = g.atoms;
-        int *top = g.top;
-        ivec *nbrs = g.nbrs; 
-        rvec *nbrs_cp = g.nbrs_cp;
-        int     *nbr_atoms;
-        int   atom1, atom2, l, iter, max, m, num_far;
-        int     x, y, z, i;
-        far_neighbor_data *nbr_data, *my_start;
-        far_neighbor_data temp;
-        int atom_x, atom_y, atom_z;
-        rvec atom1_x;
-        int index = blockIdx.x * blockDim.x + threadIdx.x;
-        /* Valid atom ids are 0..N-1. When the grid is rounded up to whole
-         * blocks, surplus threads -- including index == N -- must leave
-         * before touching sys_atoms[] or the list indices. */
-        if (index >= N) return;
-        /* grid cell that contains this atom */
-        atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
-        atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
-        atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
-#ifdef __BNVT_FIX__
-        if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
-        if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
-        if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
-        nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-        nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-        atom1 = index;
-        /* local copy of the position, reused for every distance test */
-        rvec_Copy (atom1_x, sys_atoms [atom1].x );
-        /* num_far is the absolute write position; my_start is the matching
-         * pointer and the two are advanced together */
-        num_far = Start_Index (atom1, &far_nbrs);
-        my_start = & (far_nbrs.select.far_nbr_list [num_far] );
-        //Set_Start_Index (atom1, 0, &far_nbrs);
-        //num_far =  0;
-        iter = 0;
-        /* a negative first component ends the neighbor-cell table */
-        while (nbrs[iter][0] >= 0) {
-            x = nbrs[iter][0];
-            y = nbrs[iter][1];
-            z = nbrs[iter][2];
-            //condition check for cutoff here
-            //if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
-            if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
-                    SQR (control->vlist_cut)) 
-            {
-                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-                max = top [index_grid_3d(x, y, z, &g)];
-                for (m = 0; m < max; m++) 
-                {
-                    atom2 = nbr_atoms[m];
-                    if (atom1 > atom2) {
-                        if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
-                                    control->vlist_cut, &temp)){
-                            //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-                            nbr_data = my_start;
-                            nbr_data->nbr = atom2;
-                            nbr_data->rel_box[0] = temp.rel_box[0];
-                            nbr_data->rel_box[1] = temp.rel_box[1];
-                            nbr_data->rel_box[2] = temp.rel_box[2];
-                            nbr_data->d = temp.d;
-                            nbr_data->dvec[0] = temp.dvec[0];
-                            nbr_data->dvec[1] = temp.dvec[1];
-                            nbr_data->dvec[2] = temp.dvec[2];
-                            num_far++;
-                            my_start ++;
-                        }
-                    }
-                    /* mirrored entry: the larger atom id is still passed first */
-                    else if (atom1 < atom2) {
-                        if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
-                                    control->vlist_cut, &temp)){
-                            //nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-                            nbr_data = my_start;
-                            nbr_data->nbr = atom2;
-                            nbr_data->rel_box[0] = temp.rel_box[0];
-                            nbr_data->rel_box[1] = temp.rel_box[1];
-                            nbr_data->rel_box[2] = temp.rel_box[2];
-                            nbr_data->d = temp.d;
-                            nbr_data->dvec[0] = temp.dvec[0];
-                            nbr_data->dvec[1] = temp.dvec[1];
-                            nbr_data->dvec[2] = temp.dvec[2];
-                            num_far ++;
-                            my_start ++;
-                        }
-                    }
-                    //CHANGE ORIGINAL
-                }
-            }
-            ++iter;
-        }
-        //end the far_neighbor list here
-        Set_End_Index (atom1, num_far, &far_nbrs);
-    }
-    /* Multiple threads per atom implementation.
-     *
-     * __THREADS_PER_ATOM__ consecutive threads form a group that serves one
-     * atom (atom id = global thread id / __THREADS_PER_ATOM__). The lanes of
-     * a group test the atoms of a neighbor cell in strides of
-     * __THREADS_PER_ATOM__. Lanes that found a neighbor obtain their output
-     * slot from a running sum over the group's flags in shared memory, added
-     * to the group's running total.
-     *
-     * Dynamic shared memory, as indexed below: blockDim.x ints of per-thread
-     * flags (tnbr) followed by one int per group (nbrssofar), i.e. at least
-     * blockDim.x + blockDim.x / __THREADS_PER_ATOM__ ints.
-     *
-     * lane_id is taken with a bit mask, which equals the remainder only when
-     * __THREADS_PER_ATOM__ is a power of two.
-     *
-     * NOTE(review): threads with warp_id >= N return before the
-     * __syncthreads() calls below, so a partially filled last block does not
-     * reach the barriers with all of its threads.
-     * NOTE(review): the leader's scan of tnbr[], the other lanes' reads of
-     * it and the update of nbrssofar[] are not separated by any barrier; the
-     * code appears to rely on the lanes of a group executing in lockstep --
-     * confirm for the target architecture.
-     */
-    GLOBAL void Test_Generate_Neighbor_Lists (     reax_atom *sys_atoms,
-            grid g,
-            simulation_box *box,
-            control_params* control, 
-            list far_nbrs, int N )
-    {
-        extern __shared__ int __nbr[];
-        extern __shared__ int __sofar []; // NOTE(review): unsized extern __shared__ arrays share one base address, so this aliases __nbr; it is not referenced below
-        bool    nbrgen;
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-        int warp_id = thread_id / __THREADS_PER_ATOM__; // group index; used directly as the atom id
-        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); // position of this thread inside its group
-        int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; // group index within the block
-        if (warp_id >= N ) return;
-        int *tnbr = __nbr; // one flag / running-sum slot per thread of the block
-        //int *nbrssofar = __nbr + __THREADS_PER_ATOM__;
-        int *nbrssofar = __nbr + blockDim.x; // one running total per group, stored after the flags
-        int *atoms = g.atoms;
-        int *top = g.top;
-        ivec *nbrs = g.nbrs; 
-        rvec *nbrs_cp = g.nbrs_cp;
-        int     *nbr_atoms;
-        int   atom1, atom2, l, iter, max, m, num_far;
-        int leader = -10; // reassigned only by lanes that found a neighbor
-        int     x, y, z, i;
-        far_neighbor_data *nbr_data, *my_start;
-        far_neighbor_data temp;
-        int atom_x, atom_y, atom_z;
-        atom1 = warp_id;
-        atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]);
-        atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]);
-        atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]);
-#ifdef __BNVT_FIX__
-        if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
-        if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
-        if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
-        nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-        nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-        num_far = Start_Index (atom1, &far_nbrs); // stays at the start position; the count lives in nbrssofar[]
-        my_start = & (far_nbrs.select.far_nbr_list [num_far] );
-        iter = 0;
-        tnbr[threadIdx.x] = 0;
-        if (lane_id == 0) {
-            //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0;
-            nbrssofar [my_bucket] = 0;
-        }
-        __syncthreads ();
-        while ((nbrs[iter][0] >= 0)) { // a negative first component ends the neighbor-cell table
-            x = nbrs[iter][0];
-            y = nbrs[iter][1];
-            z = nbrs[iter][2];
-            tnbr[threadIdx.x] = 0;
-            nbrgen = false;
-            if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= 
-                    SQR (control->vlist_cut)) 
-            {
-                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-                max = top [index_grid_3d(x, y, z, &g)];
-                tnbr[threadIdx.x] = 0;
-                nbrgen = false;
-                m = lane_id ; //0-31
-                int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1); // ceil(max / __THREADS_PER_ATOM__)
-                int iterations = 0;
-                //while (m < max)
-                while (iterations < loopcount) // same trip count for every lane, unlike the m < max form above
-                {
-                    tnbr [threadIdx.x] = 0;
-                    nbrgen = false;
-                    if (m < max) {
-                        atom2 = nbr_atoms[m];
-                        if (atom1 > atom2) {
-                            if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-                                        control->vlist_cut, &temp))
-                            {
-                                tnbr [threadIdx.x] = 1;
-                                nbrgen = true;
-                            }
-                        }
-                        else if (atom1 < atom2) { // mirrored test: the larger atom id is still passed first
-                            if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
-                                        control->vlist_cut, &temp)){
-                                tnbr [threadIdx.x] = 1;
-                                nbrgen = true;
-                            }
-                        }
-                    }
-                    if (nbrgen)
-                    {
-                        //do leader selection here
-                        leader = -1;
-                        //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
-                        for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
-                            if (tnbr[l]){ // the first flagged slot of the group becomes the leader
-                                leader = l;
-                                break;
-                            }
-                        //do the reduction;
-                        if (threadIdx.x == leader) 
-                            for (l = 1; l < __THREADS_PER_ATOM__; l++)
-                                //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)];    
-                                tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)];    
-                    }
-                    //__syncthreads ();
-                    //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1);
-                    //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ;
-                    if (nbrgen)
-                    {
-                        //got the indices
-                        //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1;
-                        nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; // group total so far + this lane's running sum, made zero-based
-                        nbr_data->nbr = atom2;
-                        nbr_data->rel_box[0] = temp.rel_box[0];
-                        nbr_data->rel_box[1] = temp.rel_box[1];
-                        nbr_data->rel_box[2] = temp.rel_box[2];
-                        nbr_data->d = temp.d;
-                        nbr_data->dvec[0] = temp.dvec[0];
-                        nbr_data->dvec[1] = temp.dvec[1];
-                        nbr_data->dvec[2] = temp.dvec[2];
-                        if (threadIdx.x == leader)
-                            //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
-                            nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)]; // last slot of the running sum = neighbors found in this pass
-                    }
-                    m += __THREADS_PER_ATOM__;
-                    iterations ++;
-                    //cleanup
-                    nbrgen = false;
-                    tnbr [threadIdx.x] = 0;
-                }
-            }
-            ++iter;
-        }
-        __syncthreads ();
-        //end the far_neighbor list here
-        if (lane_id == 0)
-            Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs);
-        //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs);
-    }
-    void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, bool estimate)
-    {
-        real t_start, t_elapsed;
-        real t_1, t_2;
-        list *far_nbrs = dev_lists + FAR_NBRS;
-        int *d_indices = (int *) scratch;
-        int *nbrs_start, *nbrs_end;
-        int i, max_nbrs = 0;
-        int nbs;
-        t_start = Get_Time (); 
-        Cuda_Bin_Atoms (system, workspace);
-        Cuda_Bin_Atoms_Sync ( system );
-        if (dev_workspace->realloc.estimate_nbrs > -1) {
-            /*reset the re-neighbor condition */
-            dev_workspace->realloc.estimate_nbrs = -1;
-            //#ifdef __DEBUG_CUDA__
-            fprintf (stderr, "Recomputing the neighbors estimate.... \n");
-            //#endif
-            cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
-            /*
-               dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
-               dim3 threadsperblock (system->g.max_atoms);
-               Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
-               (system->d_atoms, system->d_g, system->d_box, 
-               (control_params *)control->d_control, d_indices);
-               cudaThreadSynchronize ();
-               cudaCheckError ();
-             */
-            nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-            New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> 
-                (     system->d_atoms, system->d_g,
-                    system->d_box, (control_params *)control->d_control,
-                    system->N, d_indices);
-            cudaThreadSynchronize ();
-            cudaCheckError ();
-            int *nbrs_indices = NULL;
-            nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
-            if (nbrs_indices == NULL) 
-            {
-                fprintf (stderr, "Malloc failed for nbrs indices .... \n");
-                exit (1);
-            }
-            memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); 
-            copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
-            for (int i = 1; i <= system->N; i++) 
-                nbrs_indices [i] += nbrs_indices [i-1];
-            copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ );
-            copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ );
-            free (nbrs_indices);
-        }
-        /*
-           One thread per atom Implementation
-           Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
-           (system->d_atoms, system->d_g, system->d_box, 
-           (control_params *)control->d_control, *far_nbrs);
-         */
-        nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + 
-            (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-        /* Multiple threads per atom Implementation */
-        Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, 
-                             (system->d_atoms, system->d_g, system->d_box, 
-                              (control_params *)control->d_control, *far_nbrs, system->N );
-        cudaThreadSynchronize (); 
-        cudaCheckError (); 
-        t_elapsed = Get_Timing_Info (t_start);
-        d_timing.nbrs += t_elapsed;
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed);
-        /*validate neighbors list*/
-        nbrs_start = (int *) calloc (system->N, INT_SIZE);
-        nbrs_end = (int *) calloc (system->N, INT_SIZE);
-        copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
-        copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
-        int device_nbrs = 0;
-        for(i = 0; i < system->N; i++)
-        {
-            if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs)
-                max_nbrs = nbrs_end[i] - nbrs_start[i];
-            device_nbrs += nbrs_end[i] - nbrs_start[i]; 
-        }
-#ifdef __CUDA_TEST__
-        //fprintf (stderr, " New Device count is : %d \n", device_nbrs);
-        //dev_workspace->realloc.num_far = device_nbrs;
-#ifdef __DEBUG_CUDA__
-        fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs );
-        fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs);
-        //validate check here
-        //get the num_far from the list here
-        for (i = 0; i < system->N-1; i++)
-        {
-            if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE )
-            {
-                dev_workspace->realloc.num_far = device_nbrs;
-                //#ifdef __CUDA_MEM__
-                //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
-                //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", 
-                //                            i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]);
-                //#endif
-            }
-            if (nbrs_end[i] > nbrs_start[i+1]) {
-                fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d",
-                        nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]);
-                exit( INSUFFICIENT_SPACE );
-            }
-        }
-        if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) {
-            dev_workspace->realloc.num_far = device_nbrs;
-            //#ifdef __CUDA_MEM__
-            //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
-            //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n"
-            //                    , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs);
-            //#endif
-        }
-        if (nbrs_end[i] > far_nbrs->num_intrs) {
-            fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d",
-                    nbrs_end[i], far_nbrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
-        }
-        free (nbrs_start);
-        free (nbrs_end);
-    }
-    //Code not used anymore
-#if defined DONE
-    void Choose_Neighbor_Finder( reax_system *system, control_params *control, 
-            get_far_neighbors_function *Get_Far_Neighbors )
-    {
-        if( control->periodic_boundaries )
-        {
-            if( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
-                    system->box.box_norms[1] > 2.0 * control->vlist_cut &&
-                    system->box.box_norms[2] > 2.0 * control->vlist_cut )
-                (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box;
-            else  (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
-        }
-        else
-            (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors;
-    }
-    int compare_near_nbrs(const void *v1, const void *v2)
-    {
-        return ((*(near_neighbor_data *)v1).nbr - (*(near_neighbor_data *)v2).nbr);
-    }
-    int compare_far_nbrs(const void *v1, const void *v2)
-    {
-        return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr);
-    }
-    inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C,
-            rvec dvec, ivec rel_box/*, rvec ext_factor*/ )
-    {
-        dest->nbr = nbr;
-        dest->d = d;
-        rvec_Scale( dest->dvec, C, dvec );
-        ivec_Copy( dest->rel_box, rel_box );
-        // rvec_Scale( dest->ext_factor, C, ext_factor );
-    }
-    inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C,
-            rvec dvec, ivec rel_box/*, rvec ext_factor*/)
-    {
-        dest->nbr = nbr;
-        dest->d = d;
-        rvec_Scale( dest->dvec, C, dvec );
-        ivec_Scale( dest->rel_box, C, rel_box );
-        // rvec_Scale( dest->ext_factor, C, ext_factor );
-    }
-    /* In case bond restrictions are applied, this method checks if
-       atom1 and atom2 are allowed to bond with each other */
-    inline int can_Bond( static_storage *workspace, int atom1, int atom2 )
-    {
-        int i;
-        // fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 );
-        if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
-            return 1;
-        for( i = 0; i < workspace->restricted[ atom1 ]; ++i )
-            if( workspace->restricted_list[ atom1 ][i] == atom2 )
-                return 1;
-        for( i = 0; i < workspace->restricted[ atom2 ]; ++i )
-            if( workspace->restricted_list[ atom2 ][i] == atom1 )
-                return 1;
-        return 0;
-    }
-    /* check if atom2 is on atom1's near neighbor list */
-    inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 )
-    {
-        int i;
-        for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i )
-            if( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
-            {
-                // fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 );
-                return 1;
-            }
-        return 0;
-    }
-    void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
-            simulation_data *data, static_storage *workspace,
-            list **lists, output_controls *out_control )
-    {
-        int  i, j, k;
-        int  x, y, z;
-        int  *nbr_atoms;
-        int  atom1, atom2, max;
-        int   num_far;
-        int   c, count;
-        int   grid_top;
-        grid *g = &( system->g );  
-        list *far_nbrs = (*lists) + FAR_NBRS;
-        //int   hb_type1, hb_type2;
-        //list *hbonds = (*lists) + HBOND;
-        //int   top_hbond1, top_hbond2;
-        get_far_neighbors_function Get_Far_Neighbors;
-        far_neighbor_data new_nbrs[125];
-        int   l, m;
-        // fprintf( stderr, "\n\tentered nbrs - " );
-        if( control->ensemble == iNPT || control->ensemble == sNPT || 
-                control->ensemble == NPT )
-            Update_Grid( system );
-        // fprintf( stderr, "grid updated - " );
-        Bin_Atoms( system, out_control );
-        // fprintf( stderr, "atoms sorted - " );
-        Cluster_Atoms( system, workspace );
-        // fprintf( stderr, "atoms clustered - " );
-        Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-        // fprintf( stderr, "function chosen - " );  
-        Reset_Neighbor_Lists( system, workspace, lists );  
-        // fprintf( stderr, "lists cleared - " );
-        num_far = 0;
-        num_near = 0;
-        c = 0;
-        /* first pick up a cell in the grid */
-        for( i = 0; i < g->ncell[0]; i++ )
-            for( j = 0; j < g->ncell[1]; j++ )
-                for( k = 0; k < g->ncell[2]; k++ ) {
-                    nbrs = g->nbrs[i][j][k];
-                    nbrs_cp = g->nbrs_cp[i][j][k];
-                    /* pick up an atom from the current cell */
-                    //#ifdef REORDER_ATOMS
-                    //  for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++)
-                    //#else
-                    for(l = 0; l < g->top[i][j][k]; ++l ){
-                        atom1 = g->atoms[i][j][k][l];
-                        Set_End_Index( atom1, num_far, far_nbrs );
-                        // fprintf( stderr, "atom %d:\n", atom1 );
-                        itr = 0;
-                        while( nbrs[itr][0] > 0 ){
-                            x = nbrs[itr][0];
-                            y = nbrs[itr][1];
-                            z = nbrs[itr][2];
-                            // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                            //     SQR(control->r_cut))     
-                            nbr_atoms = g->atoms[x][y][z];
-                            max_atoms = g->top[x][y][z];
-                            /* pick up another atom from the neighbor cell -
-                               we have to compare atom1 with its own periodic images as well, 
-                               that's why there is also equality in the if stmt below */
-                            //#ifdef REORDER_ATOMS
-                            //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++)
-                            //#else
-                            for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-                                if( atom1 >= atom2 ) {
-                                    //fprintf( stderr, "\tatom2 %d", atom2 );
-                                    //top_near1 = End_Index( atom1, near_nbrs );
-                                    //Set_Start_Index( atom1, num_far, far_nbrs );
-                                    //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond;
-                                    Get_Far_Neighbors( system->atoms[atom1].x,
-                                            system->atoms[atom2].x, 
-                                            &(system->box), control, new_nbrs, &count );
-                                    fprintf( stderr, "\t%d count:%d\n", atom2, count );
-                                    for( c = 0; c < count; ++c )
-                                        if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
-                                            Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-                                                    atom2, new_nbrs[c].d, 1.0, 
-                                                    new_nbrs[c].dvec, new_nbrs[c].rel_box );
-                                            ++num_far;
-                                            /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-                                              atom1, atom2, new_nbrs[c].d, 
-                                              new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
-                                              new_nbrs[c].dvec[2] ); */
-                                            /* hydrogen bond lists */ 
-                                            /*if( control->hb_cut > 0.1 && 
-                                              new_nbrs[c].d <= control->hb_cut ) {
-                                            // fprintf( stderr, "%d %d\n", atom1, atom2 );
-                                            hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
-                                            if( hb_type1 == 1 && hb_type2 == 2 ) {
-                                            top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
-                                            Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
-                                            atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
-                                            new_nbrs[c].rel_box );
-                                            Set_End_Index( workspace->hbond_index[atom1], 
-                                            top_hbond1 + 1, hbonds );
-                                            }
-                                            else if( hb_type1 == 2 && hb_type2 == 1 ) {
-                                            top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
-                                            Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
-                                            atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, 
-                                            new_nbrs[c].rel_box );
-                                            Set_End_Index( workspace->hbond_index[atom2], 
-                                            top_hbond2 + 1, hbonds );
-                                            }*/
-                                        }
-                                        }
-                                }
-                            Set_End_Index( atom1, top_far1, far_nbrs );
-                        }
-                    }
-                    fprintf( stderr, "nbrs done-" );
-                    /* apply restrictions on near neighbors only */
-                    if( (data->step - data->prev_steps) < control->restrict_bonds ) {
-                        for( atom1 = 0; atom1 < system->N; ++atom1 )
-                            if( workspace->restricted[ atom1 ] ) {
-                                // fprintf( stderr, "atom1: %d\n", atom1 );
-                                top_near1 = End_Index( atom1, near_nbrs );
-                                for( j = 0; j < workspace->restricted[ atom1 ]; ++j )
-                                    if(!is_Near_Neighbor(near_nbrs, atom1, 
-                                                atom2 = workspace->restricted_list[atom1][j])) {
-                                        fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n",
-                                                atom1, atom2 );
-                                        top_near2 = End_Index( atom2, near_nbrs );          
-                                        /* we just would like to get the nearest image, so a call to 
-                                           Get_Periodic_Far_Neighbors_Big_Box is good enough. */
-                                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, 
-                                                system->atoms[ atom2 ].x, 
-                                                &(system->box), control, 
-                                                new_nbrs, &count );
-                                        Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]),
-                                                atom2, new_nbrs[c].d, 1.0, 
-                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
-                                        ++top_near1;
-                                        Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]),
-                                                atom1, new_nbrs[c].d, -1.0, 
-                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
-                                        Set_End_Index( atom2, top_near2+1, near_nbrs );
-                                    }
-                                Set_End_Index( atom1, top_near1, near_nbrs );
-                            }
-                    }
-                    // fprintf( stderr, "restrictions applied-" );
-                    /* verify nbrlists, count num_intrs, sort nearnbrs */
-                    near_nbrs->num_intrs = 0;
-                    far_nbrs->num_intrs = 0;
-                    for( i = 0; i < system->N-1; ++i ) {
-                        if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) {
-                            fprintf( stderr, 
-                                    "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
-                                    data->step, i+1, i );
-                            exit( 1 );
-                        }
-                        near_nbrs->num_intrs += Num_Entries(i, near_nbrs);
-                        if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) {
-                            fprintf( stderr, 
-                                    "step%3d: farnbr list of atom%d is overwritten by atom%d\n", 
-                                    data->step, i+1, i );
-                            exit( 1 );
-                        }
-                        far_nbrs->num_intrs += Num_Entries(i, far_nbrs);
-                    }
-                    for( i = 0; i < system->N; ++i ) {
-                        qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]),
-                                Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), 
-                                compare_near_nbrs );
-                    }
-                    // fprintf( stderr, "near nbrs sorted\n" );
-                    /* for( i = 0; i < system->N; ++i ) {
-                       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                       compare_far_nbrs ); 
-                       } */
-                    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", 
-                            num_near / system->N );
-                    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", 
-                            num_far / system->N, control->max_far_nbrs );
-                    //fprintf( stderr, "step%d: num of nearnbrs = %6d   num of farnbrs: %6d\n",
-                    //       data->step, num_near, num_far );
-                    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n", 
-                    //   system->N * near_nbrs->intrs_per_unit, 
-                    //   system->N * far_nbrs->intrs_per_unit );
-                }
-        void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
-                simulation_data *data, static_storage *workspace,
-                list **lists, output_controls *out_control )
-        {
-            int  i, j, k, l, m, itr;
-            int  x, y, z;
-            int  atom1, atom2, max;
-            int  num_far, c, count;
-            int  *nbr_atoms;
-            ivec *nbrs;
-            rvec *nbrs_cp;
-            grid *g;
-            list *far_nbrs;
-            get_far_neighbors_function Get_Far_Neighbors;
-            far_neighbor_data new_nbrs[125];
-            g = &( system->g );
-            far_nbrs = (*lists) + FAR_NBRS;
-            // fprintf( stderr, "\n\tentered nbrs - " );
-            if( control->ensemble == iNPT || 
-                    control->ensemble == sNPT || 
-                    control->ensemble == NPT )
-                Update_Grid( system );
-            // fprintf( stderr, "grid updated - " );
-            Bin_Atoms( system, out_control );
-            // fprintf( stderr, "atoms sorted - " );
-            Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-            // fprintf( stderr, "function chosen - " );  
-            Reset_Neighbor_Lists( system, workspace, lists );  
-            // fprintf( stderr, "lists cleared - " );
-            num_far = 0;
-            c = 0;
-            /* first pick up a cell in the grid */
-            for( i = 0; i < g->ncell[0]; i++ )
-                for( j = 0; j < g->ncell[1]; j++ )
-                    for( k = 0; k < g->ncell[2]; k++ ) {
-                        nbrs = g->nbrs[i][j][k];
-                        nbrs_cp = g->nbrs_cp[i][j][k];
-                        fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-                        /* pick up an atom from the current cell */
-                        for(l = 0; l < g->top[i][j][k]; ++l ){
-                            atom1 = g->atoms[i][j][k][l];
-                            Set_Start_Index( atom1, num_far, far_nbrs );
-                            fprintf( stderr, "\tatom %d\n", atom1 );
-                            itr = 0;
-                            while( nbrs[itr][0] > 0 ){
-                                x = nbrs[itr][0];
-                                y = nbrs[itr][1];
-                                z = nbrs[itr][2];
-                                fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
-                                // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-                                //     SQR(control->r_cut))     
-                                nbr_atoms = g->atoms[x][y][z];
-                                max = g->top[x][y][z];
-                                fprintf( stderr, "\t\tmax: %d\n", max );
-                                /* pick up another atom from the neighbor cell -
-                                   we have to compare atom1 with its own periodic images as well, 
-                                   that's why there is also equality in the if stmt below */
-                                for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-                                    if( atom1 >= atom2 ) {
-                                        Get_Far_Neighbors( system->atoms[atom1].x,
-                                                system->atoms[atom2].x, 
-                                                &(system->box), control, new_nbrs, &count );
-                                        fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count );
-                                        for( c = 0; c < count; ++c )
-                                            if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
-                                                Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-                                                        atom2, new_nbrs[c].d, 1.0, 
-                                                        new_nbrs[c].dvec, new_nbrs[c].rel_box );
-                                                ++num_far;
-                                                /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-                                                  atom1, atom2, new_nbrs[c].d, 
-                                                  new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
-                                                  new_nbrs[c].dvec[2] ); */
-                                            }
-                                    }
-                                ++itr;
-                            }
-                            Set_End_Index( atom1, num_far, far_nbrs );
-                        }
-                    }
-            far_nbrs->num_intrs = num_far;  
-            fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
-#if defined(DEBUG)
-            for( i = 0; i < system->N; ++i ) {
-                qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-                        Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-                        compare_far_nbrs ); 
-            }
-            fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far );
-            fprintf( stderr, "\tallocated farnbrs: %6d\n", 
-                    system->N * far_nbrs->intrs_per_unit );
-        }
diff --git a/PuReMD-GPU/src/neighbors.h b/PuReMD-GPU/src/neighbors.h
index 465d61de9775bdc20130d5f69f537031aaa98ff8..64c14ad29d5194006aacb057a7d80ef54aeee8e4 100644
--- a/PuReMD-GPU/src/neighbors.h
+++ b/PuReMD-GPU/src/neighbors.h
@@ -23,25 +23,35 @@
 #include "mytypes.h"
 void Generate_Neighbor_Lists( reax_system*, control_params*, simulation_data*,
-                              static_storage*, list**, output_controls* );
-void Cuda_Generate_Neighbor_Lists (reax_system *system,
-                                   static_storage *workspace, control_params *control, bool);
+   static_storage*, list**, output_controls* );
 int Estimate_NumNeighbors( reax_system*, control_params*,
-                           static_storage*, list** );
+   static_storage*, list** );
+int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
+static inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize)
+    return x * 8 * 8 * blocksize +  
+        y * 8 * blocksize +  
+        z * blocksize ;
-HOST_DEVICE int Are_Far_Neighbors( rvec, rvec, simulation_box*, real, far_neighbor_data* );
+static inline HOST_DEVICE real DistSqr_to_CP( rvec cp, rvec x )
+    int  i;
+    real d_sqr = 0;
-GLOBAL void Estimate_NumNeighbors ( reax_atom *, grid , simulation_box *, control_params *, int *);
-GLOBAL void Generate_Neighbor_Lists( reax_atom *, grid , simulation_box *, control_params *, list );
+    for( i = 0; i < 3; ++i )
+        if( cp[i] > NEG_INF )
+            d_sqr += SQR( cp[i] - x[i] );
-GLOBAL void Estimate_NumNeighbors ( reax_atom *,
-                                    grid ,
-                                    simulation_box *,
-                                    control_params *,
-                                    int *, int *, int, int , int, int);
-GLOBAL void fix_sym_indices_far_nbrs (list , int );
+    return d_sqr;
diff --git a/PuReMD-GPU/src/param.h b/PuReMD-GPU/src/param.h
index f8101896932a4f06a53d3d99c6b18a5be1710078..2b24b056983233840966a8de29ce902ca6beb981 100644
--- a/PuReMD-GPU/src/param.h
+++ b/PuReMD-GPU/src/param.h
@@ -27,13 +27,15 @@
 #define MAX_TOKENS 20
 #define MAX_TOKEN_LEN 1024
-int  Get_Atom_Type( reax_interaction*, char* );
-int  Tokenize( char*, char*** );
+int Get_Atom_Type( reax_interaction*, char* );
+int Tokenize( char*, char*** );
 char Read_Force_Field( FILE*, reax_interaction* );
 char Read_Control_File( FILE*, reax_system*, control_params*,
-                        output_controls* );
+        output_controls* );
diff --git a/PuReMD-GPU/src/random.h b/PuReMD-GPU/src/random.h
index f7edb397293676c6e5a7a7d34ba5d12b8f3dab4a..b19bc58e3dcef04a324b108be718bfbff3e5c06c 100644
--- a/PuReMD-GPU/src/random.h
+++ b/PuReMD-GPU/src/random.h
@@ -23,31 +23,29 @@
 #include "mytypes.h"
-HOST_DEVICE inline double Random(double);
-HOST_DEVICE inline void Randomize();
-HOST_DEVICE inline double GRandom(double , double );
 /* System random number generator used linear congruance method with
    large periodicity for generation of pseudo random number. function
    Random returns this random number appropriately scaled so that
    0 <= Random(range) < range */
-HOST_DEVICE inline double Random(double range)
+static inline HOST_DEVICE double Random(double range)
     return (random() * range) / 2147483647L;
 /* This function seeds the system pseudo random number generator with
    current time. Use this function once in the begining to initialize
    the system */
-HOST_DEVICE inline void Randomize()
+static inline HOST_DEVICE void Randomize( )
-    srandom(time(NULL));
+    srandom( time(NULL) );
 /* GRandom return random number with gaussian distribution with mean
    and standard deviation "sigma" */
-HOST_DEVICE inline double GRandom(double mean, double sigma)
+static inline HOST_DEVICE double GRandom(double mean, double sigma)
     double v1 = Random(2.0) - 1.0;
     double v2 = Random(2.0) - 1.0;
@@ -63,4 +61,5 @@ HOST_DEVICE inline double GRandom(double mean, double sigma)
     return mean + v1 * sigma * sqrt(-2.0 * log(rsq) / rsq);
diff --git a/PuReMD-GPU/src/reset_utils.c b/PuReMD-GPU/src/reset_utils.c
new file mode 100644
index 0000000000000000000000000000000000000000..f79596aa9d29a65f673448d18a28c73c00444e43
--- /dev/null
+++ b/PuReMD-GPU/src/reset_utils.c
@@ -0,0 +1,162 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "reset_utils.h"
+#include "list.h"
+#include "vector.h"
+void Reset_Atoms( reax_system* system )
+    int i;
+    for( i = 0; i < system->N; ++i )
+        memset( system->atoms[i].f, 0.0, RVEC_SIZE );
+void Reset_Pressures( simulation_data *data )
+    rtensor_MakeZero( data->flex_bar.P );  
+    data->iso_bar.P = 0;
+    rvec_MakeZero( data->int_press );
+    rvec_MakeZero( data->ext_press );
+    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
+void Reset_Simulation_Data( simulation_data* data )
+    data->E_BE = 0;
+    data->E_Ov = 0;
+    data->E_Un = 0;
+    data->E_Lp = 0;
+    data->E_Ang = 0;
+    data->E_Pen = 0;
+    data->E_Coa = 0;
+    data->E_HB = 0;
+    data->E_Tor = 0;
+    data->E_Con = 0;
+    data->E_vdW = 0;
+    data->E_Ele = 0;
+    data->E_Kin = 0;
+void Reset_Test_Forces( reax_system *system, static_storage *workspace )
+    memset( workspace->f_ele, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_vdw, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_bo, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_be, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_lp, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_ov, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_un, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_ang, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_coa, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_pen, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_hb, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_tor, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_con, 0, system->N * sizeof(rvec) );
+void Reset_Workspace( reax_system *system, static_storage *workspace )
+    memset( workspace->total_bond_order, 0, system->N * sizeof( real ) );
+    memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) );
+    memset( workspace->CdDelta, 0, system->N * sizeof( real ) );
+    //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) );
+    memset( workspace->dDelta, 0, sizeof(rvec) * system->N );
+    Reset_Test_Forces( system, workspace );
+void Reset_Neighbor_Lists( reax_system *system, control_params *control, 
+        static_storage *workspace, list **lists )
+    int i, tmp;
+    list *bonds = (*lists) + BONDS;
+    list *hbonds = (*lists) + HBONDS;
+    for( i = 0; i < system->N; ++i ) {
+        tmp = Start_Index( i, bonds );
+        Set_End_Index( i, tmp, bonds );
+    }
+    //TODO check if this is needed
+    memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
+    if( control->hb_cut > 0 )
+        for( i = 0; i < system->N; ++i )
+            if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) {
+                tmp = Start_Index( workspace->hbond_index[i], hbonds );
+                Set_End_Index( workspace->hbond_index[i], tmp, hbonds );
+                /* fprintf( stderr, "i:%d, hbond: %d-%d\n", 
+                   i, Start_Index( workspace->hbond_index[i], hbonds ), 
+                   End_Index( workspace->hbond_index[i], hbonds ) );*/
+            }
+void Reset( reax_system *system, control_params *control,  
+        simulation_data *data, static_storage *workspace, list **lists  )
+    Reset_Atoms( system );
+    Reset_Simulation_Data( data );
+    if( control->ensemble == NPT || control->ensemble == sNPT || 
+            control->ensemble == iNPT )
+        Reset_Pressures( data );
+    Reset_Workspace( system, workspace );  
+    Reset_Neighbor_Lists( system, control, workspace, lists );
+#if defined(DEBUG_FOCUS)  
+    fprintf( stderr, "reset - ");
+void Reset_Grid( grid *g )
+    memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]);
+void Reset_Marks( grid *g, ivec *grid_stack, int grid_top )
+    int i;
+    for( i = 0; i < grid_top; ++i )
+        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + 
+            grid_stack[i][1] * g->ncell[2] + 
+            grid_stack[i][2]] = 0;
diff --git a/PuReMD-GPU/src/reset_utils.h b/PuReMD-GPU/src/reset_utils.h
index 7fb318e8e6b9fec7941ea01f1102886aa7425e3e..190bd7f5632f3b0a2b3291edf82e03aa4922793e 100644
--- a/PuReMD-GPU/src/reset_utils.h
+++ b/PuReMD-GPU/src/reset_utils.h
@@ -23,6 +23,11 @@
 #include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
 void Reset_Atoms( reax_system* );
 void Reset_Pressures( simulation_data* );
@@ -36,10 +41,10 @@ void Reset_Test_Forces( reax_system*, static_storage* );
 void Reset_Workspace( reax_system*, static_storage* );
 void Reset_Neighbor_Lists( reax_system*, control_params*,
-                           static_storage*, list** );
+        static_storage*, list** );
 void Reset( reax_system*, control_params*, simulation_data*,
-            static_storage*, list** );
+        static_storage*, list** );
 //void Reset_Neighbor_Lists( reax_system*, static_storage*, list** );
@@ -47,12 +52,9 @@ void Reset_Grid( grid* );
 void Reset_Marks( grid*, ivec*, int );
-void Cuda_Reset_Grid( grid* );
+#ifdef __cplusplus
-//CUDA functions
-void Cuda_Reset_Workspace (reax_system *, static_storage *);
-void Cuda_Reset( reax_system*, control_params*, simulation_data*,
-                 static_storage*, list** );
-void Cuda_Reset_Atoms (reax_system *);
diff --git a/PuReMD-GPU/src/single_body_interactions.c b/PuReMD-GPU/src/single_body_interactions.c
new file mode 100644
index 0000000000000000000000000000000000000000..b26f493e703819f066389991a4845acab113b326
--- /dev/null
+++ b/PuReMD-GPU/src/single_body_interactions.c
@@ -0,0 +1,314 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "single_body_interactions.h"
+#include "bond_orders.h"
+#include "index_utils.h"
+#include "list.h"
+#include "lookup.h"
+#include "vector.h"
+void LonePair_OverUnder_Coordination_Energy( reax_system *system, 
+        control_params *control, 
+        simulation_data *data,
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control ) /* Adds the lone-pair (data->E_Lp), over-coordination (data->E_Ov)
+         * and under-coordination (data->E_Un) energies of all system->N atoms, and accumulates the
+         * matching derivative coefficients in workspace->CdDelta[] and in the Cdbo / Cdbopi / Cdbopi2
+         * fields of each bond. Two passes over the atoms: lone pair (with the C2 correction) first,
+         * then over-/under-coordination. The energy totals are only incremented here, never zeroed.
+         * `control` is not referenced in the lines visible here. */
+    int i, j, pj, type_i, type_j;
+    real Delta_lpcorr, dfvl;
+    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+    real exp_ovun2n, exp_ovun6, exp_ovun8;
+    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+    real p_lp1, p_lp2, p_lp3;
+    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond;
+    bond_order_data *bo_ij; 
+    list *bonds = (*lists) + BONDS;
+    /* Initialize parameters: global force-field constants taken from reaxprm.gp.l[] */
+    p_lp1 = system->reaxprm.gp.l[15]; /* NOTE(review): p_lp1 is not read in the lines visible here */
+    p_lp3 = system->reaxprm.gp.l[5];  /* same gp.l[5] entry that gates the C2 correction below */
+    p_ovun3 = system->reaxprm.gp.l[32];
+    p_ovun4 = system->reaxprm.gp.l[31];
+    p_ovun6 = system->reaxprm.gp.l[6];
+    p_ovun7 = system->reaxprm.gp.l[8];
+    p_ovun8 = system->reaxprm.gp.l[9];
+    for( i = 0; i < system->N; ++i ) { /* pass 1: lone-pair energy and C2 correction */
+        /* set the parameter pointer */
+        type_i = system->atoms[i].type;
+        sbp_i = &(system->reaxprm.sbp[ type_i ]);
+        /* lone-pair energy: e_lp = p_lp2 * Delta_lp / (1 + exp(-75 * Delta_lp)) */
+        p_lp2 = sbp_i->p_lp2;      
+        expvd2 = EXP( -75 * workspace->Delta_lp[i] );
+        inv_expvd2 = 1. / (1. + expvd2 );
+        /* calculate the energy */
+        data->E_Lp += e_lp = 
+            p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+        dElp = p_lp2 * inv_expvd2 + 
+            75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2); /* d(e_lp) / d(Delta_lp) */
+        CElp = dElp * workspace->dDelta_lp[i]; /* scaled by the stored dDelta_lp factor */
+        workspace->CdDelta[i] += CElp;      // lp - 1st term
+        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
+                p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp ); /* NOTE(review): no #ifdef guard is visible around these elp prints and the Add_dDelta call in this view -- confirm they are test-build only */
+        fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
+                workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
+        Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
+        /* correction for C2: only when gp.l[5] > 0.001, atom i has type name "C",
+           and the bonded neighbor j also has type name "C" */
+        if( system->reaxprm.gp.l[5] > 0.001 && 
+                !strcmp( system->reaxprm.sbp[type_i].name, "C" ) )
+            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+                if( i < bonds->select.bond_list[pj].nbr ) { /* only neighbors with a larger index than i */
+                    j = bonds->select.bond_list[pj].nbr;
+                    type_j = system->atoms[j].type;
+                    if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) {
+                        twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]); /* NOTE(review): twbp is not read inside this C2 branch in the lines visible here */
+                        bo_ij = &( bonds->select.bond_list[pj].bo_data );
+                        Di = workspace->Delta[i];
+                        vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+                        if( vov3 > 3. ) { /* quadratic penalty p_lp3 * (vov3 - 3)^2 applies only above 3 */
+                            data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0);
+                            //estrain(i) += e_lph;
+                            deahu2dbo = 2.*p_lp3*(vov3 - 3.); /* d(e_lph) / d(BO) */
+                            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.)); /* d(e_lph) / d(Di); 0.16 = 4 * 0.040 */
+                            bo_ij->Cdbo += deahu2dbo;
+                            workspace->CdDelta[i] += deahu2dsbo;
+                            fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
+                                    // workspace->orig_id[i], workspace->orig_id[j],
+                                    i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
+                            Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
+                            Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
+                        }
+                    }
+                }
+    }
+    for( i = 0; i < system->N; ++i ) { /* pass 2: over- and under-coordination */
+        type_i = system->atoms[i].type;
+        sbp_i = &(system->reaxprm.sbp[ type_i ]);
+        /* over-coordination energy: e_ov = sum_ovun1 * CEover1 */
+        if( sbp_i->mass > 21.0 ) 
+            dfvl = 0.0;
+        else dfvl = 1.0; // only for 1st-row elements
+        p_ovun2 = sbp_i->p_ovun2;
+        sum_ovun1 = 0;
+        sum_ovun2 = 0;
+        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) { /* per-bond sums over the bonds of atom i */
+            j = bonds->select.bond_list[pj].nbr;
+            type_j = system->atoms[j].type;      
+            bo_ij = &(bonds->select.bond_list[pj].bo_data);
+            sbp_j = &(system->reaxprm.sbp[ type_j ]); /* referenced only by the commented-out print below */
+            twbp = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
+            sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
+            sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
+                ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
+              i+1, j+1, 
+              dfvl * workspace->Delta_lp_temp[j],
+              sbp_j->nlp_opt,
+              workspace->nlp_temp[j] );*/
+        }
+        exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
+        inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
+        Delta_lpcorr  = workspace->Delta[i] - 
+            (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1; /* Delta[i] reduced by the damped lone-pair term */
+        exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
+        inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
+        DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 ); /* presumably the 1e-8 keeps the denominator off exact zero -- TODO confirm */
+        CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
+        data->E_Ov += e_ov = sum_ovun1 * CEover1;
+        CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
+            ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
+        CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
+        CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
+            p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
+        /* under-coordination potential:
+           e_un = -p_ovun5 * (1 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8 */
+        p_ovun2 = sbp_i->p_ovun2; /* same value as assigned earlier in this iteration */
+        p_ovun5 = sbp_i->p_ovun5;
+        exp_ovun2n = 1.0 / exp_ovun2; /* i.e. exp(-p_ovun2 * Delta_lpcorr) */
+        exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
+        exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
+        inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
+        inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
+        data->E_Un += e_un =
+            -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+        CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
+                p_ovun2 * e_un * exp_ovun2n);
+        CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
+        CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
+        CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+            p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
+        //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
+        //       i+1, sum_ovun2, e_ov, e_un );
+        /* forces: fold the coefficients into CdDelta[] and the per-bond Cdbo / Cdbopi / Cdbopi2 */
+        workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
+        workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
+        Add_dDelta( system, lists, i, CEover3, workspace->f_ov );  // OvCoor - 2nd
+        Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
+        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ /* second sweep over the bonds of atom i */
+            pbond = &(bonds->select.bond_list[pj]);
+            j = pbond->nbr;
+            type_j = system->atoms[j].type;
+            bo_ij = &(pbond->bo_data);
+            twbp  = &(system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ]);
+            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
+            workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+                (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
+            bo_ij->Cdbopi += CEover4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+            bo_ij->Cdbopi2 += CEover4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+            workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) *
+                (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
+            bo_ij->Cdbopi += CEunder4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+            bo_ij->Cdbopi2 += CEunder4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+            /* fprintf( out_control->eov, "%6d%23.15e%23.15e"
+               workspace->orig_id[j]+1,
+            //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
+            CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
+            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
+              workspace->orig_id[j]+1, 
+              CEover4,
+              CEover4*
+              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+              CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
+              (1.0 - dfvl*workspace->dDelta_lp[j]),
+              CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            /* fprintf( out_control->eun, "%6d%23.15e\n",
+               workspace->orig_id[j]+1, CEunder3 ); */
+            /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
+              workspace->orig_id[j]+1,
+              CEunder4,
+              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+              CEunder4*
+              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+              CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
+                    workspace->f_ov ); // OvCoor - 1st term
+            Add_dDelta( system, lists, j,
+                    CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+                    (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
+            Add_dBOpinpi2( system, lists, i, pj, 
+                    CEover4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    CEover4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
+            Add_dDelta( system, lists, j,
+                    CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+                    (bo_ij->BO_pi + bo_ij->BO_pi2),
+                    workspace->f_un ); // UnCoor - 2a
+            Add_dBOpinpi2( system, lists, i, pj, 
+                    CEunder4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    CEunder4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    workspace->f_un, workspace->f_un ); // UnCoor - 2b
+        }
+#ifdef TEST_ENERGY      
+        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", 
+                i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); 
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
+                i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
+                i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
+    }
diff --git a/PuReMD-GPU/src/single_body_interactions.h b/PuReMD-GPU/src/single_body_interactions.h
index dd26679755915e5faeecb1c3da90e81ff686132f..5ebe03b85785e956074423a17f37494f1c6ae36c 100644
--- a/PuReMD-GPU/src/single_body_interactions.h
+++ b/PuReMD-GPU/src/single_body_interactions.h
@@ -21,33 +21,13 @@
-#include <mytypes.h>
+#include "mytypes.h"
 void LonePair_OverUnder_Coordination_Energy( reax_system*, control_params*,
         simulation_data*, static_storage*,
         list**, output_controls* );
-//CUDA Functions...
-GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int );
-GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int,
-        real *, real *, real *);
-GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *, global_parameters ,
-        single_body_parameters *, two_body_parameters *,
-        static_storage , simulation_data *,
-        list , int , int,
-        real *, real *, real *);
-GLOBAL void test_LonePair_Postprocess ( reax_atom *, global_parameters ,
-                                        single_body_parameters *, two_body_parameters *,
-                                        static_storage , simulation_data *,
-                                        list , int , int );
diff --git a/PuReMD-GPU/src/sort.h b/PuReMD-GPU/src/sort.h
index 1ccd7116ea529a817496d1265e83b6d81d186430..11bb61288384191a057e58788b03883a7703e5a1 100644
--- a/PuReMD-GPU/src/sort.h
+++ b/PuReMD-GPU/src/sort.h
@@ -23,14 +23,16 @@
 #include "mytypes.h"
-HOST_DEVICE inline void h_swap(sparse_matrix_entry *array, int index1, int index2)
+static inline HOST_DEVICE void h_swap(sparse_matrix_entry *array, int index1, int index2)
     sparse_matrix_entry temp = array[index1];
     array[index1] = array[index2];
     array[index2] = temp;
-HOST_DEVICE inline void h_quick_sort(sparse_matrix_entry *array, int start, int end)
+static inline HOST_DEVICE void h_quick_sort(sparse_matrix_entry *array, int start, int end)
     int i = start;
     int k = end;
@@ -51,14 +53,16 @@ HOST_DEVICE inline void h_quick_sort(sparse_matrix_entry *array, int start, int
-inline void d_swap(sparse_matrix_entry *array, int index1, int index2)
+static inline void d_swap(sparse_matrix_entry *array, int index1, int index2)
     sparse_matrix_entry temp = array[index1];
     array[index1] = array[index2];
     array[index2] = temp;
-inline void d_quick_sort(sparse_matrix_entry *array, int start, int end)
+static inline void d_quick_sort(sparse_matrix_entry *array, int start, int end)
     int i = start;
     int k = end;
@@ -82,5 +86,4 @@ inline void d_quick_sort(sparse_matrix_entry *array, int start, int end)
diff --git a/PuReMD-GPU/src/system_props.c b/PuReMD-GPU/src/system_props.c
new file mode 100644
index 0000000000000000000000000000000000000000..0126b86b776dce8fd30aea0c228731b95104b216
--- /dev/null
+++ b/PuReMD-GPU/src/system_props.c
@@ -0,0 +1,348 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "system_props.h"
+#include "box.h"
+#include "vector.h"
+HOST real Get_Time( ) /* Returns the current gettimeofday() reading in seconds, as a real. */
+    struct timeval tim;
+    gettimeofday(&tim, NULL ); /* return value is not checked */
+    return( tim.tv_sec + (tim.tv_usec / 1000000.0) ); /* whole seconds plus microseconds scaled to seconds */
+HOST real Get_Timing_Info( real t_start ) /* Returns the seconds elapsed since t_start, where t_start is a
+         * timestamp on the same gettimeofday()-based scale that Get_Time() returns. */
+    struct timeval tim;
+    real t_end;
+    gettimeofday(&tim, NULL );
+    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0); /* "now", in seconds */
+    return (t_end - t_start);
+void Temperature_Control( control_params *control, simulation_data *data, 
+        output_controls *out_control ) /* Moves control->T towards control->T_final according to
+         * control->T_mode (1: step-wise, 2: constant slope). No other mode is handled in the lines
+         * visible here, and out_control is not referenced in them. */
+    real tmp;
+    if( control->T_mode == 1 ) { // step-wise temperature control
+        if( (data->step - data->prev_steps) % 
+                ((int)(control->T_freq / control->dt)) == 0 ) { /* NOTE(review): the divisor truncates to 0 when T_freq < dt -- confirm the inputs rule that out */
+            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+                control->T += control->T_rate;
+            else control->T = control->T_final;     /* closer than one T_rate step: land exactly on the target */
+        }
+    }
+    else if( control->T_mode == 2 ) { // constant slope control
+        tmp = control->T_rate * control->dt / control->T_freq; /* per-call increment */
+        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+            control->T += tmp;       /* no else branch is visible here: T is left as-is once within |tmp| of T_final */
+    }
+void Compute_Total_Mass( reax_system *system, simulation_data *data ) /* Sets data->M to the sum of the
+         * per-type masses of all system->N atoms and data->inv_M to its reciprocal. */
+    int i;
+    int blocks; /* NOTE(review): blocks, block_size and partial_sums are not read in the lines visible here */
+    int block_size;
+    real    *partial_sums = 0;
+    data->M = 0;
+    for( i = 0; i < system->N; i++ ) 
+        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
+    data->inv_M = 1. / data->M;    /* NOTE(review): divides by zero if the summed mass is 0 (e.g. N == 0) */
+void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
+        FILE *fout ) /* Fills data->xcm, vcm, amcm and avcm (center-of-mass position, velocity, angular
+         * momentum and angular velocity) plus data->etran_cm and data->erot_cm. Reads data->M and
+         * data->inv_M, so the total mass must already be set. fout appears only inside
+         * commented-out prints in the lines visible here. */
+    int i;
+    real m, xx, xy, xz, yy, yz, zz, det;
+    rvec tvec, diff;
+    rtensor mat, inv;
+    int blocks; /* NOTE(review): blocks, block_size, l_xcm, l_vcm, l_amcm, t_start and t_end are not read in the lines visible here */
+    int block_size;
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    real t_start, t_end;
+    rvec_MakeZero( data->xcm );  // position of CoM
+    rvec_MakeZero( data->vcm );  // velocity of CoM
+    rvec_MakeZero( data->amcm ); // angular momentum of CoM
+    rvec_MakeZero( data->avcm ); // angular velocity of CoM
+    /* Compute the position, velocity and angular momentum about the CoM:
+       first the mass-weighted sums over all atoms */
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+        rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
+        rvec_ScaledAdd( data->vcm, m, system->atoms[i].v );
+        rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
+        rvec_ScaledAdd( data->amcm, m, tvec );
+        /*fprintf( fout,"%3d  %g %g %g\n",
+          i+1, 
+          system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
+          fprintf( fout, "vcm:  %g %g %g\n", 
+          data->vcm[0], data->vcm[1], data->vcm[2] );  
+         */
+    }
+    rvec_Scale( data->xcm, data->inv_M, data->xcm ); /* weighted sum -> weighted average */
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec ); /* presumably subtracts M * (xcm x vcm) so amcm is taken about the CoM -- TODO confirm rvec_ScaledAdd semantics */
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+    /* Calculate and then invert the inertial tensor; the second moments below
+       are accumulated from positions relative to xcm */
+    xx = xy = xz = yy = yz = zz = 0;
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
+        xx += diff[0] * diff[0] * m;
+        xy += diff[0] * diff[1] * m;
+        xz += diff[0] * diff[2] * m;
+        yy += diff[1] * diff[1] * m;
+        yz += diff[1] * diff[2] * m;
+        zz += diff[2] * diff[2] * m;      
+    }
+#ifdef __DEBUG_CUDA__
+    fprintf (stderr, " xx: %f \n", xx);
+    fprintf (stderr, " xy: %f \n", xy);
+    fprintf (stderr, " xz: %f \n", xz);
+    fprintf (stderr, " yy: %f \n", yy);
+    fprintf (stderr, " yz: %f \n", yz);
+    fprintf (stderr, " zz: %f \n", zz);
+    mat[0][0] = yy + zz;     /* symmetric tensor: diagonal = sums of the other two second moments, */
+    mat[0][1] = mat[1][0] = -xy; /* off-diagonal = negated mixed moments */
+    mat[0][2] = mat[2][0] = -xz;
+    mat[1][1] = xx + zz;
+    mat[2][1] = mat[1][2] = -yz;
+    mat[2][2] = xx + yy;
+    /* invert the inertial tensor: 3x3 determinant, then cofactor entries scaled by 1/det */
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
+            mat[0][1] * mat[1][2] * mat[2][0] + 
+            mat[0][2] * mat[1][0] * mat[2][1] ) -
+        ( mat[0][0] * mat[1][2] * mat[2][1] + 
+          mat[0][1] * mat[1][0] * mat[2][2] + 
+          mat[0][2] * mat[1][1] * mat[2][0] );
+    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+    if( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1./det, inv );
+    else 
+        rtensor_MakeZero( inv ); /* (near-)singular tensor: a zero inverse is used instead of dividing by det */
+    /* Compute the angular velocity about the centre of mass */
+    rtensor_MatVec( data->avcm, inv, data->amcm );  
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+#if defined(DEBUG)
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
+            data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
+            data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
+            data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2], 
+       mat[1][0], mat[1][1], mat[1][2], 
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2], 
+       inv[1][0], inv[1][1], inv[1][2], 
+       inv[2][0], inv[2][1], inv[2][2] );
+       fflush( fout ); */
+    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
+            data->avcm[0], data->avcm[1], data->avcm[2] );
+void Compute_Kinetic_Energy( reax_system* system, simulation_data* data ) /* Recomputes data->E_Kin from
+         * the atom velocities and per-type masses, then derives data->therm.T from it. */
+    int i;
+    rvec p;
+    real m;
+    data->E_Kin = 0.0;
+    for (i=0; i < system->N; i++) {
+        m = system->reaxprm.sbp[system->atoms[i].type].mass;
+        rvec_Scale( p, m, system->atoms[i].v ); /* p = m * v */
+        data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v ); /* 0.5 * (m v) . v */
+        /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n",
+           i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
+           system->reaxprm.sbp[system->atoms[i].type].mass); */
+    }
+    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B); /* NOTE(review): no guard against data->N_f == 0 */
+    if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
+        data->therm.T = ALMOST_ZERO;
+/* Computes the per-direction total pressure data->tot_press[] and its
+ *  average data->iso_bar.P.
+ *
+ * IMPORTANT: This function assumes that the current kinetic energy and
+ *  the center of mass of the system have already been computed.
+ *
+ * IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs
+ *  to be added when there are long-range interactions or long-range
+ *  corrections to short-range interactions present.
+ *  We may want to add that for more accuracy.
+ */
+void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
+        simulation_data* data, 
+        output_controls *out_control )
+    int i;
+    reax_atom *p_atom;
+    rvec tx;
+    rvec tmp;
+    simulation_box *box = &(system->box);
+    /* Calculate internal pressure */
+    rvec_MakeZero( data->int_press );
+    // 0: both int and ext, 1: ext only, 2: int only
+    if( control->press_mode == 0 || control->press_mode == 2 ) {
+        for( i = 0; i < system->N; ++i ) {
+            p_atom = &( system->atoms[i] );
+            /* transform x into unitbox coordinates */
+            Transform_to_UnitBox( p_atom->x, box, 1, tx );
+            /* this atom's contribution to internal pressure */
+            rvec_Multiply( tmp, p_atom->f, tx );
+            rvec_Add( data->int_press, tmp );
+            if( out_control->debug_level > 0 ) { /* per-atom trace: position, force, running int_press */
+                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
+                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
+                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
+                        data->int_press[0],data->int_press[1],data->int_press[2]);
+            }
+        }
+    }
+    /* kinetic contribution */
+    data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
+    /* Calculate total pressure in each direction: the (int + ext) term of each
+       direction is divided by the product of the other two box norms */
+    data->tot_press[0] = data->kin_press - 
+        ((data->int_press[0] + data->ext_press[0]) /
+         (box->box_norms[1] * box->box_norms[2] * P_CONV));
+    data->tot_press[1] = data->kin_press - 
+        ((data->int_press[1] + data->ext_press[1])/
+         (box->box_norms[0] * box->box_norms[2] * P_CONV));
+    data->tot_press[2] = data->kin_press - 
+        ((data->int_press[2] + data->ext_press[2])/
+         (box->box_norms[0] * box->box_norms[1] * P_CONV));
+    /* Average pressure for the whole box */
+    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
+void Compute_Pressure_Isotropic_Klein( reax_system* system, 
+        simulation_data* data ) /* Sets data->iso_bar.P =
+         * ( 2 * E_Kin - F_CONV * sum_i f_i . (x_i - xcm) ) / ( 3 * box volume ). */
+    int i;
+    reax_atom *p_atom;
+    rvec dx;
+    // IMPORTANT: This function assumes that the current kinetic energy and
+    // the center of mass of the system have already been computed.
+    data->iso_bar.P = 2.0 * data->E_Kin;
+    for( i = 0; i < system->N; ++i )
+    {
+        p_atom = &( system->atoms[i] );
+        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm); /* atom position relative to xcm */
+        data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
+    }
+    data->iso_bar.P /= (3.0 * system->box.volume);
+    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
+    // to be added when there are long-range interactions or long-range 
+    // corrections to short-range interactions present.
+    // We may want to add that for more accuracy.
+void Compute_Pressure( reax_system* system, simulation_data* data, 
+        static_storage *workspace ) /* Rebuilds the tensor data->flex_bar.P from per-atom outer products,
+         * divides it by the box volume, and sets data->iso_bar.P to one third of its trace.
+         * workspace is referenced only by a commented-out line in the lines visible here. */
+    int i;
+    reax_atom *p_atom;
+    rtensor temp;
+    rtensor_MakeZero( data->flex_bar.P );
+    for( i = 0; i < system->N; ++i ) {
+        p_atom = &( system->atoms[i] );
+        // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
+        rvec_OuterProduct( temp, p_atom->v, p_atom->v );
+        rtensor_ScaledAdd( data->flex_bar.P, 
+                system->reaxprm.sbp[ p_atom->type ].mass, temp ); /* mass-weighted v (x) v term */
+        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
+        rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp ); /* NOTE(review): with the virial line above commented out, temp still holds v (x) v here -- confirm this is intended */
+    }
+    rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P );
+    data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0;
diff --git a/PuReMD-GPU/src/system_props.h b/PuReMD-GPU/src/system_props.h
index d287992f7b178f589808acfbaae1ed474a7a65e2..874132451d02b2d62d87c82065874f04a35b2d37 100644
--- a/PuReMD-GPU/src/system_props.h
+++ b/PuReMD-GPU/src/system_props.h
@@ -21,7 +21,12 @@
 #ifndef __SYSTEM_PROP_H_
 #define __SYSTEM_PROP_H_
-#include <mytypes.h>
+#include "mytypes.h"
+#ifdef __cplusplus
+extern "C"  {
 real Get_Time( );
@@ -30,21 +35,18 @@ real Get_Timing_Info( real );
 void Temperature_Control( control_params*, simulation_data*, output_controls* );
 void Compute_Total_Mass( reax_system*, simulation_data* );
-void Cuda_Compute_Total_Mass( reax_system*, simulation_data* );
 void Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* );
-void Cuda_Compute_Center_of_Mass( reax_system*, simulation_data*, FILE* );
 void Compute_Kinetic_Energy( reax_system*, simulation_data* );
-void Cuda_Compute_Kinetic_Energy( reax_system*, simulation_data* );
 void Compute_Pressure( reax_system*, simulation_data*, static_storage* );
 void Compute_Pressure_Isotropic( reax_system*, control_params*, simulation_data*, output_controls* );
-void prep_dev_system (reax_system *system);
-GLOBAL void Compute_Total_Mass (single_body_parameters *, reax_atom *, real *, size_t );
-//GLOBAL void Compute_Kinetic_Energy (single_body_parameters *, reax_atom *, unsigned int , simulation_data *, real *);
+#ifdef __cplusplus
diff --git a/PuReMD-GPU/src/testmd.cu b/PuReMD-GPU/src/testmd.c
similarity index 51%
rename from PuReMD-GPU/src/testmd.cu
rename to PuReMD-GPU/src/testmd.c
index 93f286cc2c0f99cc4aa9788195881dd7ffa15f9f..afe0cd4a5cf7e7282d7e2409f7f0edc66c34296a 100644
--- a/PuReMD-GPU/src/testmd.cu
+++ b/PuReMD-GPU/src/testmd.c
@@ -19,9 +19,11 @@
 #include "mytypes.h"
 #include "analyze.h"
 #include "box.h"
 #include "forces.h"
+#include "grid.h"
 #include "init_md.h"
 #include "integrate.h"
 #include "neighbors.h"
@@ -34,11 +36,17 @@
 #include "traj.h"
 #include "vector.h"
-#include "grid.h"
-#include "cuda_utils.h"
-#include "cuda_copy.h"
-#include "validation.h"
+#include "cuda_environment.h"
+#include "cuda_forces.h"
+#include "cuda_init_md.h"
+#include "cuda_neighbors.h"
+#include "cuda_post_evolve.h"
+#include "cuda_reset_utils.h"
+#include "cuda_system_props.h"
+#ifdef __BUILD_DEBUG__
+  #include "validation.h"
 interaction_function Interaction_Functions[NO_OF_INTERACTIONS];
@@ -48,11 +56,10 @@ print_interaction Print_Interactions[NO_OF_INTERACTIONS];
 LR_lookup_table *LR;
 LR_lookup_table *d_LR;
-list        *dev_lists;
+list *dev_lists;
 static_storage *dev_workspace;
 reax_timing d_timing;
 real *testdata;
@@ -61,13 +68,6 @@ void *scratch;
-cublasStatus_t cublasStatus;
-cublasHandle_t cublasHandle;
-cusparseHandle_t cusparseHandle;
-cusparseStatus_t cusparseStatus;
-cusparseMatDescr_t matdescriptor;
 void Post_Evolve( reax_system* system, control_params* control, 
         simulation_data* data, static_storage* workspace, 
@@ -90,12 +90,14 @@ void Post_Evolve( reax_system* system, control_params* control,
     /* remove rotational and translational velocity of the center of mass */
     if( control->ensemble != NVE && 
             control->remove_CoM_vel && 
-            data->step && data->step % control->remove_CoM_vel == 0 ) {
+            data->step && data->step % control->remove_CoM_vel == 0 )
+    {
         /* compute velocity of the center of mass */
         Compute_Center_of_Mass( system, data, out_control->prs );
-        for( i = 0; i < system->N; i++ ) {
+        for( i = 0; i < system->N; i++ )
+        {
             // remove translational
             rvec_ScaledAdd( system->atoms[i].v, -1., data->vcm ); 
@@ -107,98 +109,6 @@ void Post_Evolve( reax_system* system, control_params* control,
-GLOBAL void Update_Atoms_Post_Evolve (reax_atom *atoms, simulation_data *data, int N)
-    rvec diff, cross;
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= N) return;
-    //for( i = 0; i < system->N; i++ ) {
-    // remove translational
-    rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); 
-    // remove rotational
-    rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm );
-    rvec_Cross( cross, data->avcm, diff );
-    rvec_ScaledAdd( atoms[i].v, -1., cross );
-    //}
-void Cuda_Post_Evolve( reax_system* system, control_params* control, 
-        simulation_data* data, static_storage* workspace, 
-        list** lists, output_controls *out_control )
-    int i;
-    rvec diff, cross;
-    /* compute kinetic energy of the system */
-    /*
-       real *results = (real *) scratch;
-       cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
-       Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-       (system->reaxprm.d_sbp, system->d_atoms, system->N, 
-       (simulation_data *)data->d_simulation_data, (real *) results);
-       cudaThreadSynchronize ();
-       cudaCheckError ();
-     */
-    //fprintf (stderr, "Cuda_Post_Evolve: Begin\n");
-    Cuda_Compute_Kinetic_Energy (system, data);
-    //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n");
-    /* remove rotational and translational velocity of the center of mass */
-    if( control->ensemble != NVE && 
-            control->remove_CoM_vel && 
-            data->step && data->step % control->remove_CoM_vel == 0 ) {
-        /*
-           rvec t_xcm, t_vcm, t_avcm;
-           rvec_MakeZero (t_xcm);
-           rvec_MakeZero (t_vcm);
-           rvec_MakeZero (t_avcm);
-           rvec_Copy (t_xcm, data->xcm);
-           rvec_Copy (t_vcm, data->vcm);
-           rvec_Copy (t_avcm, data->avcm);
-         */
-        /* compute velocity of the center of mass */
-        Cuda_Compute_Center_of_Mass( system, data, out_control->prs );
-        //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... \n");
-        /*
-           fprintf (stderr, "center of mass done on the device \n");
-           fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm );
-           fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm );
-           fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm );
-           if (check_zero (t_xcm, data->xcm) || 
-           check_zero (t_vcm, data->vcm) ||
-           check_zero (t_avcm, data->avcm)){
-           fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n");
-           exit (0);
-           }
-         */
-        //xcm, avcm, 
-        copy_host_device (data->vcm, ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-        copy_host_device (data->xcm, ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-        copy_host_device (data->avcm, ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-        //fprintf (stderr, "data copied.... \n");
-        Update_Atoms_Post_Evolve  <<< BLOCKS, BLOCK_SIZE >>>
-            (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N);
-        cudaThreadSynchronize ();
-        cudaCheckError ();
-        //fprintf (stderr, " Cuda_Post_Evolve:End \n");
-    }
 void Read_System( char *geof, char *ff, char *ctrlf, 
         reax_system *system, control_params *control, 
@@ -217,15 +127,21 @@ void Read_System( char *geof, char *ff, char *ctrlf,
     Read_Control_File( ctrl, system, control, out_control );
     /* geo file */
-    if( control->geo_format == XYZ ) {
+    if( control->geo_format == XYZ )
+    {
         fprintf( stderr, "xyz input is not implemented yet\n" );
-        exit(1);
+        exit( 1 );
     else if( control->geo_format == PDB ) 
+    {
         Read_PDB( geof, system, control, data, workspace );
+    }
     else if( control->geo_format == BGF ) 
+    {
         Read_BGF( geof, system, control, data, workspace );
-    else if( control->geo_format == ASCII_RESTART ) {
+    }
+    else if( control->geo_format == ASCII_RESTART )
+    {
         Read_ASCII_Restart( geof, system, control, data, workspace );
         control->restart = 1;
@@ -233,9 +149,10 @@ void Read_System( char *geof, char *ff, char *ctrlf,
         Read_Binary_Restart( geof, system, control, data, workspace );
         control->restart = 1;
-    else {
+    else
+    {
         fprintf( stderr, "unknown geo file format. terminating!\n" );
-        exit(1);
+        exit( 1 );
 #if defined(DEBUG_FOCUS)
@@ -244,17 +161,18 @@ void Read_System( char *geof, char *ff, char *ctrlf,
-void Init_Data_Structures (simulation_data *data)
+void Init_Data_Structures( simulation_data *data )
     //data->step = 0;
     //data->prev_steps = 0;
     //data->time = 0;
-    memset (data, 0, SIMULATION_DATA_SIZE );
+    memset( data, 0, SIMULATION_DATA_SIZE );
-int main(int argc, char* argv[])
+int main( int argc, char* argv[] )
     reax_system system;
     control_params control;
@@ -271,16 +189,7 @@ int main(int argc, char* argv[])
     lists = (list*) malloc( sizeof(list) * LIST_N );
-    cudaDeviceSetLimit (cudaLimitStackSize, 8192);
-    cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
-    cudaCheckError ();
-    cublasCheckError (cublasStatus = cublasCreate (&cublasHandle));  
-    cusparseCheckError (cusparseStatus = cusparseCreate (&cusparseHandle));
-    cusparseCheckError (cusparseCreateMatDescr (&matdescriptor));
-    cusparseSetMatType (matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
-    cusparseSetMatIndexBase (matdescriptor, CUSPARSE_INDEX_BASE_ZERO);
+    Setup_Cuda_Environment( 0, 1, 1 );
     dev_lists = (list *) malloc (sizeof (list) * LIST_N );
     dev_workspace = (static_storage *) malloc (STORAGE_SIZE);
@@ -289,32 +198,14 @@ int main(int argc, char* argv[])
     dev_workspace->realloc.estimate_nbrs = -1;
     //Cleanup before usage.
-    Init_Data_Structures (&data);
-    system.init_thblist = false;
+    Init_Data_Structures( &data );
+    system.init_thblist = FALSE;
     Read_System( argv[1], argv[2], argv[3], &system, &control, 
             &data, &workspace, &out_control );
-    compute_blocks (&BLOCKS, &BLOCK_SIZE, system.N);
-    compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2);
-    //MATVEC_BLOCKS = system.N;
-    //MATVEC_BLOCK_SIZE = 32;
-        ((system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
-#ifdef __DEBUG_CUDA__
-    fprintf (stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
-    fprintf (stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE );
-    fprintf (stderr, " Size of far neighbor data %d \n", sizeof (far_neighbor_data));
-    fprintf (stderr, " Size of reax_atom %d \n", sizeof (reax_atom));
-    fprintf (stderr, " size of sparse matrix entry %d \n", sizeof (sparse_matrix_entry));
-    fprintf (stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system.N);
 #ifdef __CUDA_MEM__
-    print_device_mem_usage ();
+    print_device_mem_usage( );
 #ifdef __BUILD_DEBUG__
@@ -322,71 +213,51 @@ int main(int argc, char* argv[])
             &out_control, &Evolve );
-    t_start = Get_Time ();
+    t_start = Get_Time( );
     Cuda_Initialize( &system, &control, &data, &workspace, &lists, 
-            &out_control, &Cuda_Evolve);
-    t_elapsed = Get_Timing_Info (t_start);
+            &out_control, &Cuda_Evolve );
+    t_elapsed = Get_Timing_Info( t_start );
 #ifdef __DEBUG_CUDA__
-    fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed );
+    fprintf( stderr, " Cuda Initialize timing ---> %f \n", t_elapsed );
 #ifdef __CUDA_MEM__
-    print_device_mem_usage ();
+    print_device_mem_usage( );
 #ifdef __BUILD_DEBUG__
     Reset( &system, &control, &data, &workspace, &lists );
-    Cuda_Reset( &system, &control, &data, &workspace, &lists );
+    Cuda_Reset( &system, &control, &data, &workspace, &lists );
 #ifdef __BUILD_DEBUG__
-    Generate_Neighbor_Lists ( &system, &control, &data, &workspace, 
+    Generate_Neighbor_Lists( &system, &control, &data, &workspace, 
             &lists, &out_control );
-    /*
-       dim3 blockspergrid (system.g.ncell[0], system.g.ncell[1], system.g.ncell[2]);
-       dim3 threadsperblock (system.g.max_atoms);
-       t_start = Get_Time ();
-       Cuda_Bin_Atoms (&system, &workspace);
-       Cuda_Bin_Atoms_Sync ( &system );
-       Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
-       (system.d_atoms, system.d_g, system.d_box, 
-       (control_params *)control.d_control, *(dev_lists + FAR_NBRS));
-       cudaThreadSynchronize (); 
-       cudaCheckError ();
-       t_elapsed = Get_Timing_Info (t_start);
-       d_timing.nbrs += t_elapsed;
-     */
-    Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, false);
+    Cuda_Generate_Neighbor_Lists( &system, &workspace, &control, FALSE );
 #ifdef __BUILD_DEBUG__
     Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
-    Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
+    Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
 #ifdef __BUILD_DEBUG__
     Compute_Kinetic_Energy( &system, &data );
-    Cuda_Compute_Kinetic_Energy (&system, &data);
+    Cuda_Compute_Kinetic_Energy (&system, &data);
 #ifndef __BUILD_DEBUG__
-    // Here sync the simulation data, because it has been changed.
-    Prep_Device_For_Output ( &system, &data );
+    Cuda_Setup_Output( &system, &data );
     Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
 #ifdef __BUILD_DEBUG__
-    if (!validate_device (&system, &data, &workspace, &lists) )
+    if( !validate_device (&system, &data, &workspace, &lists) )
         fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
         exit (1);
@@ -397,50 +268,45 @@ int main(int argc, char* argv[])
     fprintf (stderr, "step -> %d <- done. \n", data.step);
-    for( ; data.step <= control.nsteps; data.step++ ) {      
-        //fprintf (stderr, "Begin ... \n");
-        //to Sync step to the device.
-        //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice );
-        copy_host_device (&data.step, &((simulation_data *)data.d_simulation_data)->step, 
-                INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+    for( ; data.step <= control.nsteps; data.step++ )
+    {
+        Cuda_Setup_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
         //fprintf (stderr, "Synched data .... \n");
-        if( control.T_mode ) {
+        if( control.T_mode )
+        {
             Temperature_Control( &control, &data, &out_control );
-            Sync_Host_Device (&control, (control_params *)control.d_control, cudaMemcpyHostToDevice );
+            Cuda_Sync_Temp( &control );
         //fprintf (stderr, "Temp. Control done ... \n");
 #ifdef __BUILD_DEBUG__
         Evolve( &system, &control, &data, &workspace, &lists, &out_control );
-        Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
         //fprintf (stderr, "Evolve done \n");
 #ifdef __BUILD_DEBUG__
         Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
         Cuda_Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
         //fprintf (stderr, "Post Evolve done \n");
 #ifndef __BUILD_DEBUG__
-        Prep_Device_For_Output ( &system, &data );
-        Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+        Cuda_Setup_Output( &system, &data );
+        Output_Results( &system, &control, &data, &workspace, &lists, &out_control );
-        /*
-           Analysis( &system, &control, &data, &workspace, &lists, &out_control );
-         */
+        //Analysis( &system, &control, &data, &workspace, &lists, &out_control );
         steps = data.step - data.prev_steps;
         if( steps && out_control.restart_freq && 
                 steps % out_control.restart_freq == 0 )
+        {
             Write_Restart( &system, &control, &data, &workspace, &out_control );
+        }
 #ifdef __BUILD_DEBUG__
@@ -463,5 +329,7 @@ int main(int argc, char* argv[])
     data.timing.elapsed = Get_Timing_Info( data.timing.start );
     fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
+    Cleanup_Cuda_Environment( );
     return 0;
diff --git a/PuReMD-GPU/src/three_body_interactions.c b/PuReMD-GPU/src/three_body_interactions.c
new file mode 100644
index 0000000000000000000000000000000000000000..7ac96e057c6c799ba88204f3f6339fe54b3c61da
--- /dev/null
+++ b/PuReMD-GPU/src/three_body_interactions.c
@@ -0,0 +1,801 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "three_body_interactions.h"
+#include "bond_orders.h"
+#include "list.h"
+#include "lookup.h"
+#include "vector.h"
+#include "index_utils.h"
+/* Calculates the angle theta of the i-j-k triplet, measured between the
+ * vectors dvec_ji and dvec_jk.
+ *   dvec_ji, dvec_jk : the two vectors whose 3-component dot product is taken
+ *   d_ji, d_jk       : divisors of the dot product (presumably the lengths of
+ *                      dvec_ji and dvec_jk -- confirm against callers); they
+ *                      are not checked against zero here
+ *   theta            : output, ACOS of the clamped cosine
+ *   cos_theta        : output, the cosine clamped to [-1, 1] */
+void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+        real *theta, real *cos_theta )
+    (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
+    if( *cos_theta > 1. ) *cos_theta  = 1.0; /* clamp so ACOS gets a value <= 1 */
+    if( *cos_theta < -1. ) *cos_theta  = -1.0; /* clamp so ACOS gets a value >= -1 */
+    (*theta) = ACOS( *cos_theta );
+/* Calculates the derivative of the cosine of the i-j-k angle, where
+ * cos(theta) = Dot(dvec_ji, dvec_jk) / (d_ji * d_jk).
+ *   dvec_ji, dvec_jk : the two vectors forming the angle
+ *   d_ji, d_jk       : used as the vector lengths in the formulas below
+ *                      (presumably |dvec_ji| and |dvec_jk| -- confirm against
+ *                      callers); not checked against zero here
+ *   dcos_theta_di/dj/dk : outputs, one 3-component vector each.
+ * As written, the j output is the negated sum of the i and k outputs, so the
+ * three outputs add up to zero component by component. */
+void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, 
+        rvec* dcos_theta_dk )
+    int  t;
+    real sqr_d_ji   = SQR(d_ji);
+    real sqr_d_jk   = SQR(d_jk);
+    real inv_dists  = 1.0 / (d_ji * d_jk);
+    real inv_dists3 = POW( inv_dists, 3 ); /* 1 / (d_ji * d_jk)^3 */
+    real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
+    real Cdot_inv3  = dot_dvecs * inv_dists3; /* dot product scaled by inv_dists3 */
+    for( t = 0; t < 3; ++t ) { /* one pass per vector component */
+        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
+            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
+            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
+            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+    }
+    /*fprintf( stderr, 
+      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+      dvec_jk[t] * inv_dists*/
+/* this is a 3-body interaction in which the main role is 
+   played by j which sits in the middle of the other two. */
+void Three_Body_Interactions( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
+    int  i, j, pi, k, pk, t;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j, start_pk, end_pk;
+    int  flag, cnt, num_thb_intrs;
+    real temp, temp_bo_jt, pBOjt7;
+    real p_val1, p_val2, p_val3, p_val4, p_val5;
+    real p_val6, p_val7, p_val8, p_val9, p_val10;
+    real p_pen1, p_pen2, p_pen3, p_pen4;
+    real p_coa1, p_coa2, p_coa3, p_coa4;
+    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
+    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    real CEpen1, CEpen2, CEpen3;
+    real e_ang, e_coa, e_pen;
+    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    real Cf7ij, Cf7jk, Cf8j, Cf9j;
+    real f7_ij, f7_jk, f8_Dj, f9_Dj;
+    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    real r_ij, r_jk;
+    real BOA_ij, BOA_jk;
+    real vlpadj;
+    rvec force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    real *total_bo;
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk, *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+    list *bonds, *thb_intrs;
+    bond_data *bond_list;
+    three_body_interaction_data *thb_list;
+    total_bo = workspace->total_bond_order;
+    bonds = (*lists) + BONDS;
+    bond_list = bonds->select.bond_list;
+    thb_intrs = (*lists) + THREE_BODIES;
+    thb_list = thb_intrs->select.three_body_list;
+    /* global parameters used in these calculations */
+    p_val6 = system->reaxprm.gp.l[14];
+    p_val8 = system->reaxprm.gp.l[33];
+    p_val9 = system->reaxprm.gp.l[16];
+    p_val10 = system->reaxprm.gp.l[17];
+    num_thb_intrs = 0;
+    for( j = 0; j < system->N; ++j ) {
+        // fprintf( out_control->eval, "j: %d\n", j );
+        type_j = system->atoms[j].type;
+        start_j = Start_Index(j, bonds);
+        end_j = End_Index(j, bonds);
+        p_val3 = system->reaxprm.sbp[ type_j ].p_val3;
+        p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
+        SBOp = 0, prod_SBO = 1;
+        for( t = start_j; t < end_j; ++t ) {
+            bo_jt = &(bond_list[t].bo_data);
+            SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+            temp = SQR( bo_jt->BO );
+            temp *= temp; 
+            temp *= temp;
+            prod_SBO *= EXP( -temp );
+        }
+        /* modifications to match Adri's code - 09/01/09 */
+        if( workspace->vlpex[j] >= 0 ){
+            vlpadj = 0;
+            dSBO2 = prod_SBO - 1;
+        }
+        else{
+            vlpadj = workspace->nlp[j];
+            dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+        }
+        SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+        dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+        if( SBO <= 0 )
+            SBO2 = 0, CSBO2 = 0;
+        else if( SBO > 0 && SBO <= 1 ) {
+            SBO2 = POW( SBO, p_val9 );
+            CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
+        }
+        else if( SBO > 1 && SBO < 2 ) {
+            SBO2 = 2 - POW( 2-SBO, p_val9 );
+            CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
+        }
+        else 
+            SBO2 = 2, CSBO2 = 0;  
+        expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
+        /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
+           restrictions here. such a restriction would prevent us from producing 
+           all 4-body intrs correctly */
+        for( pi = start_j; pi < end_j; ++pi ) {
+            Set_Start_Index( pi, num_thb_intrs, thb_intrs );
+            pbond_ij = &(bond_list[pi]);
+            bo_ij = &(pbond_ij->bo_data);
+            BOA_ij = bo_ij->BO - control->thb_cut;
+            if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) {
+                i = pbond_ij->nbr;
+                r_ij = pbond_ij->d;     
+                type_i = system->atoms[i].type;
+                // fprintf( out_control->eval, "i: %d\n", i );
+                /* first copy 3-body intrs from previously computed ones where i>k.
+                   IMPORTANT: if it is less costly to compute theta and its 
+                   derivative, we should definitely re-compute them, 
+                   instead of copying!
+                   in the second for-loop below, we compute only new 3-body intrs 
+                   where i < k */
+                for( pk = start_j; pk < pi; ++pk ) {
+                    // fprintf( out_control->eval, "pk: %d\n", pk );
+                    start_pk = Start_Index( pk, thb_intrs );
+                    end_pk = End_Index( pk, thb_intrs );
+                    for( t = start_pk; t < end_pk; ++t )
+                        if( thb_list[t].thb == i ) {
+                            p_ijk = &(thb_list[num_thb_intrs]);
+                            p_kji = &(thb_list[t]);
+                            p_ijk->thb = bond_list[pk].nbr;
+                            p_ijk->pthb  = pk;
+                            p_ijk->theta = p_kji->theta;              
+                            rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+                            rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                            rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+                            //if (j == 12)
+                            //fprintf (stderr, "Adding one for matched atom %d \n", i);
+                            ++num_thb_intrs;
+                            break;
+                        }
+                }
+                /* and this is the second for loop mentioned above */
+                for( pk = pi+1; pk < end_j; ++pk ) {
+                    pbond_jk = &(bond_list[pk]);
+                    bo_jk    = &(pbond_jk->bo_data);
+                    BOA_jk   = bo_jk->BO - control->thb_cut;
+                    k        = pbond_jk->nbr;
+                    type_k   = system->atoms[k].type;
+                    p_ijk    = &( thb_list[num_thb_intrs] );
+                    //TODO - CHANGE ORIGINAL
+                    if (BOA_jk <= 0) continue;
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
+                            pbond_jk->dvec, pbond_jk->d,
+                            &theta, &cos_theta );
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
+                            pbond_jk->dvec, pbond_jk->d, 
+                            &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
+                            &(p_ijk->dcos_dk) );
+                    p_ijk->thb = k;
+                    p_ijk->pthb = pk;
+                    p_ijk->theta = theta;
+                    //if (j == 12)
+                    //fprintf (stderr, "Adding one for the rest %d \n", k);
+                    sin_theta = SIN( theta );
+                    if( sin_theta < 1.0e-5 )
+                        sin_theta = 1.0e-5;
+                    ++num_thb_intrs;
+                    if( BOA_jk > 0.0 && 
+                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
+                        r_jk = pbond_jk->d;              
+                        thbh = &( system->reaxprm.thbp[ index_thbp(type_i,type_j,type_k,system->reaxprm.num_atom_types) ] );
+                        flag = 0;
+                        /* if( workspace->orig_id[i] < workspace->orig_id[k] )
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           workspace->orig_id[i], workspace->orig_id[j],
+                           workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
+                           else 
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           workspace->orig_id[k], workspace->orig_id[j],
+                           workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+                        for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
+                            // fprintf( out_control->eval, 
+                            // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
+                            if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                                thbp = &( thbh->prm[cnt] );
+                                /* ANGLE ENERGY */
+                                p_val1 = thbp->p_val1;
+                                p_val2 = thbp->p_val2;
+                                p_val4 = thbp->p_val4;
+                                p_val7 = thbp->p_val7;
+                                theta_00 = thbp->theta_00;
+                                exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                                f7_ij = 1.0 - exp3ij;
+                                Cf7ij = p_val3 * p_val4 * 
+                                    POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+                                exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                                f7_jk = 1.0 - exp3jk;
+                                Cf7jk = p_val3 * p_val4 * 
+                                    POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+                                expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                                trm8 = 1.0 + expval6 + expval7;
+                                f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                                Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                    (p_val6 * expval6 * trm8 - 
+                                     (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+                                theta_0 = 180.0 - 
+                                    theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                                theta_0 = DEG2RAD( theta_0 );              
+                                expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
+                                if( p_val1 >= 0 )
+                                    expval12theta = p_val1 * (1.0 - expval2theta);
+                                else // To avoid linear Me-H-Me angles (6/6/06)
+                                    expval12theta = p_val1 * -expval2theta;
+                                CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                                CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                                CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
+                                    expval2theta * (theta_0 - theta);
+                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
+                                    exp( -p_val10 * (2.0 - SBO2) );
+                                CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                                CEval6 = CEval5 * dSBO1;
+                                CEval7 = CEval5 * dSBO2;
+                                CEval8 = -CEval4 / sin_theta;
+                                data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                                /* END ANGLE ENERGY*/
+                                /* PENALTY ENERGY */
+                                p_pen1 = thbp->p_pen1;
+                                p_pen2 = system->reaxprm.gp.l[19];
+                                p_pen3 = system->reaxprm.gp.l[20];
+                                p_pen4 = system->reaxprm.gp.l[21];
+                                exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                                exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                                exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                                exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                                trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                                f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
+                                        (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
+                                            p_pen4 * exp_pen4 )) /
+                                    SQR( trm_pen34 );
+                                data->E_Pen += e_pen = 
+                                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                                CEpen1 = e_pen * Cf9j / f9_Dj;
+                                temp   = -2.0 * p_pen2 * e_pen;
+                                CEpen2 = temp * (BOA_ij - 2.0);
+                                CEpen3 = temp * (BOA_jk - 2.0);
+                                /* END PENALTY ENERGY */
+                                /* COALITION ENERGY */
+                                p_coa1 = thbp->p_coa1;
+                                p_coa2 = system->reaxprm.gp.l[2];
+                                p_coa3 = system->reaxprm.gp.l[38];
+                                p_coa4 = system->reaxprm.gp.l[30];
+                                exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+                                data->E_Coa += e_coa = 
+                                    p_coa1 / (1. + exp_coa2) *
+                                    EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
+                                    EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
+                                    EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
+                                    EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+                                CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                                CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
+                                CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
+                                CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
+                                /* END COALITION ENERGY */
+                                /* FORCES */
+                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4));
+                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5));
+                                workspace->CdDelta[j] += ((CEval3 + CEval7) + 
+                                        CEpen1 + CEcoa3);
+                                workspace->CdDelta[i] += CEcoa4;
+                                workspace->CdDelta[k] += CEcoa5;              
+                                for( t = start_j; t < end_j; ++t ) {
+                                    pbond_jt = &( bond_list[t] );
+                                    bo_jt = &(pbond_jt->bo_data);
+                                    temp_bo_jt = bo_jt->BO;
+                                    temp = CUBE( temp_bo_jt );
+                                    pBOjt7 = temp * temp * temp_bo_jt; 
+                                    // fprintf( out_control->eval, "%6d%12.8f\n", 
+                                    // workspace->orig_id[ bond_list[t].nbr ], 
+                                    //    (CEval6 * pBOjt7) );
+                                    bo_jt->Cdbo += (CEval6 * pBOjt7);
+                                    bo_jt->Cdbopi += CEval5;
+                                    bo_jt->Cdbopi2 += CEval5;
+                                }              
+                                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                    rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
+                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
+                                    /*
+                                       if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j);
+                                       if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j);
+                                     */
+                                }
+                                else {
+                                    /* terms not related to bond order derivatives
+                                       are added directly into 
+                                       forces and pressure vector/tensor */
+                                    rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                                    rvec_Add( system->atoms[i].f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                    rvec_Add( data->ext_press, ext_press );
+                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                    rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                                    rvec_Add( system->atoms[k].f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( data->ext_press, ext_press );
+                                    /* This part is for a fully-flexible box */
+                                    /* rvec_OuterProduct( temp_rtensor, 
+                                       p_ijk->dcos_di, system->atoms[i].x );
+                                       rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       p_ijk->dcos_dj, system->atoms[j].x );
+                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       p_ijk->dcos_dk, system->atoms[k].x );
+                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+                                       if( pbond_ij->imaginary || pbond_jk->imaginary )
+                                       rtensor_ScaledAdd( data->flex_bar.P, 
+                                       -1.0, total_rtensor );
+                                       else
+                                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                }
+                                fprintf( out_control->eval, 
+                                        //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                                        "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                                        i+1, j+1, k+1,
+                                        //workspace->orig_id[i]+1,  
+                                        //workspace->orig_id[j]+1,
+                                        //workspace->orig_id[k]+1,
+                                        //workspace->Delta_boc[j], 
+                                        RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                                        e_ang, data->E_Ang );
+                                /*fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e",
+                                  p_val3, p_val4, BOA_ij, BOA_jk );
+                                  fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e",
+                                  f7_ij, f7_jk, f8_Dj, expval12theta );
+                                  fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                  CEval1, CEval2, CEval3, CEval4, CEval5
+                                //CEval6, CEval7, CEval8  );*/
+                                /*fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                  -p_ijk->dcos_di[0]/sin_theta, 
+                                  -p_ijk->dcos_di[1]/sin_theta, 
+                                  -p_ijk->dcos_di[2]/sin_theta, 
+                                  -p_ijk->dcos_dj[0]/sin_theta, 
+                                  -p_ijk->dcos_dj[1]/sin_theta, 
+                                  -p_ijk->dcos_dj[2]/sin_theta, 
+                                  -p_ijk->dcos_dk[0]/sin_theta, 
+                                  -p_ijk->dcos_dk[1]/sin_theta, 
+                                  -p_ijk->dcos_dk[2]/sin_theta );*/
+                                /* fprintf( out_control->epen, 
+                                   "%23.15e%23.15e%23.15e\n", 
+                                   CEpen1, CEpen2, CEpen3 );
+                                   fprintf( out_control->epen, 
+                                   "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                   workspace->orig_id[i],  workspace->orig_id[j],
+                                   workspace->orig_id[k], RAD2DEG(theta), 
+                                   BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
+                                fprintf( out_control->ecoa, 
+                                        "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                        workspace->orig_id[i], 
+                                        workspace->orig_id[j],
+                                        workspace->orig_id[k], 
+                                        RAD2DEG(theta), BOA_ij, BOA_jk, 
+                                        e_coa, data->E_Coa );
+#ifdef TEST_FORCES            /* angle forces */
+                                Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                                Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                                Add_dDelta( system, lists, 
+                                        j, CEval3 + CEval7, workspace->f_ang );
+                                for( t = start_j; t < end_j; ++t ) {
+                                    pbond_jt = &( bond_list[t] );
+                                    bo_jt = &(pbond_jt->bo_data);
+                                    temp_bo_jt = bo_jt->BO;
+                                    temp = CUBE( temp_bo_jt );
+                                    pBOjt7 = temp * temp * temp_bo_jt; 
+                                    Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
+                                            workspace->f_ang );
+                                    Add_dBOpinpi2( system, lists, j, t, 
+                                            CEval5, CEval5, 
+                                            workspace->f_ang, workspace->f_ang );
+                                }
+                                rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                                rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                                /* end angle forces */
+                                /* penalty forces */
+                                Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                                Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                                Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                                /* end penalty forces */
+                                /* coalition forces */
+                                Add_dBO( system, lists, 
+                                        j, pi, CEcoa1-CEcoa4, workspace->f_coa );
+                                Add_dBO( system, lists, 
+                                        j, pk, CEcoa2-CEcoa5, workspace->f_coa );
+                                Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                                Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                                Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                                /* end coalition forces */
+                            }
+                        }
+                    }
+                }
+            }
+            Set_End_Index(pi, num_thb_intrs, thb_intrs );
+        }
+    }
+    if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+        workspace->realloc.num_3body = num_thb_intrs;
+        if( num_thb_intrs > thb_intrs->num_intrs ) {
+            fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
+                    data->step, num_thb_intrs, thb_intrs->num_intrs );
+            exit( INSUFFICIENT_SPACE );
+        }
+    }
+    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
+    // data->step, num_thb_intrs );
+    fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
+    fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
+            data->E_Ang, data->E_Pen, data->E_Coa );
+    fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
+            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+void Hydrogen_Bonds( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control ) /* adds every X-H...Z hydrogen-bond energy to data->E_HB and applies the matching force terms */
+    int i, j, k, pi, pk, itr, top;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS]; /* bond-list slots of j's bonds that pass the donor test below; NOTE(review): top is never checked against MAX_BONDS */
+    int num_hb_intrs = 0; /* number of (i, j, k) triplets evaluated, reported at the end */
+    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk, force, ext_press;
+    ivec rel_jk;
+    // rtensor temp_rtensor, total_rtensor;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list;
+    bonds = (*lists) + BONDS;
+    bond_list = bonds->select.bond_list;
+    hbonds = (*lists) + HBONDS;
+    hbond_list = hbonds->select.hbond_list;
+    /* loops below discover the Hydrogen bonds between i-j-k triplets.
+       here j is H atom and there has to be some bond between i and j.
+       Hydrogen bond is between j and k.
+       so in this function i->X, j->H, k->Z when we map 
+       variables onto the ones in the handout.*/
+    for( j = 0; j < system->N; ++j )
+        if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
+            /*set j's variables */
+            type_j  = system->atoms[j].type;
+            start_j = Start_Index(j, bonds);
+            end_j   = End_Index(j, bonds);
+            hb_start_j = Start_Index( workspace->hbond_index[j], hbonds ); /* j's range in the hbond list is looked up through workspace->hbond_index[j] */
+            hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+            top = 0; /* restart the candidate list for this j */
+            for( pi = start_j; pi < end_j; ++pi ) { /* pass 1: collect candidate i neighbors from j's bond list */
+                pbond_ij = &( bond_list[pi] );
+                i = pbond_ij->nbr;
+                bo_ij = &(pbond_ij->bo_data);
+                type_i = system->atoms[i].type;
+                if( system->reaxprm.sbp[type_i].p_hbond == 2 && 
+                        bo_ij->BO >= HB_THRESHOLD )
+                    hblist[top++] = pi; /* stores the bond-list slot, not the atom id */
+            }
+            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+            //          j, top, hb_start_j, hb_end_j );
+            for( pk = hb_start_j; pk < hb_end_j; ++pk ) { /* pass 2: pair every hbond-list partner k with every candidate i */
+                /* set k's variables */
+                k = hbond_list[pk].nbr;
+                type_k = system->atoms[k].type;
+                nbr_jk = hbond_list[pk].ptr;
+                r_jk = nbr_jk->d;
+                rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec ); /* dvec_jk = scl * the stored neighbor displacement */
+                for( itr=0; itr < top; ++itr ) {
+                    pi = hblist[itr];
+                    pbond_ij = &( bond_list[pi] );
+                    i = pbond_ij->nbr;
+                    if( i != k ) { /* skip triplets whose two end atoms are the same atom */
+                        bo_ij = &(pbond_ij->bo_data);
+                        type_i = system->atoms[i].type;
+                        r_ij = pbond_ij->d;          /* NOTE(review): r_ij is not read again in this function */
+                        hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, system->reaxprm.num_atom_types) ]);
+                        ++num_hb_intrs;
+                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                &theta, &cos_theta );
+                        /* the derivative of cos(theta) */
+                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                &dcos_theta_di, &dcos_theta_dj, 
+                                &dcos_theta_dk );
+                        /* hydrogen bond energy*/
+                        sin_theta2 = SIN( theta/2.0 );
+                        sin_xhz4 = SQR(sin_theta2);
+                        sin_xhz4 *= sin_xhz4; /* squared a second time: sin(theta/2)^4 */
+                        cos_xhz1 = ( 1.0 - cos_theta );
+                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                    r_jk / hbp->r0_hb - 2.0 ) );
+                        data->E_HB += e_hb = 
+                            hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4; /* coefficient of the bond-order (dbo) term */
+                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1; /* coefficient of the dcos(theta) terms */
+                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
+                                1.0 / hbp->r0_hb); /* coefficient of the dr terms along dvec_jk */
+                        /* hydrogen bond forces */
+                        bo_ij->Cdbo += CEhb1;   // dbo term
+                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
+                            rvec_ScaledAdd( system->atoms[i].f, 
+                                    +CEhb2, dcos_theta_di ); //dcos terms
+                            rvec_ScaledAdd( system->atoms[j].f, 
+                                    +CEhb2, dcos_theta_dj );
+                            //TODO
+                            rvec_ScaledAdd( system->atoms[k].f, 
+                                    +CEhb2, dcos_theta_dk );
+                            //dr terms
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            //TODO
+                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
+                        }
+                        else /* every other ensemble: same force terms, plus contributions to data->ext_press */
+                        {
+                            /* for pressure coupling, terms that are not related 
+                               to bond order derivatives are added directly into 
+                               pressure vector/tensor */
+                            rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                            rvec_Add( system->atoms[i].f, force );
+                            rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                            rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj );
+                            ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box ); /* rel_jk = scl * stored rel_box; used for both k-force terms below */
+                            rvec_Scale( force, +CEhb2, dcos_theta_dk );
+                            //TODO
+                            rvec_Add( system->atoms[k].f, force );
+                            rvec_iMultiply( ext_press, rel_jk, force );
+                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                            //dr terms
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                            rvec_Add( system->atoms[k].f, force );
+                            rvec_iMultiply( ext_press, rel_jk, force );
+                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                            /* This part is intended for a fully-flexible box */
+                            /* rvec_OuterProduct( temp_rtensor, 
+                               dcos_theta_di, system->atoms[i].x );
+                               rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
+                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
+                               -CEhb3/r_jk, pbond_jk->dvec );
+                               rvec_OuterProduct( temp_rtensor, 
+                               temp_rvec, system->atoms[j].x );
+                               rtensor_Add( total_rtensor, temp_rtensor );
+                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
+                               +CEhb3/r_jk, pbond_jk->dvec );
+                               rvec_OuterProduct( temp_rtensor, 
+                               temp_rvec, system->atoms[k].x );
+                               rtensor_Add( total_rtensor, temp_rtensor );
+                               if( pbond_ij->imaginary || pbond_jk->imaginary )
+                               rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                               else
+                               rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                        }
+                        /*fprintf( out_control->ehb, 
+                          "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
+                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
+                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
+                          dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
+                          fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
+                          CEhb1, CEhb2, CEhb3 ); */
+                        fprintf( stderr, //out_control->ehb, 
+                                "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                workspace->orig_id[i], 
+                                workspace->orig_id[j], 
+                                workspace->orig_id[k], 
+                                r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
+                        // dbo term; this group repeats the hydrogen-bond force terms into workspace->f_hb
+                        Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
+                        // dcos terms
+                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
+                        // dr terms
+                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
+                    }
+                }
+            }
+        }
+    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
+    fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs ); /* summary totals are written to stderr */
+    fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB );
diff --git a/PuReMD-GPU/src/three_body_interactions.cu b/PuReMD-GPU/src/three_body_interactions.cu
deleted file mode 100644
index c2eed63bc52c8e5f3db040d74d9d8d083478ec82..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/three_body_interactions.cu
+++ /dev/null
@@ -1,2462 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "three_body_interactions.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "lookup.h"
-#include "vector.h"
-#include "index_utils.h"
-#include "cuda_helpers.h"
-/* calculates the theta angle between i-j-k */
-HOST_DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
-        real *theta, real *cos_theta )
-    (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
-    if( *cos_theta > 1. ) *cos_theta  = 1.0;
-    if( *cos_theta < -1. ) *cos_theta  = -1.0;
-    (*theta) = ACOS( *cos_theta );
-/* calculates the derivative of the cosine of the angle between i-j-k */
-HOST_DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
-        rvec* dcos_theta_di, rvec* dcos_theta_dj, 
-        rvec* dcos_theta_dk )
-    int  t;
-    real sqr_d_ji   = SQR(d_ji);
-    real sqr_d_jk   = SQR(d_jk);
-    real inv_dists  = 1.0 / (d_ji * d_jk);
-    real inv_dists3 = POW( inv_dists, 3 );
-    real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
-    real Cdot_inv3  = dot_dvecs * inv_dists3;
-    for( t = 0; t < 3; ++t ) {
-        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
-            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
-        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
-            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
-        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
-            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
-    }
-    /*fprintf( stderr, 
-      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-      dvec_jk[t] * inv_dists*/
-/* this is a 3-body interaction in which the main role is 
-   played by j which sits in the middle of the other two. */
-void Three_Body_Interactions( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace,
-        list **lists, output_controls *out_control )
-    int  i, j, pi, k, pk, t;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, start_pk, end_pk;
-    int  flag, cnt, num_thb_intrs;
-    real temp, temp_bo_jt, pBOjt7;
-    real p_val1, p_val2, p_val3, p_val4, p_val5;
-    real p_val6, p_val7, p_val8, p_val9, p_val10;
-    real p_pen1, p_pen2, p_pen3, p_pen4;
-    real p_coa1, p_coa2, p_coa3, p_coa4;
-    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
-    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-    real CEpen1, CEpen2, CEpen3;
-    real e_ang, e_coa, e_pen;
-    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-    real Cf7ij, Cf7jk, Cf8j, Cf9j;
-    real f7_ij, f7_jk, f8_Dj, f9_Dj;
-    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-    real r_ij, r_jk;
-    real BOA_ij, BOA_jk;
-    real vlpadj;
-    rvec force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
-    real *total_bo;
-    three_body_header *thbh;
-    three_body_parameters *thbp;
-    three_body_interaction_data *p_ijk, *p_kji;
-    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-    bond_order_data *bo_ij, *bo_jk, *bo_jt;
-    list *bonds, *thb_intrs;
-    bond_data *bond_list;
-    three_body_interaction_data *thb_list;
-    total_bo = workspace->total_bond_order;
-    bonds = (*lists) + BONDS;
-    bond_list = bonds->select.bond_list;
-    thb_intrs = (*lists) + THREE_BODIES;
-    thb_list = thb_intrs->select.three_body_list;
-    /* global parameters used in these calculations */
-    p_val6 = system->reaxprm.gp.l[14];
-    p_val8 = system->reaxprm.gp.l[33];
-    p_val9 = system->reaxprm.gp.l[16];
-    p_val10 = system->reaxprm.gp.l[17];
-    num_thb_intrs = 0;
-    for( j = 0; j < system->N; ++j ) {
-        // fprintf( out_control->eval, "j: %d\n", j );
-        type_j = system->atoms[j].type;
-        start_j = Start_Index(j, bonds);
-        end_j = End_Index(j, bonds);
-        p_val3 = system->reaxprm.sbp[ type_j ].p_val3;
-        p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
-        SBOp = 0, prod_SBO = 1;
-        for( t = start_j; t < end_j; ++t ) {
-            bo_jt = &(bond_list[t].bo_data);
-            SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-            temp = SQR( bo_jt->BO );
-            temp *= temp; 
-            temp *= temp;
-            prod_SBO *= EXP( -temp );
-        }
-        /* modifications to match Adri's code - 09/01/09 */
-        if( workspace->vlpex[j] >= 0 ){
-            vlpadj = 0;
-            dSBO2 = prod_SBO - 1;
-        }
-        else{
-            vlpadj = workspace->nlp[j];
-            dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-        }
-        SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-        dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
-        if( SBO <= 0 )
-            SBO2 = 0, CSBO2 = 0;
-        else if( SBO > 0 && SBO <= 1 ) {
-            SBO2 = POW( SBO, p_val9 );
-            CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-        }
-        else if( SBO > 1 && SBO < 2 ) {
-            SBO2 = 2 - POW( 2-SBO, p_val9 );
-            CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-        }
-        else 
-            SBO2 = 2, CSBO2 = 0;  
-        expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
-        /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
-           restrictions here. such a restriction would prevent us from producing 
-           all 4-body intrs correctly */
-        for( pi = start_j; pi < end_j; ++pi ) {
-            Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-            pbond_ij = &(bond_list[pi]);
-            bo_ij = &(pbond_ij->bo_data);
-            BOA_ij = bo_ij->BO - control->thb_cut;
-            if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) {
-                i = pbond_ij->nbr;
-                r_ij = pbond_ij->d;     
-                type_i = system->atoms[i].type;
-                // fprintf( out_control->eval, "i: %d\n", i );
-                /* first copy 3-body intrs from previously computed ones where i>k.
-IMPORTANT: if it is less costly to compute theta and its 
-derivative, we should definitely re-compute them, 
-instead of copying!
-in the second for-loop below, we compute only new 3-body intrs 
-where i < k */
-                for( pk = start_j; pk < pi; ++pk ) {
-                    // fprintf( out_control->eval, "pk: %d\n", pk );
-                    start_pk = Start_Index( pk, thb_intrs );
-                    end_pk = End_Index( pk, thb_intrs );
-                    for( t = start_pk; t < end_pk; ++t )
-                        if( thb_list[t].thb == i ) {
-                            p_ijk = &(thb_list[num_thb_intrs]);
-                            p_kji = &(thb_list[t]);
-                            p_ijk->thb = bond_list[pk].nbr;
-                            p_ijk->pthb  = pk;
-                            p_ijk->theta = p_kji->theta;              
-                            rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-                            rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-                            rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-                            //if (j == 12)
-                            //fprintf (stderr, "Adding one for matched atom %d \n", i);
-                            ++num_thb_intrs;
-                            break;
-                        }
-                }
-                /* and this is the second for loop mentioned above */
-                for( pk = pi+1; pk < end_j; ++pk ) {
-                    pbond_jk = &(bond_list[pk]);
-                    bo_jk    = &(pbond_jk->bo_data);
-                    BOA_jk   = bo_jk->BO - control->thb_cut;
-                    k        = pbond_jk->nbr;
-                    type_k   = system->atoms[k].type;
-                    p_ijk    = &( thb_list[num_thb_intrs] );
-                    //TODO - CHANGE ORIGINAL
-                    if (BOA_jk <= 0) continue;
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-                            pbond_jk->dvec, pbond_jk->d,
-                            &theta, &cos_theta );
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-                            pbond_jk->dvec, pbond_jk->d, 
-                            &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-                            &(p_ijk->dcos_dk) );
-                    p_ijk->thb = k;
-                    p_ijk->pthb = pk;
-                    p_ijk->theta = theta;
-                    //if (j == 12)
-                    //fprintf (stderr, "Adding one for the rest %d \n", k);
-                    sin_theta = SIN( theta );
-                    if( sin_theta < 1.0e-5 )
-                        sin_theta = 1.0e-5;
-                    ++num_thb_intrs;
-                    if( BOA_jk > 0.0 && 
-                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
-                        r_jk = pbond_jk->d;              
-                        thbh = &( system->reaxprm.thbp[ index_thbp (type_i,type_j,type_k,&system->reaxprm) ] );
-                        flag = 0;
-                        /* if( workspace->orig_id[i] < workspace->orig_id[k] )
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-                           workspace->orig_id[i], workspace->orig_id[j],
-                           workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-                           else 
-                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-                           workspace->orig_id[k], workspace->orig_id[j],
-                           workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
-                        for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-                            // fprintf( out_control->eval, 
-                            // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
-                            if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
-                                thbp = &( thbh->prm[cnt] );
-                                /* ANGLE ENERGY */
-                                p_val1 = thbp->p_val1;
-                                p_val2 = thbp->p_val2;
-                                p_val4 = thbp->p_val4;
-                                p_val7 = thbp->p_val7;
-                                theta_00 = thbp->theta_00;
-                                exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-                                f7_ij = 1.0 - exp3ij;
-                                Cf7ij = p_val3 * p_val4 * 
-                                    POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-                                exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-                                f7_jk = 1.0 - exp3jk;
-                                Cf7jk = p_val3 * p_val4 * 
-                                    POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-                                expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-                                trm8 = 1.0 + expval6 + expval7;
-                                f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-                                Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-                                    (p_val6 * expval6 * trm8 - 
-                                     (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
-                                theta_0 = 180.0 - 
-                                    theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-                                theta_0 = DEG2RAD( theta_0 );              
-                                expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
-                                if( p_val1 >= 0 )
-                                    expval12theta = p_val1 * (1.0 - expval2theta);
-                                else // To avoid linear Me-H-Me angles (6/6/06)
-                                    expval12theta = p_val1 * -expval2theta;
-                                CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-                                CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-                                CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-                                    expval2theta * (theta_0 - theta);
-                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-                                    exp( -p_val10 * (2.0 - SBO2) );
-                                CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-                                CEval6 = CEval5 * dSBO1;
-                                CEval7 = CEval5 * dSBO2;
-                                CEval8 = -CEval4 / sin_theta;
-                                data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
-                                /* END ANGLE ENERGY*/
-                                /* PENALTY ENERGY */
-                                p_pen1 = thbp->p_pen1;
-                                p_pen2 = system->reaxprm.gp.l[19];
-                                p_pen3 = system->reaxprm.gp.l[20];
-                                p_pen4 = system->reaxprm.gp.l[21];
-                                exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-                                exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-                                exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-                                exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-                                trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-                                f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
-                                        (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-                                            p_pen4 * exp_pen4 )) /
-                                    SQR( trm_pen34 );
-                                data->E_Pen += e_pen = 
-                                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-                                CEpen1 = e_pen * Cf9j / f9_Dj;
-                                temp   = -2.0 * p_pen2 * e_pen;
-                                CEpen2 = temp * (BOA_ij - 2.0);
-                                CEpen3 = temp * (BOA_jk - 2.0);
-                                /* END PENALTY ENERGY */
-                                /* COALITION ENERGY */
-                                p_coa1 = thbp->p_coa1;
-                                p_coa2 = system->reaxprm.gp.l[2];
-                                p_coa3 = system->reaxprm.gp.l[38];
-                                p_coa4 = system->reaxprm.gp.l[30];
-                                exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-                                data->E_Coa += e_coa = 
-                                    p_coa1 / (1. + exp_coa2) *
-                                    EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
-                                    EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
-                                    EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-                                    EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-                                CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-                                CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
-                                CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
-                                CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
-                                /* END COALITION ENERGY */
-                                /* FORCES */
-                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4));
-                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5));
-                                workspace->CdDelta[j] += ((CEval3 + CEval7) + 
-                                        CEpen1 + CEcoa3);
-                                workspace->CdDelta[i] += CEcoa4;
-                                workspace->CdDelta[k] += CEcoa5;              
-                                for( t = start_j; t < end_j; ++t ) {
-                                    pbond_jt = &( bond_list[t] );
-                                    bo_jt = &(pbond_jt->bo_data);
-                                    temp_bo_jt = bo_jt->BO;
-                                    temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt; 
-                                    // fprintf( out_control->eval, "%6d%12.8f\n", 
-                                    // workspace->orig_id[ bond_list[t].nbr ], 
-                                    //    (CEval6 * pBOjt7) );
-                                    bo_jt->Cdbo += (CEval6 * pBOjt7);
-                                    bo_jt->Cdbopi += CEval5;
-                                    bo_jt->Cdbopi2 += CEval5;
-                                }              
-                                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                                    rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
-                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                    rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
-                                    /*
-                                       if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j);
-                                       if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j);
-                                     */
-                                }
-                                else {
-                                    /* terms not related to bond order derivatives
-                                       are added directly into 
-                                       forces and pressure vector/tensor */
-                                    rvec_Scale( force, CEval8, p_ijk->dcos_di );
-                                    rvec_Add( system->atoms[i].f, force );
-                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                    rvec_Add( data->ext_press, ext_press );
-                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                    rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-                                    rvec_Add( system->atoms[k].f, force );
-                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                    rvec_Add( data->ext_press, ext_press );
-                                    /* This part is for a fully-flexible box */
-                                    /* rvec_OuterProduct( temp_rtensor, 
-                                       p_ijk->dcos_di, system->atoms[i].x );
-                                       rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
-                                       rvec_OuterProduct( temp_rtensor, 
-                                       p_ijk->dcos_dj, system->atoms[j].x );
-                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-                                       rvec_OuterProduct( temp_rtensor, 
-                                       p_ijk->dcos_dk, system->atoms[k].x );
-                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-                                       if( pbond_ij->imaginary || pbond_jk->imaginary )
-                                       rtensor_ScaledAdd( data->flex_bar.P, 
-                                       -1.0, total_rtensor );
-                                       else
-                                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                                }
-                                fprintf( out_control->eval, 
-                                        //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-                                        "%6d%6d%6d%23.15e%23.15e%23.15e\n",
-                                        i+1, j+1, k+1,
-                                        //workspace->orig_id[i]+1,  
-                                        //workspace->orig_id[j]+1,
-                                        //workspace->orig_id[k]+1,
-                                        //workspace->Delta_boc[j], 
-                                        RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-                                        e_ang, data->E_Ang );
-                                /*fprintf( out_control->eval, 
-                                  "%23.15e%23.15e%23.15e%23.15e",
-                                  p_val3, p_val4, BOA_ij, BOA_jk );
-                                  fprintf( out_control->eval, 
-                                  "%23.15e%23.15e%23.15e%23.15e",
-                                  f7_ij, f7_jk, f8_Dj, expval12theta );
-                                  fprintf( out_control->eval, 
-                                  "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                  CEval1, CEval2, CEval3, CEval4, CEval5
-                                //CEval6, CEval7, CEval8  );*/
-                                /*fprintf( out_control->eval, 
-                                  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                  -p_ijk->dcos_di[0]/sin_theta, 
-                                  -p_ijk->dcos_di[1]/sin_theta, 
-                                  -p_ijk->dcos_di[2]/sin_theta, 
-                                  -p_ijk->dcos_dj[0]/sin_theta, 
-                                  -p_ijk->dcos_dj[1]/sin_theta, 
-                                  -p_ijk->dcos_dj[2]/sin_theta, 
-                                  -p_ijk->dcos_dk[0]/sin_theta, 
-                                  -p_ijk->dcos_dk[1]/sin_theta, 
-                                  -p_ijk->dcos_dk[2]/sin_theta );*/
-                                /* fprintf( out_control->epen, 
-                                   "%23.15e%23.15e%23.15e\n", 
-                                   CEpen1, CEpen2, CEpen3 );
-                                   fprintf( out_control->epen, 
-                                   "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                   workspace->orig_id[i],  workspace->orig_id[j],
-                                   workspace->orig_id[k], RAD2DEG(theta), 
-                                   BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
-                                fprintf( out_control->ecoa, 
-                                        "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                        workspace->orig_id[i], 
-                                        workspace->orig_id[j],
-                                        workspace->orig_id[k], 
-                                        RAD2DEG(theta), BOA_ij, BOA_jk, 
-                                        e_coa, data->E_Coa );
-#ifdef TEST_FORCES            /* angle forces */
-                                Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-                                Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-                                Add_dDelta( system, lists, 
-                                        j, CEval3 + CEval7, workspace->f_ang );
-                                for( t = start_j; t < end_j; ++t ) {
-                                    pbond_jt = &( bond_list[t] );
-                                    bo_jt = &(pbond_jt->bo_data);
-                                    temp_bo_jt = bo_jt->BO;
-                                    temp = CUBE( temp_bo_jt );
-                                    pBOjt7 = temp * temp * temp_bo_jt; 
-                                    Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-                                            workspace->f_ang );
-                                    Add_dBOpinpi2( system, lists, j, t, 
-                                            CEval5, CEval5, 
-                                            workspace->f_ang, workspace->f_ang );
-                                }
-                                rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-                                rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-                                /* end angle forces */
-                                /* penalty forces */
-                                Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-                                Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-                                Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-                                /* end penalty forces */
-                                /* coalition forces */
-                                Add_dBO( system, lists, 
-                                        j, pi, CEcoa1-CEcoa4, workspace->f_coa );
-                                Add_dBO( system, lists, 
-                                        j, pk, CEcoa2-CEcoa5, workspace->f_coa );
-                                Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-                                Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-                                Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-                                /* end coalition forces */
-                            }
-                        }
-                    }
-                }
-            }
-            Set_End_Index(pi, num_thb_intrs, thb_intrs );
-        }
-    }
-    if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
-        workspace->realloc.num_3body = num_thb_intrs;
-        if( num_thb_intrs > thb_intrs->num_intrs ) {
-            fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
-                    data->step, num_thb_intrs, thb_intrs->num_intrs );
-            exit( INSUFFICIENT_SPACE );
-        }
-    }
-    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
-    // data->step, num_thb_intrs );
-    fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
-    fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
-            data->E_Ang, data->E_Pen, data->E_Coa );
-    fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
-            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
-//Cuda Function for the Three body interactions. 
-/* this is a 3-body interaction in which the main role is 
-   played by j which sits in the middle of the other two. */
-GLOBAL void Three_Body_Interactions( reax_atom *atoms,
-        single_body_parameters *sbp,
-        three_body_header *d_thbp,
-        global_parameters g_params,
-        control_params *control,
-        simulation_data *data,
-        static_storage p_workspace, 
-        list p_bonds, list p_thb_intrs,
-        int N, int num_atom_types,
-        real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press )
-    int  i, j, pi, k, pk, t;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j, start_pk, end_pk;
-    int  flag, cnt, num_thb_intrs;
-    real temp, temp_bo_jt, pBOjt7;
-    real p_val1, p_val2, p_val3, p_val4, p_val5;
-    real p_val6, p_val7, p_val8, p_val9, p_val10;
-    real p_pen1, p_pen2, p_pen3, p_pen4;
-    real p_coa1, p_coa2, p_coa3, p_coa4;
-    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
-    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-    real CEpen1, CEpen2, CEpen3;
-    real e_ang, e_coa, e_pen;
-    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-    real Cf7ij, Cf7jk, Cf8j, Cf9j;
-    real f7_ij, f7_jk, f8_Dj, f9_Dj;
-    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-    real r_ij, r_jk;
-    real BOA_ij, BOA_jk;
-    real vlpadj;
-    rvec force, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
-    real *total_bo;
-    three_body_header *thbh;
-    three_body_parameters *thbp;
-    three_body_interaction_data *p_ijk, *p_kji;
-    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-    bond_order_data *bo_ij, *bo_jk, *bo_jt;
-    list *bonds, *thb_intrs;
-    bond_data *bond_list;
-    three_body_interaction_data *thb_list;
-    static_storage *workspace = &p_workspace;
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= N) return;
-    total_bo = workspace->total_bond_order;
-    bonds = &p_bonds;
-    bond_list = bonds->select.bond_list;
-    thb_intrs = &p_thb_intrs;
-    thb_list = thb_intrs->select.three_body_list;
-    /* global parameters used in these calculations */
-    p_val6 = g_params.l[14];
-    p_val8 = g_params.l[33];
-    p_val9 = g_params.l[16];
-    p_val10 = g_params.l[17];
-    //TODO check this, initially this was zero, 
-    // I am changing it to the starting index for this atom.
-    //num_thb_intrs = j * MAX_TH_BODY;
-    //for( j = 0; j < system->N; ++j ) {
-    // fprintf( out_control->eval, "j: %d\n", j );
-    type_j = atoms[j].type;
-    start_j = Start_Index(j, bonds);
-    end_j = End_Index(j, bonds);
-    p_val3 = sbp[ type_j ].p_val3;
-    p_val5 = sbp[ type_j ].p_val5;
-    SBOp = 0, prod_SBO = 1;
-    for( t = start_j; t < end_j; ++t ) {
-        bo_jt = &(bond_list[t].bo_data);
-        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-        temp = SQR( bo_jt->BO );
-        temp *= temp; 
-        temp *= temp;
-        prod_SBO *= EXP( -temp );
-    }
-    /* modifications to match Adri's code - 09/01/09 */
-    if( workspace->vlpex[j] >= 0 ){
-        vlpadj = 0;
-        dSBO2 = prod_SBO - 1;
-    }
-    else{
-        vlpadj = workspace->nlp[j];
-        dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-    }
-    SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-    dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
-    if( SBO <= 0 )
-        SBO2 = 0, CSBO2 = 0;
-    else if( SBO > 0 && SBO <= 1 ) {
-        SBO2 = POW( SBO, p_val9 );
-        CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-    }
-    else if( SBO > 1 && SBO < 2 ) {
-        SBO2 = 2 - POW( 2-SBO, p_val9 );
-        CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-    }
-    else 
-        SBO2 = 2, CSBO2 = 0;  
-    expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
-    /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
-       restrictions here. such a restriction would prevent us from producing 
-       all 4-body intrs correctly */
-    for( pi = start_j; pi < end_j; ++pi ) {
-        //TODO
-        //num_thb_intrs = pi * MAX_THREE_BODIES;
-        //TODO
-        //Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-        num_thb_intrs = Start_Index (pi, thb_intrs);
-        pbond_ij = &(bond_list[pi]);
-        bo_ij = &(pbond_ij->bo_data);
-        BOA_ij = bo_ij->BO - control->thb_cut;
-        if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
-            i = pbond_ij->nbr;
-            r_ij = pbond_ij->d;     
-            type_i = atoms[i].type;
-            // fprintf( out_control->eval, "i: %d\n", i );
-            /* first copy 3-body intrs from previously computed ones where i>k.
-IMPORTANT: if it is less costly to compute theta and its 
-derivative, we should definitely re-compute them, 
-instead of copying!
-in the second for-loop below, we compute only new 3-body intrs 
-where i < k */
-            for( pk = start_j; pk < pi; ++pk ) {
-                // fprintf( out_control->eval, "pk: %d\n", pk );
-                start_pk = Start_Index( pk, thb_intrs );
-                end_pk = End_Index( pk, thb_intrs );
-                for( t = start_pk; t < end_pk; ++t )
-                    if( thb_list[t].thb == i ) {
-                        p_ijk = &(thb_list[num_thb_intrs]);
-                        p_kji = &(thb_list[t]);
-                        p_ijk->thb = bond_list[pk].nbr;
-                        p_ijk->pthb  = pk;
-                        p_ijk->theta = p_kji->theta;              
-                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-                        ++num_thb_intrs;
-                        break;
-                    }
-            }
-            /* and this is the second for loop mentioned above */
-            for( pk = pi+1; pk < end_j; ++pk ) {
-                pbond_jk = &(bond_list[pk]);
-                bo_jk    = &(pbond_jk->bo_data);
-                BOA_jk   = bo_jk->BO - control->thb_cut;
-                k        = pbond_jk->nbr;
-                type_k   = atoms[k].type;
-                p_ijk    = &( thb_list[num_thb_intrs] );
-                //CHANGE ORIGINAL
-                if (BOA_jk <= 0) continue;
-                //CHANGE ORIGINAL
-                Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-                        pbond_jk->dvec, pbond_jk->d,
-                        &theta, &cos_theta );
-                Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-                        pbond_jk->dvec, pbond_jk->d, 
-                        &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-                        &(p_ijk->dcos_dk) );
-                p_ijk->thb = k;
-                p_ijk->pthb = pk;
-                p_ijk->theta = theta;
-                sin_theta = SIN( theta );
-                if( sin_theta < 1.0e-5 )
-                    sin_theta = 1.0e-5;
-                ++num_thb_intrs;
-                if( BOA_jk > 0.0 && 
-                        (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
-                    r_jk = pbond_jk->d;              
-                    thbh = &( d_thbp[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
-                    flag = 0;
-                    /* if( workspace->orig_id[i] < workspace->orig_id[k] )
-                       fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-                       workspace->orig_id[i], workspace->orig_id[j],
-                       workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-                       else 
-                       fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-                       workspace->orig_id[k], workspace->orig_id[j],
-                       workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
-                    //TODO:
-                    //pbond_jk->scratch = thbh->cnt;
-                    for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-                        // fprintf( out_control->eval, 
-                        // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
-                        if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
-                            thbp = &( thbh->prm[cnt] );
-                            /* ANGLE ENERGY */
-                            p_val1 = thbp->p_val1;
-                            p_val2 = thbp->p_val2;
-                            p_val4 = thbp->p_val4;
-                            p_val7 = thbp->p_val7;
-                            theta_00 = thbp->theta_00;
-                            exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-                            f7_ij = 1.0 - exp3ij;
-                            Cf7ij = p_val3 * p_val4 * 
-                                POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-                            exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-                            f7_jk = 1.0 - exp3jk;
-                            Cf7jk = p_val3 * p_val4 * 
-                                POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-                            expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-                            trm8 = 1.0 + expval6 + expval7;
-                            f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-                            Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-                                (p_val6 * expval6 * trm8 - 
-                                 (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
-                            theta_0 = 180.0 - 
-                                theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-                            theta_0 = DEG2RAD( theta_0 );              
-                            expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
-                            if( p_val1 >= 0 )
-                                expval12theta = p_val1 * (1.0 - expval2theta);
-                            else // To avoid linear Me-H-Me angles (6/6/06)
-                                expval12theta = p_val1 * -expval2theta;
-                            CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-                            CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-                            CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-                            CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-                                expval2theta * (theta_0 - theta);
-                            Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-                                exp( -p_val10 * (2.0 - SBO2) );
-                            CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-                            CEval6 = CEval5 * dSBO1;
-                            CEval7 = CEval5 * dSBO2;
-                            CEval8 = -CEval4 / sin_theta;
-                            e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
-                            //PERFORMANCE IMPACT
-                            //atomicAdd (&data->E_Ang, e_ang);
-                            E_Ang [j] += e_ang;
-                            /* END ANGLE ENERGY*/
-                            /* PENALTY ENERGY */
-                            p_pen1 = thbp->p_pen1;
-                            p_pen2 = g_params.l[19];
-                            p_pen3 = g_params.l[20];
-                            p_pen4 = g_params.l[21];
-                            exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-                            exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-                            exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-                            exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-                            trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-                            f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-                            Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
-                                    (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-                                        p_pen4 * exp_pen4 )) /
-                                SQR( trm_pen34 );
-                            e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-                            //PERFORMANCE IMPACT
-                            //atomicAdd (&data->E_Pen, e_pen);
-                            E_Pen [j] += e_pen;
-                            CEpen1 = e_pen * Cf9j / f9_Dj;
-                            temp   = -2.0 * p_pen2 * e_pen;
-                            CEpen2 = temp * (BOA_ij - 2.0);
-                            CEpen3 = temp * (BOA_jk - 2.0);
-                            /* END PENALTY ENERGY */
-                            /* COALITION ENERGY */
-                            p_coa1 = thbp->p_coa1;
-                            p_coa2 = g_params.l[2];
-                            p_coa3 = g_params.l[38];
-                            p_coa4 = g_params.l[30];
-                            exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-                            e_coa = 
-                                p_coa1 / (1. + exp_coa2) *
-                                EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
-                                EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
-                                EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-                                EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-                            //PERFORMANCE IMPACT
-                            //atomicAdd (&data->E_Coa, e_coa);
-                            E_Coa [j] += e_coa;
-                            CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-                            CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-                            CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
-                            CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
-                            CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
-                            /* END COALITION ENERGY */
-                            /* FORCES */
-                            /*
-                               atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) );
-                               atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) );
-                               atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) );
-                               atomicAdd (&workspace->CdDelta[i], CEcoa4 );
-                               atomicAdd (&workspace->CdDelta[k], CEcoa5 );              
-                             */
-                            bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ;
-                            bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ;
-                            workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ;
-                            //atomicAdd (&workspace->CdDelta[i], CEcoa4 );
-                            pbond_ij->CdDelta_ij += CEcoa4 ;
-                            //atomicAdd (&workspace->CdDelta[k], CEcoa5 );              
-                            pbond_jk->CdDelta_ij += CEcoa5;
-                            for( t = start_j; t < end_j; ++t ) {
-                                pbond_jt = &( bond_list[t] );
-                                bo_jt = &(pbond_jt->bo_data);
-                                temp_bo_jt = bo_jt->BO;
-                                temp = CUBE( temp_bo_jt );
-                                pBOjt7 = temp * temp * temp_bo_jt; 
-                                // fprintf( out_control->eval, "%6d%12.8f\n", 
-                                // workspace->orig_id[ bond_list[t].nbr ], 
-                                //    (CEval6 * pBOjt7) );
-                                /*
-                                   atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) );
-                                   atomicAdd (&bo_jt->Cdbopi, CEval5 );
-                                   atomicAdd (&bo_jt->Cdbopi2, CEval5 );
-                                 */
-                                bo_jt->Cdbo        += (CEval6 * pBOjt7) ;
-                                bo_jt->Cdbopi    += CEval5 ;
-                                bo_jt->Cdbopi2    += CEval5 ;
-                            }              
-                            if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                                /*
-                                   atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di );
-                                   atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                   atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk );
-                                 */
-                                rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di );
-                                rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk );
-                            }
-                            else {
-                                /* terms not related to bond order derivatives
-                                   are added directly into 
-                                   forces and pressure vector/tensor */
-                                rvec_Scale( force, CEval8, p_ijk->dcos_di );
-                                //atomic_rvecAdd( atoms[i].f, force );
-                                rvec_Add( pbond_ij->f, force );
-                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                                //atomic_rvecAdd( data->ext_press, ext_press );
-                                rvec_Add( aux_ext_press [j], ext_press );
-                                //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-                                rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-                                //atomic_rvecAdd( atoms[k].f, force );
-                                rvec_Add( pbond_jk->f, force );
-                                rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-                                //atomic_rvecAdd( data->ext_press, ext_press );
-                                rvec_Add( aux_ext_press [j], ext_press );
-                                /* This part is for a fully-flexible box */
-                                /* rvec_OuterProduct( temp_rtensor, 
-                                   p_ijk->dcos_di, system->atoms[i].x );
-                                   rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
-                                   rvec_OuterProduct( temp_rtensor, 
-                                   p_ijk->dcos_dj, system->atoms[j].x );
-                                   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-                                   rvec_OuterProduct( temp_rtensor, 
-                                   p_ijk->dcos_dk, system->atoms[k].x );
-                                   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-                                   if( pbond_ij->imaginary || pbond_jk->imaginary )
-                                   rtensor_ScaledAdd( data->flex_bar.P, 
-                                   -1.0, total_rtensor );
-                                   else
-                                   rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                            }
-                            //TODO -- check this
-                            //        fprintf( out_control->eval, 
-                            //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-                            //             "%6d%6d%6d%23.15e%23.15e%23.15e\n",
-                            //             i+1, j+1, k+1,
-                            //workspace->orig_id[i]+1,  
-                            //workspace->orig_id[j]+1,
-                            //workspace->orig_id[k]+1,
-                            //workspace->Delta_boc[j], 
-                            //             RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-                            //             e_ang, data->E_Ang );
-                            /*fprintf( out_control->eval, 
-                              "%23.15e%23.15e%23.15e%23.15e",
-                              p_val3, p_val4, BOA_ij, BOA_jk );
-                              fprintf( out_control->eval, 
-                              "%23.15e%23.15e%23.15e%23.15e",
-                              f7_ij, f7_jk, f8_Dj, expval12theta );
-                              fprintf( out_control->eval, 
-                              "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                              CEval1, CEval2, CEval3, CEval4, CEval5
-                            //CEval6, CEval7, CEval8  );*/
-                            /*fprintf( out_control->eval, 
-                              "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                              -p_ijk->dcos_di[0]/sin_theta, 
-                              -p_ijk->dcos_di[1]/sin_theta, 
-                              -p_ijk->dcos_di[2]/sin_theta, 
-                              -p_ijk->dcos_dj[0]/sin_theta, 
-                              -p_ijk->dcos_dj[1]/sin_theta, 
-                              -p_ijk->dcos_dj[2]/sin_theta, 
-                              -p_ijk->dcos_dk[0]/sin_theta, 
-                              -p_ijk->dcos_dk[1]/sin_theta, 
-                              -p_ijk->dcos_dk[2]/sin_theta );*/
-                            /* fprintf( out_control->epen, 
-                               "%23.15e%23.15e%23.15e\n", 
-                               CEpen1, CEpen2, CEpen3 );
-                               fprintf( out_control->epen, 
-                               "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                               workspace->orig_id[i],  workspace->orig_id[j],
-                               workspace->orig_id[k], RAD2DEG(theta), 
-                               BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
-                            //        fprintf( out_control->ecoa, 
-                            //             "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                            //             workspace->orig_id[i], 
-                            //             workspace->orig_id[j],
-                            //             workspace->orig_id[k], 
-                            //             RAD2DEG(theta), BOA_ij, BOA_jk, 
-                            //             e_coa, data->E_Coa );
-#ifdef TEST_FORCES            /* angle forces */
-                            //TODO -- check this
-                            /*
-                               Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-                               Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-                               Add_dDelta( system, lists, 
-                               j, CEval3 + CEval7, workspace->f_ang );
-                               for( t = start_j; t < end_j; ++t ) {
-                               pbond_jt = &( bond_list[t] );
-                               bo_jt = &(pbond_jt->bo_data);
-                               temp_bo_jt = bo_jt->BO;
-                               temp = CUBE( temp_bo_jt );
-                               pBOjt7 = temp * temp * temp_bo_jt; 
-                               Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-                               workspace->f_ang );
-                               Add_dBOpinpi2( system, lists, j, t, 
-                               CEval5, CEval5, 
-                               workspace->f_ang, workspace->f_ang );
-                               }
-                               rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-                               rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-                               rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-                            // end angle forces 
-                            // penalty forces 
-                            Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-                            Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-                            Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-                            // end penalty forces 
-                            // coalition forces 
-                            Add_dBO( system, lists, 
-                            j, pi, CEcoa1-CEcoa4, workspace->f_coa );
-                            Add_dBO( system, lists, 
-                            j, pk, CEcoa2-CEcoa5, workspace->f_coa );
-                            Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-                            Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-                            Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-                            // end coalition forces 
-                             */
-                        }
-                    }
-                }
-            }
-        }
-        Set_End_Index(pi, num_thb_intrs, thb_intrs );
-    }
-    //  } // end of the main for loop here
-    //TODO - to be done on the CPU
-    /*
-       if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
-       workspace->realloc.num_3body = num_thb_intrs;
-       if( num_thb_intrs > thb_intrs->num_intrs ) {
-       fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
-       data->step, num_thb_intrs, thb_intrs->num_intrs );
-       exit( INSUFFICIENT_SPACE );
-       }
-       }
-     */
-    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
-    // data->step, num_thb_intrs );
-    /*
-       fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
-       fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
-       data->E_Ang, data->E_Pen, data->E_Coa );
-       fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
-       data->ext_press[0], data->ext_press[1], data->ext_press[2] );
-     */
-GLOBAL void Three_Body_Interactions_results (     reax_atom *atoms, control_params *control,
-        static_storage p_workspace, 
-        list p_bonds, int N )
-    int i, pj;
-    bond_data *pbond;
-    bond_data *sym_index_bond;
-    list *bonds = &p_bonds;
-    static_storage *workspace = &p_workspace;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= N) return;
-    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
-        pbond = &(bonds->select.bond_list[pj]);
-        sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
-        workspace->CdDelta [i] += sym_index_bond->CdDelta_ij;
-        rvec_Add (atoms[i].f, sym_index_bond->f );
-    }
-// Three Body Estimation
-/* this is a 3-body interaction in which the main role is 
-   played by j which sits in the middle of the other two. */
-GLOBAL void Three_Body_Estimate ( reax_atom *atoms, 
-        control_params *control,
-        list p_bonds, int N, 
-        int *count)
-    int  i, j, pi, k, pk, t;
-    int  type_i, type_j, type_k;
-    int  start_j, end_j ;
-    int  flag, cnt, num_thb_intrs;
-    real r_ij, r_jk;
-    real BOA_ij, BOA_jk;
-    list *bonds;
-    bond_order_data *bo_ij, *bo_jk, *bo_jt;
-    bond_data *bond_list;
-    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= N) return;
-    bonds = &p_bonds;
-    bond_list = bonds->select.bond_list;
-    type_j = atoms[j].type;
-    start_j = Start_Index(j, bonds);
-    end_j = End_Index(j, bonds);
-    for( pi = start_j; pi < end_j; ++pi ) {
-        num_thb_intrs = 0;
-        count [pi] = 0;
-        pbond_ij = &(bond_list[pi]);
-        bo_ij = &(pbond_ij->bo_data);
-        BOA_ij = bo_ij->BO - control->thb_cut;
-        if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
-            i = pbond_ij->nbr;
-            r_ij = pbond_ij->d;     
-            type_i = atoms[i].type;
-            /*
-               for( pk = start_j; pk < pi; ++pk ) {
-               start_pk = Start_Index( pk, thb_intrs );
-               end_pk = End_Index( pk, thb_intrs );
-               for( t = start_pk; t < end_pk; ++t )
-               if( thb_list[t].thb == i ) {
-               ++num_thb_intrs;
-               break;
-               }
-               }
-             */
-            /* and this is the second for loop mentioned above */
-            for( pk = start_j; pk < end_j; ++pk ) {
-                if (pk == pi) continue;
-                pbond_jk = &(bond_list[pk]);
-                bo_jk    = &(pbond_jk->bo_data);
-                BOA_jk   = bo_jk->BO - control->thb_cut;
-                if (BOA_jk <= 0) continue;
-                ++num_thb_intrs;
-            }
-        }
-        count [pi] = num_thb_intrs;
-    }
-//End here
-void Hydrogen_Bonds( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
-    int i, j, k, pi, pk, itr, top;
-    int type_i, type_j, type_k;
-    int start_j, end_j, hb_start_j, hb_end_j;
-    int hblist[MAX_BONDS];
-    int num_hb_intrs = 0;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-    rvec dvec_jk, force, ext_press;
-    ivec rel_jk;
-    // rtensor temp_rtensor, total_rtensor;
-    hbond_parameters *hbp;
-    bond_order_data *bo_ij;
-    bond_data *pbond_ij;
-    far_neighbor_data *nbr_jk;
-    list *bonds, *hbonds;
-    bond_data *bond_list;
-    hbond_data *hbond_list;
-    bonds = (*lists) + BONDS;
-    bond_list = bonds->select.bond_list;
-    hbonds = (*lists) + HBONDS;
-    hbond_list = hbonds->select.hbond_list;
-    /* loops below discover the Hydrogen bonds between i-j-k triplets.
-       here j is H atom and there has to be some bond between i and j.
-       Hydrogen bond is between j and k.
-       so in this function i->X, j->H, k->Z when we map 
-       variables onto the ones in the handout.*/
-    for( j = 0; j < system->N; ++j )
-        if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
-            /*set j's variables */
-            type_j  = system->atoms[j].type;
-            start_j = Start_Index(j, bonds);
-            end_j   = End_Index(j, bonds);
-            hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-            hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-            top = 0;
-            for( pi = start_j; pi < end_j; ++pi ) {
-                pbond_ij = &( bond_list[pi] );
-                i = pbond_ij->nbr;
-                bo_ij = &(pbond_ij->bo_data);
-                type_i = system->atoms[i].type;
-                if( system->reaxprm.sbp[type_i].p_hbond == 2 && 
-                        bo_ij->BO >= HB_THRESHOLD )
-                    hblist[top++] = pi;
-            }
-            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-            //          j, top, hb_start_j, hb_end_j );
-            for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
-                /* set k's varibles */
-                k = hbond_list[pk].nbr;
-                type_k = system->atoms[k].type;
-                nbr_jk = hbond_list[pk].ptr;
-                r_jk = nbr_jk->d;
-                rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-                for( itr=0; itr < top; ++itr ) {
-                    pi = hblist[itr];
-                    pbond_ij = &( bond_list[pi] );
-                    i = pbond_ij->nbr;
-                    if( i != k ) {
-                        bo_ij = &(pbond_ij->bo_data);
-                        type_i = system->atoms[i].type;
-                        r_ij = pbond_ij->d;         
-                        hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, &system->reaxprm) ]);
-                        ++num_hb_intrs;
-                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &theta, &cos_theta );
-                        /* the derivative of cos(theta) */
-                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &dcos_theta_di, &dcos_theta_dj, 
-                                &dcos_theta_dk );
-                        /* hydrogen bond energy*/
-                        sin_theta2 = SIN( theta/2.0 );
-                        sin_xhz4 = SQR(sin_theta2);
-                        sin_xhz4 *= sin_xhz4;
-                        cos_xhz1 = ( 1.0 - cos_theta );
-                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-                                    r_jk / hbp->r0_hb - 2.0 ) );
-                        data->E_HB += e_hb = 
-                            hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-                                1.0 / hbp->r0_hb);
-                        /* hydrogen bond forces */
-                        bo_ij->Cdbo += CEhb1;   // dbo term
-                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-                            rvec_ScaledAdd( system->atoms[i].f, 
-                                    +CEhb2, dcos_theta_di ); //dcos terms
-                            rvec_ScaledAdd( system->atoms[j].f, 
-                                    +CEhb2, dcos_theta_dj );
-                            //TODO
-                            rvec_ScaledAdd( system->atoms[k].f, 
-                                    +CEhb2, dcos_theta_dk );
-                            //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                            //TODO
-                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
-                        }
-                        else
-                        {
-                            /* for pressure coupling, terms that are not related 
-                               to bond order derivatives are added directly into 
-                               pressure vector/tensor */
-                            rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-                            rvec_Add( system->atoms[i].f, force );
-                            rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-                            rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj );
-                            ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-                            rvec_Scale( force, +CEhb2, dcos_theta_dk );
-                            //TODO
-                            rvec_Add( system->atoms[k].f, force );
-                            rvec_iMultiply( ext_press, rel_jk, force );
-                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-                            //dr terms
-                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                            rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-                            rvec_Add( system->atoms[k].f, force );
-                            rvec_iMultiply( ext_press, rel_jk, force );
-                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-                            /* This part is intended for a fully-flexible box */
-                            /* rvec_OuterProduct( temp_rtensor, 
-                               dcos_theta_di, system->atoms[i].x );
-                               rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
-                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
-                               -CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor, 
-                               temp_rvec, system->atoms[j].x );
-                               rtensor_Add( total_rtensor, temp_rtensor );
-                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
-                               +CEhb3/r_jk, pbond_jk->dvec );
-                               rvec_OuterProduct( temp_rtensor, 
-                               temp_rvec, system->atoms[k].x );
-                               rtensor_Add( total_rtensor, temp_rtensor );
-                               if( pbond_ij->imaginary || pbond_jk->imaginary )
-                               rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-                               else
-                               rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                        }
-                        /*fprintf( out_control->ehb, 
-                          "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
-                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
-                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
-                          dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
-                          fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
-                          CEhb1, CEhb2, CEhb3 ); */
-                        fprintf( stderr, //out_control->ehb, 
-                                "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-                                workspace->orig_id[i], 
-                                workspace->orig_id[j], 
-                                workspace->orig_id[k], 
-                                r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
-                        // dbo term
-                        Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
-                        // dcos terms
-                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); 
-                        rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
-                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
-                        // dr terms
-                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk );
-                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
-                    }
-                }
-            }
-        }
-    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", 
-       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
-    fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs );
-    fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB );
-// Cuda Function
-GLOBAL void Hydrogen_Bonds (    reax_atom *atoms,
-        single_body_parameters *sbp,
-        hbond_parameters *d_hbp,
-        control_params *control,
-        simulation_data *data,
-        static_storage p_workspace, 
-        list p_bonds, list p_hbonds,
-        int N, int num_atom_types, 
-        real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
-    extern __shared__ real t_hb[];
-    extern __shared__ real t_f[];
-    //extern __shared__ rvec t_cdbo[];
-    //extern __shared__ rvec t_hf [];
-    real *sh_hb = t_hb;
-    rvec *sh_atomf = (rvec *)(t_hb + blockDim.x);
-    //real *sh_cdbo = t_hb + blockDim.x;
-    //rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
-    int i, j, k, pi, pk, itr, top;
-    int type_i, type_j, type_k;
-    int start_j, end_j, hb_start_j, hb_end_j;
-    int hblist[MAX_BONDS];
-    int num_hb_intrs = 0;
-    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-    rvec dvec_jk, force, ext_press;
-    ivec rel_jk;
-    // rtensor temp_rtensor, total_rtensor;
-    hbond_parameters *hbp;
-    bond_order_data *bo_ij;
-    bond_data *pbond_ij;
-    far_neighbor_data *nbr_jk;
-    list *bonds, *hbonds;
-    bond_data *bond_list;
-    hbond_data *hbond_list, *hbond_jk;
-    static_storage *workspace = &p_workspace;
-    j = blockIdx.x * blockDim.x + threadIdx.x;
-    if (j >= N) return;
-    //j = blockIdx.x;
-    bonds = &p_bonds;
-    bond_list = bonds->select.bond_list;
-    hbonds = &p_hbonds;
-    hbond_list = hbonds->select.hbond_list;
-    // loops below discover the Hydrogen bonds between i-j-k triplets.
-    // here j is H atom and there has to be some bond between i and j.
-    // Hydrogen bond is between j and k.
-    // so in this function i->X, j->H, k->Z when we map 
-    // variables onto the ones in the handout.
-    //for( j = 0; j < system->N; ++j )
-    sh_hb [threadIdx.x] = 0;
-    rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-    if( sbp[atoms[j].type].p_hbond==1) {// j must be H
-        //set j's variables 
-        type_j  = atoms[j].type;
-        start_j = Start_Index(j, bonds);
-        end_j   = End_Index(j, bonds);
-        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-        top = 0;
-        for( pi = start_j; pi < end_j; ++pi ) {
-            pbond_ij = &( bond_list[pi] );
-            i = pbond_ij->nbr;
-            bo_ij = &(pbond_ij->bo_data);
-            type_i = atoms[i].type;
-            if( sbp[type_i].p_hbond == 2 && 
-                    bo_ij->BO >= HB_THRESHOLD )
-                hblist[top++] = pi;
-        }
-        // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-        //          j, top, hb_start_j, hb_end_j );
-        for( pk = hb_start_j; pk < hb_end_j; ++pk )
-            //pk = hb_start_j + threadIdx.x;
-            //while (pk < hb_end_j)
-        {
-            // set k's varibles 
-            //TODO
-            hbond_jk = &( hbond_list[pk] );
-            //TODO
-            k = hbond_list[pk].nbr;
-            type_k = atoms[k].type;
-            nbr_jk = hbond_list[pk].ptr;
-            r_jk = nbr_jk->d;
-            rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-            //TODO Double check this Hydrogen Bonds fix
-            //rvec_MakeZero ( nbr_jk->h_f );
-            rvec_MakeZero ( hbond_jk->h_f );
-            //TODO Double check this Hydrogen Bonds fix
-            //sh_hb [threadIdx.x] = 0;
-            //itr = threadIdx.x;
-            for( itr=0; itr < top; ++itr ) {
-                //while (itr < top) {
-                pi = hblist[itr];
-                pbond_ij = &( bond_list[pi] );
-                i = pbond_ij->nbr;
-                //TODO
-                //rvec_MakeZero (sh_hf [threadIdx.x]);
-                //sh_cdbo [threadIdx.x] = 0;
-                //rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-                if( i != k ) {
-                    bo_ij = &(pbond_ij->bo_data);
-                    type_i = atoms[i].type;
-                    r_ij = pbond_ij->d;         
-                    hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
-                    ++num_hb_intrs;
-                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                            &theta, &cos_theta );
-                    // the derivative of cos(theta)
-                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                            &dcos_theta_di, &dcos_theta_dj, 
-                            &dcos_theta_dk );
-                    // hydrogen bond energy
-                    sin_theta2 = SIN( theta/2.0 );
-                    sin_xhz4 = SQR(sin_theta2);
-                    sin_xhz4 *= sin_xhz4;
-                    cos_xhz1 = ( 1.0 - cos_theta );
-                    exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-                    exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-                                r_jk / hbp->r0_hb - 2.0 ) );
-                    //PERFORMANCE IMPACT
-                    e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                    //atomicAdd ( &data->E_HB, e_hb );
-                    //E_HB [j] += e_hb;
-                    sh_hb [threadIdx.x] += e_hb;
-                    CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-                    CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-                    CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-                            1.0 / hbp->r0_hb);
-                    //this is the problem here
-                    //TODO
-                    // hydrogen bond forces
-                    bo_ij->Cdbo += CEhb1;   // dbo term
-                    //sh_cdbo[threadIdx.x] += CEhb1;
-                    //TODO
-                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                        //PERFORMANCE IMPACT
-                        /*
-                           atomic_rvecScaledAdd( atoms[i].f, 
-                           +CEhb2, dcos_theta_di ); //dcos terms
-                           atomic_rvecScaledAdd( atoms[j].f, 
-                           +CEhb2, dcos_theta_dj );
-                           atomic_rvecScaledAdd( atoms[k].f, 
-                           +CEhb2, dcos_theta_dk );
-                        //dr terms
-                        atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                        atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk );
-                         */
-                        //PERFORMANCE IMPACT
-                        rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms
-                        //rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms
-                        //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-                        rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
-                        //TODO you forgot here
-                        //TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
-                        rvec_ScaledAdd( hbond_jk->h_f, 
-                                +CEhb2, dcos_theta_dk );
-                        //rvec_ScaledAdd( nbr_jk->h_f, 
-                        //     +CEhb2, dcos_theta_dk );
-                        //dr terms
-                        //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                        rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
-                        //atoms_f [j] ++;
-                        //TODO you forgot 
-                        rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
-                        //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk );
-                    }
-                    else
-                    {
-                        // for pressure coupling, terms that are not related 
-                        // to bond order derivatives are added directly into 
-                        // pressure vector/tensor 
-                        rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-                        rvec_Add( pbond_ij->h_f, force );
-                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-                        //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press );
-                        rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-                        ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-                        rvec_Scale( force, +CEhb2, dcos_theta_dk );
-                        //rvec_Add( nbr_jk->h_f, force );
-                        rvec_Add( hbond_jk->h_f, force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-                        //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-                        //dr terms
-                        rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                        rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-                        rvec_Add( hbond_jk->h_f, force );
-                        rvec_iMultiply( ext_press, rel_jk, force );
-                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-                        //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-                    }
-                    //do the reduction for the bond_ij here
-                    /*
-                       if (threadIdx.x < 16){
-                       sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
-                       rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
-                       sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-                       rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-                       }
-                       if (threadIdx.x < 8){ 
-                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
-                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
-                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-                    }
-                    if (threadIdx.x < 4){
-                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
-                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
-                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-                    }
-                    if (threadIdx.x < 2){
-                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
-                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
-                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-                    }
-                    if (threadIdx.x < 1){
-                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
-                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-                    }
-                    if (threadIdx.x == 0){
-                    //bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-                    //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
-                    E_HB [j] += sh_hb [threadIdx.x];
-                    //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-                    }
-                     */
-                } // i != k if statement
-                //itr += blockDim.x;
-            } //itr for statement
-            /*
-               __syncthreads ();
-               for (int x = 1; x < blockDim.x; x++)
-               sh_hb [0] += sh_hb [x];    
-               E_HB [j] += sh_hb[0];
-               if (threadIdx.x < 16) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-               if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-               if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-               if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-               if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-               if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x];
-             */
-            //pk += blockDim.x;
-            }  // pk for statement
-        } // main if statment
-        //do the reduction for the bond_ij here
-        /*
-           if (threadIdx.x < 16){
-           sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-        }
-        if (threadIdx.x < 8){ 
-        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-        }
-        if (threadIdx.x < 4){
-        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-        }
-        if (threadIdx.x < 2){
-        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-        }
-        if (threadIdx.x < 1){
-        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-        }
-        if (threadIdx.x == 0){
-        E_HB [j] += sh_hb [threadIdx.x];
-        //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-        }
-         */
-        E_HB [j]  += sh_hb [threadIdx.x];
-        rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-        //rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
-    }
-    DEVICE void warpReduce(volatile real* sdata, int tid) 
-    {
-        if (tid < 16) sdata[tid] += sdata[tid + 16]; 
-        if (tid < 8) sdata[tid] += sdata[tid + 8]; 
-        if (tid < 4) sdata[tid] += sdata[tid + 4]; 
-        if (tid < 2) sdata[tid] += sdata[tid + 2]; 
-        if (tid < 1) sdata[tid] += sdata[tid + 1]; 
-    }
-    GLOBAL void Hydrogen_Bonds_HB (    reax_atom *atoms,
-            single_body_parameters *sbp,
-            hbond_parameters *d_hbp,
-            control_params *control,
-            simulation_data *data,
-            static_storage p_workspace, 
-            list p_bonds, list p_hbonds,
-            int N, int num_atom_types, 
-            real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
-    {
-        extern __shared__ real t_hb[];
-        extern __shared__ rvec t__f[];
-        extern __shared__ rvec t_cdbo[];
-        extern __shared__ rvec t_hf [];
-        real *sh_hb = t_hb;
-        real *sh_cdbo = t_hb + blockDim.x;
-        rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
-        rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
-        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-        int warp_id = thread_id / __THREADS_PER_ATOM__;
-        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-        int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-        if (warp_id >= N ) return;
-        int i, j, k, pi, pk, itr, top;
-        int type_i, type_j, type_k;
-        int start_j, end_j, hb_start_j, hb_end_j;
-        int hblist[MAX_BONDS];
-        int num_hb_intrs = 0;
-        real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-        real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-        rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-        rvec dvec_jk, force, ext_press;
-        ivec rel_jk;
-        // rtensor temp_rtensor, total_rtensor;
-        hbond_parameters *hbp;
-        bond_order_data *bo_ij;
-        bond_data *pbond_ij;
-        far_neighbor_data *nbr_jk;
-        list *bonds, *hbonds;
-        bond_data *bond_list;
-        hbond_data *hbond_list, *hbond_jk;
-        static_storage *workspace = &p_workspace;
-        /*
-           j = blockIdx.x * blockDim.x + threadIdx.x;
-           if (j >= N) return;
-         */
-        //     j = blockIdx.x;
-        j = warp_id;
-        bonds = &p_bonds;
-        bond_list = bonds->select.bond_list;
-        hbonds = &p_hbonds;
-        hbond_list = hbonds->select.hbond_list;
-        // loops below discover the Hydrogen bonds between i-j-k triplets.
-        // here j is H atom and there has to be some bond between i and j.
-        // Hydrogen bond is between j and k.
-        // so in this function i->X, j->H, k->Z when we map 
-        // variables onto the ones in the handout.
-        //for( j = 0; j < system->N; ++j )
-        sh_hb [threadIdx.x] = 0;
-        rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-        if( sbp[atoms[j].type].p_hbond==1) {// j must be H
-            //set j's variables 
-            type_j  = atoms[j].type;
-            start_j = Start_Index(j, bonds);
-            end_j   = End_Index(j, bonds);
-            hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-            hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-            top = 0;
-            for( pi = start_j; pi < end_j; ++pi ) {
-                pbond_ij = &( bond_list[pi] );
-                i = pbond_ij->nbr;
-                bo_ij = &(pbond_ij->bo_data);
-                type_i = atoms[i].type;
-                if( sbp[type_i].p_hbond == 2 && 
-                        bo_ij->BO >= HB_THRESHOLD ) {
-                    hblist[top++] = pi;
-                }
-            }
-            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-            //          j, top, hb_start_j, hb_end_j );
-            for( itr=0; itr < top; ++itr ) {
-                pi = hblist[itr];
-                pbond_ij = &( bond_list[pi] );
-                i = pbond_ij->nbr;
-                //TODO
-                rvec_MakeZero (sh_hf [threadIdx.x]);
-                sh_cdbo [threadIdx.x] = 0;
-                //for( pk = hb_start_j; pk < hb_end_j; ++pk )
-                int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 0 : 1);
-                int count = 0;
-                //jpk = hb_start_j + threadIdx.x;
-                pk = hb_start_j + lane_id;
-                //while (pk < hb_end_j)
-                while (count < loopcount)
-                {
-                    if (pk < hb_end_j)
-                    {
-                        // set k's varibles 
-                        //TODO
-                        hbond_jk = &( hbond_list[pk] );
-                        //TODO
-                        k = hbond_list[pk].nbr;
-                        type_k = atoms[k].type;
-                        nbr_jk = hbond_list[pk].ptr;
-                        r_jk = nbr_jk->d;
-                        rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-                    }
-                    else k = -1;
-                    //TODO Double check this Hydrogen Bonds fix
-                    //rvec_MakeZero ( nbr_jk->h_f );
-                    //rvec_MakeZero ( hbond_jk->h_f );
-                    //TODO Double check this Hydrogen Bonds fix
-                    //sh_hb [threadIdx.x] = 0;
-                    //rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-                    //__syncthreads ();
-                    if(( i != k ) && (k != -1)) {
-                        bo_ij = &(pbond_ij->bo_data);
-                        type_i = atoms[i].type;
-                        r_ij = pbond_ij->d;         
-                        hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
-                        ++num_hb_intrs;
-                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &theta, &cos_theta );
-                        // the derivative of cos(theta)
-                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-                                &dcos_theta_di, &dcos_theta_dj, 
-                                &dcos_theta_dk );
-                        // hydrogen bond energy
-                        sin_theta2 = SIN( theta/2.0 );
-                        sin_xhz4 = SQR(sin_theta2);
-                        sin_xhz4 *= sin_xhz4;
-                        cos_xhz1 = ( 1.0 - cos_theta );
-                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-                                    r_jk / hbp->r0_hb - 2.0 ) );
-                        //PERFORMANCE IMPACT
-                        e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-                        //atomicAdd ( &data->E_HB, e_hb );
-                        //E_HB [j] += e_hb;
-                        sh_hb [threadIdx.x] += e_hb;
-                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-                                1.0 / hbp->r0_hb);
-                        //this is the problem here
-                        //TODO
-                        // hydrogen bond forces
-                        //bo_ij->Cdbo += CEhb1;   // dbo term
-                        sh_cdbo[threadIdx.x] += CEhb1;
-                        //TODO
-                        //warpReduce (sh_cdbo, threadIdx.x);
-                        //if (threadIdx.x == 0)
-                        //    bo_ij->Cdbo += sh_cdbo [0];
-                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-                            //PERFORMANCE IMPACT
-                            /*
-                               atomic_rvecScaledAdd( atoms[i].f, 
-                               +CEhb2, dcos_theta_di ); //dcos terms
-                               atomic_rvecScaledAdd( atoms[j].f, 
-                               +CEhb2, dcos_theta_dj );
-                               atomic_rvecScaledAdd( atoms[k].f, 
-                               +CEhb2, dcos_theta_dk );
-                            //dr terms
-                            atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                            atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk );
-                             */
-                            //PERFORMANCE IMPACT
-                            //rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms
-                            rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms
-                            //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-                            rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
-                            //TODO you forgot here
-                            //TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
-                            rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk );
-                            //rvec_ScaledAdd( nbr_jk->h_f, 
-                            //     +CEhb2, dcos_theta_dk );
-                            //dr terms
-                            //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                            rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
-                            //TODO you forgot 
-                            rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
-                            //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk );
-                        }
-                        else
-                        {
-                            // for pressure coupling, terms that are not related 
-                            // to bond order derivatives are added directly into 
-                            // pressure vector/tensor 
-                            //rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-                            //rvec_Add( pbond_ij->h_f, force );
-                            //rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-                            //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-                            //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press );
-                            //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-                            //ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-                            //rvec_Scale( force, +CEhb2, dcos_theta_dk );
-                            //rvec_Add( nbr_jk->h_f, force );
-                            //rvec_Add( hbond_jk->h_f, force );
-                            //rvec_iMultiply( ext_press, rel_jk, force );
-                            //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-                            //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-                            //dr terms
-                            //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-                            //rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-                            //rvec_Add( hbond_jk->h_f, force );
-                            //rvec_iMultiply( ext_press, rel_jk, force );
-                            //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-                            //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-                        }
-                    } // i != k if statement
-                    pk += __THREADS_PER_ATOM__;
-                    count ++;
-                }  // pk for statement
-                //__syncthreads ();
-                //at this point done with one bond....
-                //do the reduction now
-                //if (threadIdx.x == 0){
-                if (lane_id < 16) {
-                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
-                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
-                }
-                if (lane_id < 8) {
-                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
-                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
-                }
-                if (lane_id < 4) {
-                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
-                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
-                }
-                if (lane_id < 2) {
-                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
-                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
-                }
-                if (lane_id < 1) {
-                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
-                    bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-                    rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
-                }
-                /*
-                   if (lane_id == 0){
-                   for (i = 1; i < 32; i++)
-                   {
-                //sh_cdbo [threadIdx.x] += sh_cdbo [i];
-                //rvec_Add (sh_hf [threadIdx.x], sh_hf [i]);
-                sh_cdbo [lane_id] += sh_cdbo [lane_id + i];
-                rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]);
-                }
-                //bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-                //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
-                bo_ij->Cdbo += sh_cdbo [lane_id];
-                rvec_Add (pbond_ij->h_f, sh_hf [lane_id]);
-                }
-                 */
-            } //itr for statement
-            //__syncthreads ();
-            } // main if statment
-            //__syncthreads ();
-            //do the reduction for the bond_ij here
-            if (lane_id < 16){
-                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-            }
-            if (lane_id < 8){ 
-                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-            }
-            if (lane_id < 4){
-                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-            }
-            if (lane_id < 2){
-                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-            }
-            if (lane_id < 1){
-                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-                E_HB [j] += sh_hb [threadIdx.x];
-                rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-            }
-            /*
-               if (lane == 0){
-            //E_HB [j] += sh_hb [threadIdx.x];
-            rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-            rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
-            }
-             */
-            //if (threadIdx.x == 0){
-            /*
-               if (lane_id == 0){
-               for (i = 1; i < 32; i++)
-               {
-            //sh_hb [threadIdx.x] += sh_hb [i];
-            //rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]);
-            sh_hb [lane_id] += sh_hb [lane_id + i];
-            rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]);
-            }
-            //E_HB [j] += sh_hb [threadIdx.x];
-            //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-            E_HB [j] += sh_hb [lane_id];
-            rvec_Add (atoms[j].f, sh_atomf [lane_id]);
-            //rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]);
-            }
-             */
-            //E_HB [j]  += sh_hb [threadIdx.x];
-            //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-        }
-        GLOBAL void Hydrogen_Bonds_Postprocess (     reax_atom *atoms, 
-                single_body_parameters *sbp,
-                static_storage p_workspace,
-                list p_bonds, list p_hbonds, list p_far_nbrs, int N, 
-                real *e_hb)
-        {
-            int i, pj, hj, nbr, k, j;
-            int start, end;
-            bond_data *pbond;
-            bond_data *sym_index_bond;
-            far_neighbor_data *nbr_pj, *sym_index_nbr;
-            list *bonds = &p_bonds;
-            list *far_nbrs = &p_far_nbrs;
-            i = blockIdx.x * blockDim.x + threadIdx.x;
-            if ( i >= N) return;
-            // For processing ij information
-            start = Start_Index(i, bonds);
-            end = End_Index(i, bonds); 
-            //rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f);
-            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
-                pbond = &(bonds->select.bond_list[pj]);
-                sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
-                rvec_Add (atoms[i].f, sym_index_bond->h_f );
-            }
-            /*
-               for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++)
-               {
-            // check if the neighbor is of h_type
-            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-            j = nbr_pj->nbr;
-            sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
-            rvec_Add (atoms[i].f, sym_index_nbr->h_f );
-            }
-             */
-            //    if (workspace->hbond_index [j] != -1)
-            //    {
-            //        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-            //        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-            //        for ( hj = hb_start_j; hj < hb_end_j; hj ++ )
-            //        {
-            //            h_bond_data = &( hbonds->select.hbond_list [hj] );
-            //             nbr = h_bond_data->nbr;
-            //            if (nbr == i) {
-            //                     rvec_Add (atoms[i].f, h_bond_data->h_f );
-            //            }
-            //        }
-            //    }
-        }
-        GLOBAL void Hydrogen_Bonds_Far_Nbrs (     reax_atom *atoms, 
-                single_body_parameters *sbp,
-                static_storage p_workspace,
-                list p_bonds, list p_hbonds, list p_far_nbrs, int N )
-        {
-            extern __shared__ rvec __f[];
-            int i, pj,j;
-            int start, end;
-            far_neighbor_data *nbr_pj, *sym_index_nbr;
-            list *far_nbrs = &p_far_nbrs;
-            i = blockIdx.x;
-            start = Start_Index (i, far_nbrs);
-            end = End_Index (i, far_nbrs);
-            pj = start + threadIdx.x;
-            rvec_MakeZero (__f[threadIdx.x]);
-            while (pj < end)
-            {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j = nbr_pj->nbr;
-                //sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
-                //
-                //rvec_Add (atoms[i].f, sym_index_nbr->h_f );
-                //
-                //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
-                pj += blockDim.x;
-            }
-            if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-            if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-            if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-            if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-            if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-            if (threadIdx.x == 0)
-                rvec_Add (atoms[i].f, __f[0]);
-        }
-        GLOBAL void Hydrogen_Bonds_HNbrs (     reax_atom *atoms, 
-                single_body_parameters *sbp,
-                static_storage p_workspace,
-                list p_bonds, list p_hbonds, list p_far_nbrs, int N )
-        {
-            extern __shared__ rvec __f[];
-            int i, pj,j;
-            int start, end;
-            hbond_data *nbr_pj, *sym_index_nbr;
-            list *hbonds = &p_hbonds;
-            i = blockIdx.x;
-            start = Start_Index (i, hbonds);
-            end = End_Index (i, hbonds);
-            pj = start + threadIdx.x;
-            rvec_MakeZero (__f[threadIdx.x]);
-            while (pj < end)
-            {
-                nbr_pj = &( hbonds->select.hbond_list[pj] );
-                j = nbr_pj->nbr;
-                sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
-                rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
-                pj += blockDim.x;
-            }
-            if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-            if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-            if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-            if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-            if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-            if (threadIdx.x == 0)
-                rvec_Add (atoms[i].f, __f[0]);
-        }
diff --git a/PuReMD-GPU/src/three_body_interactions.h b/PuReMD-GPU/src/three_body_interactions.h
index 2aa0d4434a7001793a5065a82e238b6d349f49d7..dcbadb0697f951b11b98d53488bd466a633a3fb6 100644
--- a/PuReMD-GPU/src/three_body_interactions.h
+++ b/PuReMD-GPU/src/three_body_interactions.h
@@ -23,52 +23,15 @@
 #include "mytypes.h"
 void Three_Body_Interactions( reax_system*, control_params*, simulation_data*,
-                              static_storage*, list**, output_controls* );
+        static_storage*, list**, output_controls* );
 void Hydrogen_Bonds( reax_system*, control_params*, simulation_data*,
-                     static_storage*, list**, output_controls* );
-//CUDA Functions.
-HOST_DEVICE void Calculate_Theta( rvec, real, rvec, real, real*, real* );
-HOST_DEVICE void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
-GLOBAL void Three_Body_Interactions( reax_atom *, single_body_parameters *, three_body_header *,
-                                     global_parameters , control_params *, simulation_data *,
-                                     static_storage ,
-                                     list , list , int , int , real *, real *, real *, rvec *);
-GLOBAL void Three_Body_Interactions_results (  reax_atom *,
-        control_params *,
-        static_storage ,
-        list , int );
+        static_storage*, list**, output_controls* );
-GLOBAL void Three_Body_Estimate ( reax_atom *atoms,
-                                  control_params *control,
-                                  list p_bonds, int N,
-                                  int *count);
+void Calculate_Theta( rvec, real, rvec, real, real*, real* );
-GLOBAL void Hydrogen_Bonds (  reax_atom *,
-                              single_body_parameters *, hbond_parameters *,
-                              control_params *, simulation_data *, static_storage ,
-                              list , list , int , int, real *, rvec *, rvec *);
-GLOBAL void Hydrogen_Bonds_HB (  reax_atom *,
-                                 single_body_parameters *, hbond_parameters *,
-                                 control_params *, simulation_data *, static_storage ,
-                                 list , list , int , int, real *, rvec *, rvec *);
+void Calculate_dCos_Theta( rvec, real, rvec, real, rvec*, rvec*, rvec* );
-GLOBAL void Hydrogen_Bonds_Postprocess (  reax_atom *,
-        single_body_parameters *,
-        static_storage , list,
-        list , list , int, real * );
-GLOBAL void Hydrogen_Bonds_Far_Nbrs (  reax_atom *,
-                                       single_body_parameters *,
-                                       static_storage , list,
-                                       list , list , int );
-GLOBAL void Hydrogen_Bonds_HNbrs (  reax_atom *,
-                                    single_body_parameters *,
-                                    static_storage , list,
-                                    list , list , int );
diff --git a/PuReMD-GPU/src/traj.cu b/PuReMD-GPU/src/traj.c
similarity index 98%
rename from PuReMD-GPU/src/traj.cu
rename to PuReMD-GPU/src/traj.c
index 97496e7f7b8dc4e9add824ee198d0c2907180a98..2844c370ee79702ed0c75d090afe545149aae185 100644
--- a/PuReMD-GPU/src/traj.cu
+++ b/PuReMD-GPU/src/traj.c
@@ -19,13 +19,17 @@
 #include "traj.h"
 #include "list.h"
-#include "cuda_copy.h"
+  #include "cuda_copy.h"
 /*      CUSTOM FORMAT ROUTINES                  */
 int Write_Custom_Header(reax_system *system, control_params *control, 
         static_storage *workspace, output_controls *out_control)
@@ -207,9 +211,9 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     if( write_bonds )
-#ifndef __PRINT_CPU_RESULTS__
         //fprintf (stderr, "Synching bonds from device for printing ....\n");
-        Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+        Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
         for( i = 0; i < system->N; ++i )
@@ -239,12 +243,12 @@ int Append_Custom_Frame( reax_system *system, control_params *control,
     num_thb_intrs = 0;
     if( write_angles ) {
-#ifndef __PRINT_CPU_RESULTS__
         //fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
-        Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
+        Sync_Host_Device_List( thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
         if ( !write_bonds) {
             //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n");
-            Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+            Sync_Host_Device_List( bonds, (dev_lists + BONDS), TYP_BOND );
diff --git a/PuReMD-GPU/src/traj.h b/PuReMD-GPU/src/traj.h
index d8c1792d0f6941d6881939d559bd9003b286bfa1..35d92602eee7c2d0b5ee83889623df2cb2106c71 100644
--- a/PuReMD-GPU/src/traj.h
+++ b/PuReMD-GPU/src/traj.h
@@ -22,8 +22,8 @@
 #define __TRAJ_H__
 #include "mytypes.h"
-#include "zlib.h"
+#include <zlib.h>
 #define BLOCK_MARK_LEN 16
@@ -73,12 +73,14 @@
 #define SIZE_INFO_LINE3 "%-10d %-10d %-10d\n"
 #define SIZE_INFO_LEN3 33
                      OPT_ATOM_wV = 6, OPT_ATOM_FULL = 7
     int no_of_sub_blocks;
@@ -89,11 +91,11 @@ struct
 typedef struct __block block;
 int Write_Block( gzFile, block* );
 int Read_Next_Block( gzFile, block*, int* );
 int Skip_Next_Block( gzFile, int*);
   Format for trajectory file
@@ -141,8 +143,6 @@ int Skip_Next_Block( gzFile, int*);
   No. of torsion entries (int)
   Torsion info lines as per torsion format.
 int Write_Custom_Header( reax_system*, control_params*,
                          static_storage*, output_controls* );
 int Write_xyz_Header   ( reax_system*, control_params*,
diff --git a/PuReMD-GPU/src/two_body_interactions.c b/PuReMD-GPU/src/two_body_interactions.c
new file mode 100644
index 0000000000000000000000000000000000000000..2e7a6daf9039ea26c22b2fcfda5913e46255ad75
--- /dev/null
+++ b/PuReMD-GPU/src/two_body_interactions.c
@@ -0,0 +1,571 @@
+  PuReMD-GPU - Reax Force Field Simulator
+  Copyright (2014) Purdue University
+  Sudhir Kylasa, skylasa@purdue.edu
+  Hasan Metin Aktulga, haktulga@cs.purdue.edu
+  Ananth Y Grama, ayg@cs.purdue.edu
+  This program is free software; you can redistribute it and/or
+  modify it under the terms of the GNU General Public License as
+  published by the Free Software Foundation; either version 2 of 
+  the License, or (at your option) any later version.
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  See the GNU General Public License for more details:
+  <http://www.gnu.org/licenses/>.
+  ----------------------------------------------------------------------*/
+#include "two_body_interactions.h"
+#include "bond_orders.h"
+#include "list.h"
+#include "lookup.h"
+#include "vector.h"
+#include "index_utils.h"
+void Bond_Energy( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control ) /*
+  Bond_Energy: loops over atoms 0 .. system->N-1 and over each atom's slice
+  of the BONDS list, handling an entry only when i < nbr (presumably so that
+  a bond stored from both ends is handled once - TODO confirm).
+  For every handled bond it
+    - adds the bond energy ebond to data->E_BE,
+    - updates the coefficients bo_ij->Cdbo, bo_ij->Cdbopi, bo_ij->Cdbopi2,
+    - writes one record to out_control->ebond and calls Add_dBO and
+      Add_dBOpinpi2 with workspace->f_be,
+    - when the stabilisation test further down holds, adds estriph to
+      data->E_BE, updates bo_ij->Cdbo, workspace->CdDelta[i] and
+      workspace->CdDelta[j], writes a second record and calls Add_dBO and
+      Add_dDelta.
+  control is not referenced in this body.
+  */
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    real ebond, pow_BOs_be2, exp_be12, CEbo;
+    real gp3, gp4, gp7, gp10, gp37;
+    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+    real decobdbo, decobdboua, decobdboub;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij;
+    list *bonds;
+    bonds = (*lists) + BONDS;
+    gp3 = system->reaxprm.gp.l[3];
+    gp4 = system->reaxprm.gp.l[4];
+    gp7 = system->reaxprm.gp.l[7];
+    gp10 = system->reaxprm.gp.l[10];
+    gp37 = (int) system->reaxprm.gp.l[37]; /* truncated to an integer value, then stored back into a real */
+    for( i=0; i < system->N; ++i ) {
+        start_i = Start_Index(i, bonds);
+        end_i = End_Index(i, bonds);
+        //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
+        for( pj = start_i; pj < end_i; ++pj )
+            if( i < bonds->select.bond_list[pj].nbr ) { /* entries whose neighbour index is not larger than i are skipped */
+                /* set the pointers */
+                j = bonds->select.bond_list[pj].nbr;
+                type_i = system->atoms[i].type;
+                type_j = system->atoms[j].type;
+                sbp_i = &( system->reaxprm.sbp[type_i] );
+                sbp_j = &( system->reaxprm.sbp[type_j] );
+                twbp = &( system->reaxprm.tbp[ index_tbp(type_i,type_j,system->reaxprm.num_atom_types) ] );
+                bo_ij = &( bonds->select.bond_list[pj].bo_data );
+                /* calculate the constants */
+                pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
+                exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+                CEbo = -twbp->De_s * exp_be12 * 
+                    ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+                /* calculate the Bond Energy */
+                ebond = 
+                    -twbp->De_s * bo_ij->BO_s * exp_be12 
+                    -twbp->De_p * bo_ij->BO_pi 
+                    -twbp->De_pp * bo_ij->BO_pi2;
+                data->E_BE += ebond;
+                /* calculate derivatives of Bond Orders */
+                bo_ij->Cdbo += CEbo;
+                bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+                bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
+                        workspace->orig_id[i], workspace->orig_id[j], 
+                        // i+1, j+1, 
+                        bo_ij->BO, ebond/*, data->E_BE*/ );
+                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
+                   workspace->orig_id[i], workspace->orig_id[j], 
+                   CEbo, -twbp->De_p, -twbp->De_pp );*/
+                Add_dBO( system, lists, i, pj, CEbo, workspace->f_be ); /* same coefficients as the Cdbo / Cdbopi / Cdbopi2 updates above, here routed into workspace->f_be */
+                Add_dBOpinpi2( system, lists, i, pj, 
+                        -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
+                        workspace->f_be, workspace->f_be );
+                /* Stabilisation terminal triple bond */
+                if( bo_ij->BO >= 1.00 ) {
+                    if( gp37 == 2 ||
+                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || /* exact floating-point comparison of the masses; presumably selects one specific element pair - TODO confirm */
+                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                        // ba = SQR(bo_ij->BO - 2.50);
+                        exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                        //oboa=abo(j1)-boa;
+                        //obob=abo(j2)-boa;
+                        exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
+                        exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                        //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
+                        exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                        hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+                        estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                        //estrain(j1) = estrain(j1) + 0.50*estriph;
+                        //estrain(j2) = estrain(j2) + 0.50*estriph;
+                        data->E_BE += estriph; /* the stabilisation term is accumulated into the same total as ebond */
+                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
+                            ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+                        decobdboua = -gp10 * exphu * hulpov * 
+                            (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                        decobdboub = -gp10 * exphu * hulpov * 
+                            (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                        bo_ij->Cdbo += decobdbo;
+                        workspace->CdDelta[i] += decobdboua;
+                        workspace->CdDelta[j] += decobdboub;
+                        //loop_j ++;
+                        //fprintf (stderr, "incrementing loopj %d \n", loop_j);
+                        fprintf( out_control->ebond, 
+                                "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                                workspace->orig_id[i], workspace->orig_id[j],
+                                //i+1, j+1, 
+                                estriph, decobdbo, decobdboua, decobdboub );
+                        Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                        Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                        Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                    }
+                }
+            }
+    }
+void vdW_Coulomb_Energy( reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control ) /*
+  vdW_Coulomb_Energy: loops over atoms 0 .. system->N-1 and over each atom's
+  slice of the FAR_NBRS list, handling the entries with d <= control->r_cut.
+  For every handled pair it
+    - evaluates the taper polynomial Tap (coefficients control->Tap0..Tap7)
+      and the derivative factor dTap,
+    - evaluates the van der Waals term (with or without shielding, plus the
+      inner-wall term, selected by system->reaxprm.gp.vdw_type) and adds it
+      to data->E_vdW,
+    - evaluates the Coulomb term and adds it to data->E_Ele,
+    - applies -(CEvd+CEclmb)*dvec to atom i and +(CEvd+CEclmb)*dvec to atom j,
+      and for ensembles other than NVE / NVT / bNVT also accumulates
+      data->ext_press,
+    - writes one record each to out_control->evdw and out_control->ecou and
+      accumulates the per-term forces in workspace->f_vdw and workspace->f_ele.
+  */
+    int  i, j, pj;
+    int  start_i, end_i;
+    real self_coef;
+    real p_vdW1, p_vdW1i;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, r_ij, fn13, exp1, exp2;
+    real Tap, dTap, dfn13, CEvd, CEclmb;
+    real dr3gamij_1, dr3gamij_3;
+    real e_ele, e_vdW, e_core, de_core;
+    rvec temp, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs;
+    p_vdW1 = system->reaxprm.gp.l[28];
+    p_vdW1i = 1.0 / p_vdW1;
+    far_nbrs = (*lists) + FAR_NBRS; 
+    e_ele = 0;
+    e_vdW = 0;
+    e_core = 0;
+    de_core = 0;
+    for( i = 0; i < system->N; ++i ) {
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        // fprintf( stderr, "i: %d, start: %d, end: %d\n",
+        //     i, start_i, end_i );
+        for( pj = start_i; pj < end_i; ++pj )
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
+                r_ij = nbr_pj->d;
+                twbp = &(system->reaxprm.tbp[ index_tbp(system->atoms[i].type, system->atoms[j].type, system->reaxprm.num_atom_types) ]);
+                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+                /* Calculate Taper and its derivative */
+                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
+                Tap = control->Tap7 * r_ij + control->Tap6; /* Horner evaluation of the degree-7 polynomial in r_ij */
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;
+                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+                dTap = dTap * r_ij + 5*control->Tap5;
+                dTap = dTap * r_ij + 4*control->Tap4;
+                dTap = dTap * r_ij + 3*control->Tap3;
+                dTap = dTap * r_ij + 2*control->Tap2;
+                dTap += control->Tap1/r_ij; /* the recurrence stops at the Tap2 term and Tap1/r_ij is added, so dTap holds (dTap/dr)/r_ij, not dTap/dr */
+                /*vdWaals Calculations*/
+                if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
+                    // shielding
+                    powr_vdW1 = POW(r_ij, p_vdW1);
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    data->E_vdW += e_vdW = 
+                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+                        POW(r_ij, p_vdW1 - 2.0); /* equals (d fn13 / d r_ij) / r_ij, the same 1/r_ij scaling as dTap */
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) * dfn13 );
+                }
+                else{ // no shielding
+                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    data->E_vdW += e_vdW = 
+                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) ); /* NOTE(review): the shielded branch multiplies this term by dfn13, which reduces to 1/r_ij when fn13 == r_ij; no 1/r_ij factor appears here - confirm whether that is intended */
+                }
+                if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
+                    // inner wall
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_vdW += self_coef * Tap * e_core;
+                    data->E_vdW += self_coef * Tap * e_core;
+                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+                }
+                /*Coulomb Calculations*/
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); /* cube root of (r_ij^3 + gamma) */
+                tmp = Tap / dr3gamij_3;
+                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
+                data->E_Ele += e_ele = 
+                    self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+                CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
+                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
+                  ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    rvec_ScaledAdd( system->atoms[i].f, 
+                            -(CEvd+CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[j].f, 
+                            +(CEvd+CEclmb), nbr_pj->dvec );
+                }
+                else { // NPT, iNPT or sNPT
+                    /* for pressure coupling, terms not related to bond order 
+                       derivatives are added directly into pressure vector/tensor */
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                    rvec_Add( system->atoms[j].f, temp );
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_Add( data->ext_press, ext_press );
+                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", 
+                      i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
+                      fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
+                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",        
+                      data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+                    /* This part is intended for a fully-flexible box */          
+                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, 
+                       system->atoms[i].x );
+                       rtensor_Scale( total_rtensor, 
+                       F_C * -(CEvd + CEclmb), temp_rtensor );
+                       rvec_OuterProduct( temp_rtensor, 
+                       nbr_pj->dvec, system->atoms[j].x );
+                       rtensor_ScaledAdd( total_rtensor, 
+                       F_C * +(CEvd + CEclmb), temp_rtensor );
+                       if( nbr_pj->imaginary )
+                    // This is an external force due to an imaginary nbr
+                    rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                    else
+                    // This interaction is completely internal
+                    rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                }
+                rvec_MakeZero( temp ); /* temp is reused here to hold CEvd * dvec for the evdw record below */
+                rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
+                fprintf( out_control->evdw,
+                        "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                        //i+1, j+1,
+                        MIN( workspace->orig_id[i], workspace->orig_id[j] ), 
+                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
+                        r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
+                fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                        MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
+                        r_ij, system->atoms[i].q, system->atoms[j].q, 
+                        e_ele/*, data->E_Ele*/ );
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+            }
+    }
+    // fclose( fout );
+    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", 
+    // data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+void LR_vdW_Coulomb( reax_system *system, control_params *control, 
+        int i, int j, real r_ij, LR_data *lr ) /*
+  LR_vdW_Coulomb: evaluates the tapered van der Waals and Coulomb terms at
+  the single distance r_ij and stores them in lr (e_vdW, CEvd, H, e_ele,
+  CEclmb).  Nothing is accumulated into simulation totals and no charges or
+  self_coef factor are applied here.
+  i and j are passed straight to index_tbp to pick the two-body parameter
+  set; presumably they are atom type indices - TODO confirm against callers.
+  */
+    real p_vdW1 = system->reaxprm.gp.l[28];
+    real p_vdW1i = 1.0 / p_vdW1;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, fn13, exp1, exp2;
+    real Tap, dTap, dfn13;
+    real dr3gamij_1, dr3gamij_3;
+    real e_core, de_core;
+    two_body_parameters *twbp;
+    twbp = &(system->reaxprm.tbp[ index_tbp(i,j,system->reaxprm.num_atom_types) ]);
+    e_core = 0;
+    de_core = 0;
+    /* calculate taper and its derivative */
+    Tap = control->Tap7 * r_ij + control->Tap6; /* Horner evaluation of the degree-7 polynomial in r_ij */
+    Tap = Tap * r_ij + control->Tap5;
+    Tap = Tap * r_ij + control->Tap4;
+    Tap = Tap * r_ij + control->Tap3;
+    Tap = Tap * r_ij + control->Tap2;
+    Tap = Tap * r_ij + control->Tap1;
+    Tap = Tap * r_ij + control->Tap0;
+    dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+    dTap = dTap * r_ij + 5*control->Tap5;
+    dTap = dTap * r_ij + 4*control->Tap4;
+    dTap = dTap * r_ij + 3*control->Tap3;
+    dTap = dTap * r_ij + 2*control->Tap2;
+    dTap += control->Tap1/r_ij; /* the recurrence stops at the Tap2 term and Tap1/r_ij is added, so dTap holds (dTap/dr)/r_ij, not dTap/dr */
+    /* vdWaals calculations */
+    powr_vdW1 = POW(r_ij, p_vdW1); /* NOTE(review): from here to the lr->CEvd assignment below is an unconditional shielded evaluation; both branches of the vdw_type test that follows assign lr->e_vdW and lr->CEvd again, so these results look unused - confirm before removing */
+    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
+Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
+Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), 
+powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
+    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
+    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - 
+        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    /*vdWaals Calculations*/
+    if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
+    { // shielding
+        powr_vdW1 = POW(r_ij, p_vdW1);
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+            POW(r_ij, p_vdW1 - 2.0); /* equals (d fn13 / d r_ij) / r_ij, the same 1/r_ij scaling as dTap */
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    }
+    else{ // no shielding
+        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2); /* NOTE(review): the shielded branch multiplies this term by dfn13, which reduces to 1/r_ij when fn13 == r_ij; no 1/r_ij factor appears here - confirm whether that is intended */
+    }
+    if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
+    { // inner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+        lr->e_vdW += Tap * e_core;
+        de_core = -(twbp->acore/twbp->rcore) * e_core;
+        lr->CEvd += dTap * e_core + Tap * de_core;
+    }
+    /* Coulomb calculations */
+    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 ); /* cube root of (r_ij^3 + gamma) */
+    tmp = Tap / dr3gamij_3;
+    lr->H = EV_to_KCALpMOL * tmp; /* H and e_ele share the same tapered kernel and differ only in the constant prefactor */
+    lr->e_ele = C_ele * tmp;
+    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
+Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
+i, system->atoms[i].type, j, system->atoms[j].type, 
+twbp->gamma, Tap, dr3gamij_3, 
+system->atoms[i].q, system->atoms[j].q ); */
+    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
+       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
+       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
+       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
+void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
+        simulation_data *data, 
+        static_storage *workspace, list **lists, 
+        output_controls *out_control ) /*
+  Tabulated_vdW_Coulomb_Energy: table-driven counterpart of the pairwise
+  van der Waals / Coulomb evaluation.  Loops over atoms 0 .. system->N-1 and
+  over each atom's slice of the FAR_NBRS list, handling the entries with
+  d <= control->r_cut.  For every handled pair it
+    - selects the LR table entry for the (min, max) atom-type pair,
+    - evaluates cubic polynomials in dif = r_ij - base from the tabulated
+      coefficients a, b, c, d,
+    - adds the energies to data->E_vdW and data->E_Ele only when
+      update_energies is set (energy_update_freq > 0 and the step count
+      since data->prev_steps is a multiple of it),
+    - always applies the forces to system->atoms[i].f / [j].f, and for
+      ensembles other than NVE / NVT / bNVT also accumulates data->ext_press,
+    - writes one record each to out_control->evdw and out_control->ecou and
+      accumulates workspace->f_vdw and workspace->f_ele.
+  */
+    int i, j, pj, r, steps, update_freq, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real e_vdW = 0.0, e_ele = 0.0; /* zero-initialised: both are written to evdw / ecou below even on steps where update_energies is 0 and they are never assigned */
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs = (*lists) + FAR_NBRS;
+    LR_lookup_table *t;
+    steps = data->step - data->prev_steps;
+    update_freq = out_control->energy_update_freq;
+    update_energies = update_freq > 0 && steps % update_freq == 0;
+    for( i = 0; i < system->N; ++i ) {
+        type_i  = system->atoms[i].type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i   = End_Index(i,far_nbrs);
+        for( pj = start_i; pj < end_i; ++pj ) 
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j      = nbr_pj->nbr;
+                type_j = system->atoms[j].type;
+                r_ij   = nbr_pj->d;
+                self_coef = (i == j) ? 0.5 : 1.0;
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); 
+                /* Cubic Spline Interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r; /* interval 0 is never used; the smallest distances fall back to interval 1 */
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+                //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
+                if( update_energies ) {
+                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
+                        t->vdW[r].a;
+                    e_vdW *= self_coef;
+                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                        t->ele[r].a;
+                    e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
+                    data->E_vdW += e_vdW;
+                    data->E_Ele += e_ele;
+                }    
+                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
+                    t->CEvd[r].a;
+                CEvd *= self_coef;
+                //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
+                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
+                    t->CEclmb[r].a;
+                CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
+                }
+                else { // NPT, iNPT or sNPT
+                    /* for pressure coupling, terms not related to bond order 
+                       derivatives are added directly into pressure vector/tensor */
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                    rvec_Add( system->atoms[j].f, temp );
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_Add( data->ext_press, ext_press );
+                }
+                fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+                        workspace->orig_id[i], workspace->orig_id[j], 
+                        r_ij, e_vdW, data->E_vdW );
+                fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                        workspace->orig_id[i], workspace->orig_id[j],
+                        r_ij, system->atoms[i].q, system->atoms[j].q, 
+                        e_ele, data->E_Ele );
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+            }
+    }
+#if defined(OLD)
+    /* Linear extrapolation */
+    /*p     = (r_ij * t->inv_dx;
+      r     = (int) p;
+      prev  = &( t->y[r] );
+      next  = &( t->y[r+1] );
+      tmp    = p - r;
+      e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
+      CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
+      e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
+      e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
+      CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
+      CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
diff --git a/PuReMD-GPU/src/two_body_interactions.cu b/PuReMD-GPU/src/two_body_interactions.cu
deleted file mode 100644
index f53b0cfb0fb1c23626032e7a257fce5b1447953e..0000000000000000000000000000000000000000
--- a/PuReMD-GPU/src/two_body_interactions.cu
+++ /dev/null
@@ -1,1630 +0,0 @@
-  PuReMD-GPU - Reax Force Field Simulator
-  Copyright (2014) Purdue University
-  Sudhir Kylasa, skylasa@purdue.edu
-  Hasan Metin Aktulga, haktulga@cs.purdue.edu
-  Ananth Y Grama, ayg@cs.purdue.edu
-  This program is free software; you can redistribute it and/or
-  modify it under the terms of the GNU General Public License as
-  published by the Free Software Foundation; either version 2 of 
-  the License, or (at your option) any later version.
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  See the GNU General Public License for more details:
-  <http://www.gnu.org/licenses/>.
-  ----------------------------------------------------------------------*/
-#include "two_body_interactions.h"
-#include "bond_orders.h"
-#include "list.h"
-#include "lookup.h"
-#include "vector.h"
-#include "index_utils.h"
-#include "cuda_helpers.h"
-void Bond_Energy( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    real ebond, pow_BOs_be2, exp_be12, CEbo;
-    real gp3, gp4, gp7, gp10, gp37;
-    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
-    real decobdbo, decobdboua, decobdboub;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    bond_order_data *bo_ij;
-    list *bonds;
-    bonds = (*lists) + BONDS;
-    gp3 = system->reaxprm.gp.l[3];
-    gp4 = system->reaxprm.gp.l[4];
-    gp7 = system->reaxprm.gp.l[7];
-    gp10 = system->reaxprm.gp.l[10];
-    gp37 = (int) system->reaxprm.gp.l[37];
-    for( i=0; i < system->N; ++i ) {
-        start_i = Start_Index(i, bonds);
-        end_i = End_Index(i, bonds);
-        //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-        for( pj = start_i; pj < end_i; ++pj )
-            if( i < bonds->select.bond_list[pj].nbr ) {
-                /* set the pointers */
-                j = bonds->select.bond_list[pj].nbr;
-                type_i = system->atoms[i].type;
-                type_j = system->atoms[j].type;
-                sbp_i = &( system->reaxprm.sbp[type_i] );
-                sbp_j = &( system->reaxprm.sbp[type_j] );
-                twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] );
-                bo_ij = &( bonds->select.bond_list[pj].bo_data );
-                /* calculate the constants */
-                pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
-                exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-                CEbo = -twbp->De_s * exp_be12 * 
-                    ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
-                /* calculate the Bond Energy */
-                ebond = 
-                    -twbp->De_s * bo_ij->BO_s * exp_be12 
-                    -twbp->De_p * bo_ij->BO_pi 
-                    -twbp->De_pp * bo_ij->BO_pi2;
-                data->E_BE += ebond;
-                /* calculate derivatives of Bond Orders */
-                bo_ij->Cdbo += CEbo;
-                bo_ij->Cdbopi -= (CEbo + twbp->De_p);
-                bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
-                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
-                        workspace->orig_id[i], workspace->orig_id[j], 
-                        // i+1, j+1, 
-                        bo_ij->BO, ebond/*, data->E_BE*/ );
-                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
-                   workspace->orig_id[i], workspace->orig_id[j], 
-                   CEbo, -twbp->De_p, -twbp->De_pp );*/
-                Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-                Add_dBOpinpi2( system, lists, i, pj, 
-                        -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-                        workspace->f_be, workspace->f_be );
-                /* Stabilisation terminal triple bond */
-                if( bo_ij->BO >= 1.00 ) {
-                    if( gp37 == 2 ||
-                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
-                        // ba = SQR(bo_ij->BO - 2.50);
-                        exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
-                        //oboa=abo(j1)-boa;
-                        //obob=abo(j2)-boa;
-                        exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
-                        exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
-                        //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-                        exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
-                        hulpov = 1.0 / (1.0 + 25.0 * exphuov);
-                        estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-                        //estrain(j1) = estrain(j1) + 0.50*estriph;
-                        //estrain(j2) = estrain(j2) + 0.50*estriph;
-                        data->E_BE += estriph;
-                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
-                            ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-                        decobdboua = -gp10 * exphu * hulpov * 
-                            (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-                        decobdboub = -gp10 * exphu * hulpov * 
-                            (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-                        bo_ij->Cdbo += decobdbo;
-                        workspace->CdDelta[i] += decobdboua;
-                        workspace->CdDelta[j] += decobdboub;
-                        //loop_j ++;
-                        //fprintf (stderr, "incrementing loopj %d \n", loop_j);
-                        fprintf( out_control->ebond, 
-                                "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                                workspace->orig_id[i], workspace->orig_id[j],
-                                //i+1, j+1, 
-                                estriph, decobdbo, decobdboua, decobdboub );
-                        Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-                        Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-                        Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
-                    }
-                }
-            }
-    }
-GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, 
-        single_body_parameters *sbp, two_body_parameters *tbp, 
-        simulation_data *data,
-        static_storage p_workspace, list p_bonds, 
-        int N, int num_atom_types, real *E_BE)
-    int i, j, pj;
-    int start_i, end_i;
-    int type_i, type_j;
-    real ebond, pow_BOs_be2, exp_be12, CEbo;
-    real gp3, gp4, gp7, gp10, gp37;
-    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
-    real decobdbo, decobdboua, decobdboub;
-    single_body_parameters *sbp_i, *sbp_j;
-    two_body_parameters *twbp;
-    bond_order_data *bo_ij;
-    list *bonds;
-    static_storage *workspace;
-    i = blockIdx.x * blockDim.x + threadIdx.x;
-    if ( i >= N ) return;
-    bonds = &p_bonds;
-    workspace = &p_workspace;
-    gp3 = g_params.l[3];
-    gp4 = g_params.l[4];
-    gp7 = g_params.l[7];
-    gp10 = g_params.l[10];
-    gp37 = (int) g_params.l[37];
-    //for( i=0; i < system->N; ++i )
-    start_i = Start_Index(i, bonds);
-    end_i = End_Index(i, bonds);
-    //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-    for( pj = start_i; pj < end_i; ++pj )
-    {
-        //TODO
-        //if( i < bonds->select.bond_list[pj].nbr ) 
-        if( i < bonds->select.bond_list[pj].nbr ) 
-        {
-            //TODO
-            /* set the pointers */
-            j = bonds->select.bond_list[pj].nbr;
-            type_i = atoms[i].type;
-            type_j = atoms[j].type;
-            sbp_i = &( sbp[type_i] );
-            sbp_j = &( sbp[type_j] );
-            twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
-            bo_ij = &( bonds->select.bond_list[pj].bo_data );
-            /* calculate the constants */
-            pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
-            exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-            CEbo = -twbp->De_s * exp_be12 * 
-                ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
-            /* calculate the Bond Energy */
-            ebond = 
-                -twbp->De_s * bo_ij->BO_s * exp_be12 
-                -twbp->De_p * bo_ij->BO_pi 
-                -twbp->De_pp * bo_ij->BO_pi2;
-            //atomicAdd (&data->E_BE, ebond);
-            //TODO
-            //E_BE [ i ] += ebond/2.0;
-            E_BE [ i ] += ebond;
-            //data->E_BE += ebond;
-            /* calculate derivatives of Bond Orders */
-            bo_ij->Cdbo += CEbo;
-            bo_ij->Cdbopi -= (CEbo + twbp->De_p);
-            bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
-            //TODO
-            //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
-            //     workspace->orig_id[i], workspace->orig_id[j], 
-            // i+1, j+1, 
-            //     bo_ij->BO, ebond/*, data->E_BE*/ );
-            /*
-               fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
-               workspace->orig_id[i], workspace->orig_id[j], 
-               CEbo, -twbp->De_p, -twbp->De_pp );*/
-            //TODO
-            /*
-               Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-               Add_dBOpinpi2( system, lists, i, pj, 
-               -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-               workspace->f_be, workspace->f_be );
-             */
-            //TODO
-            /* Stabilisation terminal triple bond */
-            if( bo_ij->BO >= 1.00 ) {
-                if( gp37 == 2 ||
-                        (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-                        (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
-                    // ba = SQR(bo_ij->BO - 2.50);
-                    exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
-                    //oboa=abo(j1)-boa;
-                    //obob=abo(j2)-boa;
-                    exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
-                    exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
-                    //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-                    exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
-                    hulpov = 1.0 / (1.0 + 25.0 * exphuov);
-                    estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-                    //estrain(j1) = estrain(j1) + 0.50*estriph;
-                    //estrain(j2) = estrain(j2) + 0.50*estriph;
-                    //PERFORMANCE IMPACT
-                    //atomicAdd (&data->E_BE, estriph);
-                    E_BE [ i] += estriph;
-                    //data->E_BE += estriph;
-                    decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
-                        ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-                    decobdboua = -gp10 * exphu * hulpov * 
-                        (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-                    decobdboub = -gp10 * exphu * hulpov * 
-                        (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-                    bo_ij->Cdbo += decobdbo;
-                    //PERFORMANCE IMAPCT
-                    workspace->CdDelta[i] += decobdboua;
-                    //atomicAdd (&workspace->CdDelta[j], decobdboub);
-                    //CdDelta [ i * N + i ] += decobdboua;
-                    //CdDelta [ i * N + j ] += decobdboua;
-                    //workspace->CdDelta [i] += decobdboua;
-                    //workspace->CdDelta [j] += decobdboub;
-                    /*
-                       fprintf( out_control->ebond, 
-                       "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                       workspace->orig_id[i], workspace->orig_id[j],
-                    //i+1, j+1, 
-                    estriph, decobdbo, decobdboua, decobdboub );
-                     */
-                    /*
-                       Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-                       Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-                       Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
-                     */
-                }
-            }
-        }
-    } //TODO commented out the if statement for processing i < j. 
-    // we process all teh bonds and add only half the energy
-void vdW_Coulomb_Energy( reax_system *system, control_params *control, 
-        simulation_data *data, static_storage *workspace, 
-        list **lists, output_controls *out_control )
-    int  i, j, pj;
-    int  start_i, end_i;
-    real self_coef;
-    real p_vdW1, p_vdW1i;
-    real powr_vdW1, powgi_vdW1;
-    real tmp, r_ij, fn13, exp1, exp2;
-    real Tap, dTap, dfn13, CEvd, CEclmb;
-    real dr3gamij_1, dr3gamij_3;
-    real e_ele, e_vdW, e_core, de_core;
-    rvec temp, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    list *far_nbrs;
-    p_vdW1 = system->reaxprm.gp.l[28];
-    p_vdW1i = 1.0 / p_vdW1;
-    far_nbrs = (*lists) + FAR_NBRS; 
-    e_ele = 0;
-    e_vdW = 0;
-    e_core = 0;
-    de_core = 0;
-    for( i = 0; i < system->N; ++i ) {
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        // fprintf( stderr, "i: %d, start: %d, end: %d\n",
-        //     i, start_i, end_i );
-        for( pj = start_i; pj < end_i; ++pj )
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j = nbr_pj->nbr;
-                r_ij = nbr_pj->d;
-                twbp = &(system->reaxprm.tbp[ index_tbp (system->atoms[i].type, system->atoms[j].type, &system->reaxprm) ]);
-                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
-                /* Calculate Taper and its derivative */
-                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
-                Tap = control->Tap7 * r_ij + control->Tap6;
-                Tap = Tap * r_ij + control->Tap5;
-                Tap = Tap * r_ij + control->Tap4;
-                Tap = Tap * r_ij + control->Tap3;
-                Tap = Tap * r_ij + control->Tap2;
-                Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;
-                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-                dTap = dTap * r_ij + 5*control->Tap5;
-                dTap = dTap * r_ij + 4*control->Tap4;
-                dTap = dTap * r_ij + 3*control->Tap3;
-                dTap = dTap * r_ij + 2*control->Tap2;
-                dTap += control->Tap1/r_ij;
-                /*vdWaals Calculations*/
-                if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
-                    // shielding
-                    powr_vdW1 = POW(r_ij, p_vdW1);
-                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-                    data->E_vdW += e_vdW = 
-                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
-                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-                        POW(r_ij, p_vdW1 - 2.0);
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) * dfn13 );
-                }
-                else{ // no shielding
-                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-                    data->E_vdW += e_vdW = 
-                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) );
-                }
-                if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
-                    // innner wall
-                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-                    e_vdW += self_coef * Tap * e_core;
-                    data->E_vdW += self_coef * Tap * e_core;
-                    de_core = -(twbp->acore/twbp->rcore) * e_core;
-                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
-                }
-                /*Coulomb Calculations*/
-                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-                tmp = Tap / dr3gamij_3;
-                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-                data->E_Ele += e_ele = 
-                    self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
-                CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
-                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
-                  ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                    rvec_ScaledAdd( system->atoms[i].f, 
-                            -(CEvd+CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[j].f, 
-                            +(CEvd+CEclmb), nbr_pj->dvec );
-                }
-                else { // NPT, iNPT or sNPT
-                    /* for pressure coupling, terms not related to bond order 
-                       derivatives are added directly into pressure vector/tensor */
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
-                    rvec_Add( system->atoms[j].f, temp );
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    rvec_Add( data->ext_press, ext_press );
-                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", 
-                      i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
-                      fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
-                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",        
-                      data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
-                    /* This part is intended for a fully-flexible box */          
-                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, 
-                       system->atoms[i].x );
-                       rtensor_Scale( total_rtensor, 
-                       F_C * -(CEvd + CEclmb), temp_rtensor );
-                       rvec_OuterProduct( temp_rtensor, 
-                       nbr_pj->dvec, system->atoms[j].x );
-                       rtensor_ScaledAdd( total_rtensor, 
-                       F_C * +(CEvd + CEclmb), temp_rtensor );
-                       if( nbr_pj->imaginary )
-                    // This is an external force due to an imaginary nbr
-                    rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-                    else
-                    // This interaction is completely internal
-                    rtensor_Add( data->flex_bar.P, total_rtensor ); */
-                }
-                rvec_MakeZero( temp );
-                rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
-                fprintf( out_control->evdw,
-                        "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                        //i+1, j+1,
-                        MIN( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
-                fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-                        MIN( workspace->orig_id[i], workspace->orig_id[j] ),
-                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-                        r_ij, system->atoms[i].q, system->atoms[j].q, 
-                        e_ele/*, data->E_Ele*/ );
-                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
-            }
-    }
-    // fclose( fout );
-    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", 
-    // data->ext_press[0], data->ext_press[1], data->ext_press[2] );
-   GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
-   two_body_parameters *tbp,
-   global_parameters g_p,
-   control_params *control, 
-   simulation_data *data,  
-   list p_far_nbrs, 
-   real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-   int num_atom_types, int N )
-   {
-   int  i, j, pj;
-   int  start_i, end_i;
-   real self_coef;
-   real p_vdW1, p_vdW1i;
-   real powr_vdW1, powgi_vdW1;
-   real tmp, r_ij, fn13, exp1, exp2;
-   real Tap, dTap, dfn13, CEvd, CEclmb;
-   real dr3gamij_1, dr3gamij_3;
-   real e_ele, e_vdW, e_core, de_core;
-   rvec temp, ext_press;
-// rtensor temp_rtensor, total_rtensor;
-two_body_parameters *twbp;
-far_neighbor_data *nbr_pj;
-list *far_nbrs = &p_far_nbrs;
-i = blockIdx.x * blockDim.x + threadIdx.x;
-if ( i >= N ) return;
-p_vdW1 = g_p.l[28];
-p_vdW1i = 1.0 / p_vdW1;
-e_ele = 0;
-e_vdW = 0;
-e_core = 0;
-de_core = 0;
-//for( i = 0; i < system->N; ++i ) {
-start_i = Start_Index(i, far_nbrs);
-end_i   = End_Index(i, far_nbrs);
-// fprintf( stderr, "i: %d, start: %d, end: %d\n",
-//     i, start_i, end_i );
-for( pj = start_i; pj < end_i; ++pj )
-if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-j = nbr_pj->nbr;
-r_ij = nbr_pj->d;
-twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]);
-self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
-//if (i <= j) continue;
-// Calculate Taper and its derivative 
-// Tap = nbr_pj->Tap;   -- precomputed during compte_H
-Tap = control->Tap7 * r_ij + control->Tap6;
-Tap = Tap * r_ij + control->Tap5;
-Tap = Tap * r_ij + control->Tap4;
-Tap = Tap * r_ij + control->Tap3;
-Tap = Tap * r_ij + control->Tap2;
-Tap = Tap * r_ij + control->Tap1;
-Tap = Tap * r_ij + control->Tap0;
-dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-dTap = dTap * r_ij + 5*control->Tap5;
-dTap = dTap * r_ij + 4*control->Tap4;
-dTap = dTap * r_ij + 3*control->Tap3;
-dTap = dTap * r_ij + 2*control->Tap2;
-dTap += control->Tap1/r_ij;
-//vdWaals Calculations
-if(g_p.vdw_type==1 || g_p.vdw_type==3) {
-    // shielding
-    powr_vdW1 = POW(r_ij, p_vdW1);
-    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
-    E_vdW [i] += e_vdW / 2.0;
-    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-        POW(r_ij, p_vdW1 - 2.0);
-    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-            (exp1 - exp2) * dfn13 );
-else{ // no shielding
-    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
-    E_vdW [i] += e_vdW / 2.0;
-    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-            (exp1 - exp2) );
-if(g_p.vdw_type==2 || g_p.vdw_type==3) {
-    // innner wall
-    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-    e_vdW = self_coef * Tap * e_core;
-    //TODO check this
-    E_vdW [i] += e_vdW / 2.0;
-    //TODO check this
-    de_core = -(twbp->acore/twbp->rcore) * e_core;
-    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
-//Coulomb Calculations
-dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-tmp = Tap / dr3gamij_3;
-//tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-e_ele = 
-self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
-E_Ele [i] += e_ele / 2.0;
-CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
-( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-//CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
-// ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
-if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-    if (i >= j)
-        rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
-    else
-        rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
-else { // NPT, iNPT or sNPT
-    // for pressure coupling, terms not related to bond order 
-    //  derivatives are added directly into pressure vector/tensor 
-    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-    if ( i >= j)
-        rvec_ScaledAdd( atoms[i].f, -1., temp );
-    else
-        rvec_Add( atoms[i].f, temp );
-    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-    //rvec_Add( data->ext_press, ext_press );
-    rvec_Copy (aux_ext_press[i], ext_press);
-    //TODO CHECK THIS calculation here, it should be divided by two somehow.
-GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
-        two_body_parameters *tbp,
-        global_parameters g_p,
-        control_params *control, 
-        simulation_data *data,  
-        list p_far_nbrs, 
-        real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-        int num_atom_types, int N )
-    extern __shared__ real _vdw[];
-    extern __shared__ real _ele[];
-    extern __shared__ rvec _force [];
-    real *sh_vdw;
-    real *sh_ele;
-    rvec *sh_force;
-    int  i, j, pj;
-    int  start_i, end_i;
-    real self_coef;
-    real p_vdW1, p_vdW1i;
-    real powr_vdW1, powgi_vdW1;
-    real tmp, r_ij, fn13, exp1, exp2;
-    real Tap, dTap, dfn13, CEvd, CEclmb;
-    real dr3gamij_1, dr3gamij_3;
-    real e_ele, e_vdW, e_core, de_core;
-    rvec temp, ext_press;
-    // rtensor temp_rtensor, total_rtensor;
-    two_body_parameters *twbp;
-    far_neighbor_data *nbr_pj;
-    list *far_nbrs = &p_far_nbrs;
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warpid = thread_id / VDW_THREADS_PER_ATOM;
-    int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);
-    i = warpid;
-    sh_vdw = _vdw;
-    sh_ele = _vdw + blockDim.x;
-    sh_force = (rvec *)( _vdw + 2*blockDim.x);
-    sh_vdw[threadIdx.x] = 0.0; 
-    sh_ele[threadIdx.x] = 0.0; 
-    rvec_MakeZero ( sh_force [threadIdx.x] );
-    if (i < N)
-    {
-        p_vdW1 = g_p.l[28];
-        p_vdW1i = 1.0 / p_vdW1;
-        e_ele = 0;
-        e_vdW = 0;
-        e_core = 0;
-        de_core = 0;
-        //for( i = 0; i < system->N; ++i ) {
-        start_i = Start_Index(i, far_nbrs);
-        end_i   = End_Index(i, far_nbrs);
-        // fprintf( stderr, "i: %d, start: %d, end: %d\n",
-        //     i, start_i, end_i );
-        pj = start_i + laneid;
-        //for( pj = start_i; pj < end_i; ++pj )
-        while (pj < end_i)
-        {
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j = nbr_pj->nbr;
-                r_ij = nbr_pj->d;
-                twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]);
-                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
-                //CHANGE ORIGINAL
-                //if (i <= j) continue;
-                //CHANGE ORIGINAL
-                // Calculate Taper and its derivative 
-                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
-                Tap = control->Tap7 * r_ij + control->Tap6;
-                Tap = Tap * r_ij + control->Tap5;
-                Tap = Tap * r_ij + control->Tap4;
-                Tap = Tap * r_ij + control->Tap3;
-                Tap = Tap * r_ij + control->Tap2;
-                Tap = Tap * r_ij + control->Tap1;
-                Tap = Tap * r_ij + control->Tap0;
-                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-                dTap = dTap * r_ij + 5*control->Tap5;
-                dTap = dTap * r_ij + 4*control->Tap4;
-                dTap = dTap * r_ij + 3*control->Tap3;
-                dTap = dTap * r_ij + 2*control->Tap2;
-                dTap += control->Tap1/r_ij;
-                //vdWaals Calculations
-                if(g_p.vdw_type==1 || g_p.vdw_type==3) {
-                    // shielding
-                    powr_vdW1 = POW(r_ij, p_vdW1);
-                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
-                    //E_vdW [i] += e_vdW / 2.0;
-                    sh_vdw [threadIdx.x] += e_vdW/2.0;
-                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-                        POW(r_ij, p_vdW1 - 2.0);
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) * dfn13 );
-                }
-                else{ // no shielding
-                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
-                    //E_vdW [i] += e_vdW / 2.0;
-                    sh_vdw [threadIdx.x] += e_vdW/2.0;
-                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-                            (exp1 - exp2) );
-                }
-                if(g_p.vdw_type==2 || g_p.vdw_type==3) {
-                    // innner wall
-                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-                    e_vdW = self_coef * Tap * e_core;
-                    //TODO check this
-                    //E_vdW [i] += e_vdW / 2.0;
-                    sh_vdw [threadIdx.x] += e_vdW / 2.0;
-                    //TODO check this
-                    de_core = -(twbp->acore/twbp->rcore) * e_core;
-                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
-                }
-                //Coulomb Calculations
-                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-                tmp = Tap / dr3gamij_3;
-                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-                e_ele = 
-                    self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
-                //E_Ele [i] += e_ele / 2.0;
-                sh_ele [threadIdx.x] += e_ele / 2.0;
-                CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
-                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-                //CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
-                // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                    if (i >= j){
-                        //rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
-                        rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec );
-                    }
-                    else
-                    {
-                        //rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
-                        rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec );
-                    }
-                }
-                else { // NPT, iNPT or sNPT
-                    // for pressure coupling, terms not related to bond order 
-                    //  derivatives are added directly into pressure vector/tensor 
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-                    if ( i >= j)
-                    {
-                        //rvec_ScaledAdd( atoms[i].f, -1., temp );
-                        rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp );
-                    }
-                    else
-                    {
-                        //rvec_Add( atoms[i].f, temp );
-                        rvec_Add( sh_force[threadIdx.x], temp );
-                    }
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    //rvec_Add( data->ext_press, ext_press );
-                    rvec_Copy (aux_ext_press[i], ext_press);
-                    //TODO CHECK THIS calculation here, it should be divided by two somehow.
-                }
-            } // if condition for far neighbors
-            pj += VDW_THREADS_PER_ATOM;
-        } // end of while loop for pj < end_i condition
-    } // if (i < N ) condition
-    //}
-    __syncthreads ();
-    if (laneid < 16) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-    }
-    __syncthreads ();
-    if (laneid < 8) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-    }
-    __syncthreads ();
-    if (laneid < 4) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-    }
-    __syncthreads ();
-    if (laneid < 2) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-    }
-    __syncthreads ();
-    if (laneid < 1) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-    }
-    __syncthreads ();
-    if (laneid == 0) {
-        E_vdW [i] += sh_vdw[threadIdx.x];
-        E_Ele [i] += sh_ele[threadIdx.x];
-        rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
-    }
-void LR_vdW_Coulomb( reax_system *system, control_params *control, 
-        int i, int j, real r_ij, LR_data *lr )
-    real p_vdW1 = system->reaxprm.gp.l[28];
-    real p_vdW1i = 1.0 / p_vdW1;
-    real powr_vdW1, powgi_vdW1;
-    real tmp, fn13, exp1, exp2;
-    real Tap, dTap, dfn13;
-    real dr3gamij_1, dr3gamij_3;
-    real e_core, de_core;
-    two_body_parameters *twbp;
-    twbp = &(system->reaxprm.tbp[ index_tbp (i,j,&system->reaxprm) ]);
-    e_core = 0;
-    de_core = 0;
-    /* calculate taper and its derivative */
-    Tap = control->Tap7 * r_ij + control->Tap6;
-    Tap = Tap * r_ij + control->Tap5;
-    Tap = Tap * r_ij + control->Tap4;
-    Tap = Tap * r_ij + control->Tap3;
-    Tap = Tap * r_ij + control->Tap2;
-    Tap = Tap * r_ij + control->Tap1;
-    Tap = Tap * r_ij + control->Tap0;
-    dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-    dTap = dTap * r_ij + 5*control->Tap5;
-    dTap = dTap * r_ij + 4*control->Tap4;
-    dTap = dTap * r_ij + 3*control->Tap3;
-    dTap = dTap * r_ij + 2*control->Tap2;
-    dTap += control->Tap1/r_ij;
-    /* vdWaals calculations */
-    powr_vdW1 = POW(r_ij, p_vdW1);
-    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
-    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
-Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
-Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), 
-powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
-    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
-    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - 
-        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-    /*vdWaals Calculations*/
-    if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
-    { // shielding
-        powr_vdW1 = POW(r_ij, p_vdW1);
-        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
-        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-            POW(r_ij, p_vdW1 - 2.0);
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-    }
-    else{ // no shielding
-        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
-    }
-    if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
-    { // innner wall
-        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-        lr->e_vdW += Tap * e_core;
-        de_core = -(twbp->acore/twbp->rcore) * e_core;
-        lr->CEvd += dTap * e_core + Tap * de_core;
-    }
-    /* Coulomb calculations */
-    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-    tmp = Tap / dr3gamij_3;
-    lr->H = EV_to_KCALpMOL * tmp;
-    lr->e_ele = C_ele * tmp;
-    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
-Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
-i, system->atoms[i].type, j, system->atoms[j].type, 
-twbp->gamma, Tap, dr3gamij_3, 
-system->atoms[i].q, system->atoms[j].q ); */
-    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
-       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
-       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
-    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
-       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
-void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
-        simulation_data *data, 
-        static_storage *workspace, list **lists, 
-        output_controls *out_control )
-    int i, j, pj, r, steps, update_freq, update_energies;
-    int type_i, type_j, tmin, tmax;
-    int start_i, end_i;
-    real r_ij, self_coef, base, dif;
-    real e_vdW, e_ele;
-    real CEvd, CEclmb;
-    rvec temp, ext_press;
-    far_neighbor_data *nbr_pj;
-    list *far_nbrs = (*lists) + FAR_NBRS;
-    LR_lookup_table *t;
-    steps = data->step - data->prev_steps;
-    update_freq = out_control->energy_update_freq;
-    update_energies = update_freq > 0 && steps % update_freq == 0;
-    for( i = 0; i < system->N; ++i ) {
-        type_i  = system->atoms[i].type;
-        start_i = Start_Index(i,far_nbrs);
-        end_i   = End_Index(i,far_nbrs);
-        for( pj = start_i; pj < end_i; ++pj ) 
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j      = nbr_pj->nbr;
-                type_j = system->atoms[j].type;
-                r_ij   = nbr_pj->d;
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); 
-                /* Cubic Spline Interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
-                dif = r_ij - base;
-                //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
-                if( update_energies ) {
-                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-                        t->vdW[r].a;
-                    e_vdW *= self_coef;
-                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                        t->ele[r].a;
-                    e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
-                    data->E_vdW += e_vdW;
-                    data->E_Ele += e_ele;
-                }    
-                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-                    t->CEvd[r].a;
-                CEvd *= self_coef;
-                //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
-                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-                    t->CEclmb[r].a;
-                CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                    rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
-                }
-                else { // NPT, iNPT or sNPT
-                    /* for pressure coupling, terms not related to bond order 
-                       derivatives are added directly into pressure vector/tensor */
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
-                    rvec_Add( system->atoms[j].f, temp );
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    rvec_Add( data->ext_press, ext_press );
-                }
-                fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-                        workspace->orig_id[i], workspace->orig_id[j], 
-                        r_ij, e_vdW, data->E_vdW );
-                fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-                        workspace->orig_id[i], workspace->orig_id[j],
-                        r_ij, system->atoms[i].q, system->atoms[j].q, 
-                        e_ele, data->E_Ele );
-                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
-            }
-    }
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy(     reax_atom *atoms, 
-        control_params *control,
-        simulation_data *data, 
-        list p_far_nbrs, 
-        real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-        LR_lookup_table *d_LR,
-        int num_atom_types,
-        int energy_update_freq,
-        int N  )
-    extern __shared__ real _vdw[];
-    extern __shared__ real _ele[];
-    extern __shared__ rvec _force [];
-    real *sh_vdw;
-    real *sh_ele;
-    rvec *sh_force;
-    int i, j, pj, r, steps, update_freq, update_energies;
-    int type_i, type_j, tmin, tmax;
-    int start_i, end_i;
-    real r_ij, self_coef, base, dif;
-    real e_vdW, e_ele;
-    real CEvd, CEclmb;
-    rvec temp, ext_press;
-    far_neighbor_data *nbr_pj;
-    LR_lookup_table *t;
-    list *far_nbrs = &p_far_nbrs;
-    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int warpid = thread_id / VDW_THREADS_PER_ATOM;
-    int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);
-    i = warpid;
-    sh_vdw = _vdw;
-    sh_ele = _vdw + blockDim.x;
-    sh_force = (rvec *)( _vdw + 2*blockDim.x);
-    sh_vdw[threadIdx.x] = 0.0; 
-    sh_ele[threadIdx.x] = 0.0; 
-    rvec_MakeZero ( sh_force [threadIdx.x] );
-    if ( i < N ) 
-    {
-        reax_atom local_atom ;
-        local_atom.q =  atoms[i].q;
-        //local_atom.q =  d_far_data.q[i];
-        local_atom.type = atoms[i].type;
-        //local_atom.type = d_far_data.type[i];
-        /*
-           sh_vdw = _vdw;
-           sh_ele = _vdw + warpid;
-           sh_force = (rvec *)( _vdw + 2*warpid);
-           sh_vdw[threadIdx.x] = 0.0; 
-           sh_ele[threadIdx.x] = 0.0; 
-           rvec_MakeZero ( sh_force [threadIdx.x] );
-         */
-        steps = data->step - data->prev_steps;
-        update_freq = energy_update_freq;
-        update_energies = update_freq > 0 && steps % update_freq == 0;
-        //for( i = 0; i < system->N; ++i ) {
-        type_i  = local_atom.type;
-        start_i = Start_Index(i,far_nbrs);
-        end_i   = End_Index(i,far_nbrs);
-        pj = start_i + laneid;
-        //for( pj = start_i; pj < end_i; ++pj ) 
-        while (pj < end_i)
-        {
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
-                //if( d_far_data.d[pj] <= control->r_cut ) 
-            {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j      = nbr_pj->nbr;
-                //j      = d_far_data.nbrs[pj];
-                type_j = atoms[j].type;
-                //type_j = d_far_data.type[j];
-                r_ij   = nbr_pj->d;
-                //r_ij   = d_far_data.d[pj];
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
-                //TODO
-                //CHANGE ORIGINAL
-                //if (i <= j) { pj += blockDim.x; continue; }
-                //CHANGE ORIGINAL
-                /* Cubic Spline Interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
-                dif = r_ij - base;
-                if(( update_energies )) 
-                {
-                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-                        t->vdW[r].a;
-                    e_vdW *= self_coef;
-                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a;
-                    e_ele *= self_coef * local_atom.q * atoms[j].q;
-                    //data->E_vdW += e_vdW;
-                    //TODO
-                    //E_vdW [i] += e_vdW / 2.0;
-                    //E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0);
-                    sh_vdw [threadIdx.x] += e_vdW/2.0;
-                    //E_vdW [i] += e_vdW;
-                    //TODO
-                    //data->E_Ele += e_ele;
-                    //E_Ele [i] += e_ele / 2.0;
-                    //E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0);
-                    sh_ele [threadIdx.x] += e_ele/2.0;
-                    //E_Ele [i] += e_ele;
-                }    
-                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-                    t->CEvd[r].a;
-                CEvd *= self_coef;
-                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-                    t->CEclmb[r].a;
-                CEclmb *= self_coef * local_atom.q * atoms[j].q;
-                //CEclmb *= self_coef * local_atom.q * d_far_data.q[j];
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-                    if ( i >= j)
-                        //rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
-                        rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
-                    //rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] );
-                    else 
-                        //rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec );
-                        rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
-                    //rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] );
-                }
-                else { // NPT, iNPT or sNPT
-                    // for pressure coupling, terms not related to bond order 
-                    //  derivatives are added directly into pressure vector/tensor /
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-                    if (i >= j)
-                        rvec_ScaledAdd( atoms[i].f, -1., temp );
-                    else
-                        rvec_Add( atoms[i].f, temp );
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    //rvec_Add( data->ext_press, ext_press );
-                    rvec_Copy (aux_ext_press [i], ext_press );
-                    //TODO CHECK THIS
-                }
-            }
-            pj += VDW_THREADS_PER_ATOM;
-        }
-    }// if i < n condition
-    __syncthreads ();
-    if (laneid < 16) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-    }
-    __syncthreads ();
-    if (laneid < 8) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-    }
-    __syncthreads ();
-    if (laneid < 4) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-    }
-    __syncthreads ();
-    if (laneid < 2) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-    }
-    __syncthreads ();
-    if (laneid < 1) {
-        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-    }
-    __syncthreads ();
-    if (laneid == 0) {
-        E_vdW [i] += sh_vdw[threadIdx.x];
-        E_Ele [i] += sh_ele[threadIdx.x];
-        rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
-    }
-    }
-    GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1(     reax_atom *atoms, 
-            control_params *control,
-            simulation_data *data, 
-            list p_far_nbrs, 
-            real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-            LR_lookup_table *d_LR,
-            int num_atom_types,
-            int energy_update_freq,
-            int N )
-    {
-        extern __shared__ real _vdw[];
-        extern __shared__ real _ele[];
-        real *sh_vdw;
-        real *sh_ele;
-        int i, j, pj, r, steps, update_freq, update_energies;
-        int type_i, type_j, tmin, tmax;
-        int start_i, end_i;
-        real r_ij, self_coef, base, dif;
-        real e_vdW, e_ele;
-        real CEvd, CEclmb;
-        rvec temp, ext_press;
-        far_neighbor_data *nbr_pj;
-        LR_lookup_table *t;
-        list *far_nbrs = &p_far_nbrs;
-        i = blockIdx.x;
-        reax_atom local_atom;
-        local_atom.q =  atoms[i].q;
-        local_atom.type = atoms[i].type;
-        sh_vdw = _vdw;
-        sh_ele = _vdw + blockDim.x;
-        sh_vdw[threadIdx.x] = 0.0; 
-        sh_ele[threadIdx.x] = 0.0; 
-        steps = data->step - data->prev_steps;
-        update_freq = energy_update_freq;
-        update_energies = update_freq > 0 && steps % update_freq == 0;
-        type_i  = local_atom.type;
-        start_i = Start_Index(i,far_nbrs);
-        end_i   = End_Index(i,far_nbrs);
-        pj = start_i + threadIdx.x;
-        while (pj < end_i)
-        {
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
-            {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j      = nbr_pj->nbr;
-                type_j = atoms[j].type;
-                r_ij   = nbr_pj->d;
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
-                /* Cubic Spline Interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
-                dif = r_ij - base;
-                if(( update_energies )) 
-                {
-                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-                        t->vdW[r].a;
-                    e_vdW *= self_coef;
-                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-                        t->ele[r].a;
-                    e_ele *= self_coef * local_atom.q * atoms[j].q;
-                    sh_vdw [threadIdx.x] += e_vdW/2.0;
-                    sh_ele [threadIdx.x] += e_ele/2.0;
-                }    
-            }
-            pj += blockDim.x;
-        }
-        // now do a reduce inside the warp for E_vdW, E_Ele and force.
-        if (threadIdx.x < 16) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-        }
-        if (threadIdx.x < 8) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-        }
-        if (threadIdx.x < 4) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-        }
-        if (threadIdx.x < 2) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-        }
-        if (threadIdx.x < 1) {
-            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-        }
-        if (threadIdx.x == 0) {
-            E_vdW [i] += sh_vdw[0];
-            E_Ele [i] += sh_ele[0];
-        }
-    }
-    GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2(     reax_atom *atoms, 
-            control_params *control,
-            simulation_data *data, 
-            list p_far_nbrs, 
-            real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-            LR_lookup_table *d_LR,
-            int num_atom_types,
-            int energy_update_freq,
-            int N )
-    {
-        extern __shared__ rvec _force [];
-        rvec *sh_force;
-        int i, j, pj, r, steps, update_freq, update_energies;
-        int type_i, type_j, tmin, tmax;
-        int start_i, end_i;
-        real r_ij, self_coef, base, dif;
-        real e_vdW, e_ele;
-        real CEvd, CEclmb;
-        rvec temp, ext_press;
-        far_neighbor_data *nbr_pj;
-        LR_lookup_table *t;
-        list *far_nbrs = &p_far_nbrs;
-        i = blockIdx.x;
-        reax_atom local_atom;
-        local_atom.q =  atoms[i].q;
-        local_atom.type = atoms[i].type;
-        sh_force = _force;
-        rvec_MakeZero ( sh_force [threadIdx.x] );
-        steps = data->step - data->prev_steps;
-        update_freq = energy_update_freq;
-        update_energies = update_freq > 0 && steps % update_freq == 0;
-        //for( i = 0; i < system->N; ++i ) {
-        type_i  = local_atom.type;
-        start_i = Start_Index(i,far_nbrs);
-        end_i   = End_Index(i,far_nbrs);
-        pj = start_i + threadIdx.x;
-        while (pj < end_i)
-        {
-            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
-            {
-                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-                j      = nbr_pj->nbr;
-                type_j = atoms[j].type;
-                r_ij   = nbr_pj->d;
-                self_coef = (i == j) ? 0.5 : 1.0;
-                tmin  = MIN( type_i, type_j );
-                tmax  = MAX( type_i, type_j );
-                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
-                /* Cubic Spline Interpolation */
-                r = (int)(r_ij * t->inv_dx);
-                if( r == 0 )  ++r;
-                base = (real)(r+1) * t->dx;
-                dif = r_ij - base;
-                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-                    t->CEvd[r].a;
-                CEvd *= self_coef;
-                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-                    t->CEclmb[r].a;
-                CEclmb *= self_coef * local_atom.q * atoms[j].q;
-                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-                    if ( i >= j)
-                        rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
-                    else 
-                        rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
-                }
-                else { // NPT, iNPT or sNPT
-                    // for pressure coupling, terms not related to bond order 
-                    //  derivatives are added directly into pressure vector/tensor /
-                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-                    if (i >= j)
-                        rvec_ScaledAdd( atoms[i].f, -1., temp );
-                    else
-                        rvec_Add( atoms[i].f, temp );
-                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-                    rvec_Copy (aux_ext_press [i], ext_press );
-                }
-            }
-            pj += blockDim.x;
-        }
-        // now do a reduce inside the warp for E_vdW, E_Ele and force.
-        if (threadIdx.x < 16) {
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-        }
-        if (threadIdx.x < 8) {
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-        }
-        if (threadIdx.x < 4) {
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-        }
-        if (threadIdx.x < 2) {
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-        }
-        if (threadIdx.x < 1) {
-            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-        }
-        if (threadIdx.x == 0) {
-            rvec_Add (atoms[i].f, sh_force [ 0 ]);
-        }
-    }
-#if defined(OLD)
-    /* Linear extrapolation */
-    /*p     = (r_ij * t->inv_dx;
-      r     = (int) p;
-      prev  = &( t->y[r] );
-      next  = &( t->y[r+1] );
-      tmp    = p - r;
-      e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
-      CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
-      e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
-      e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
-      CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
-      CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
diff --git a/PuReMD-GPU/src/two_body_interactions.h b/PuReMD-GPU/src/two_body_interactions.h
index 41483222a1d1e66eb21805f7e1659b5eecd3f40b..f689290e9e680ca62352b6f627737b8a8b006472 100644
--- a/PuReMD-GPU/src/two_body_interactions.h
+++ b/PuReMD-GPU/src/two_body_interactions.h
@@ -21,156 +21,19 @@
-#include <mytypes.h>
-#include "index_utils.h"
+#include "mytypes.h"
-void Bond_Energy( reax_system*, control_params*, simulation_data*,
-                  static_storage*, list**, output_controls* );
-void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
-                         static_storage*, list**, output_controls* );
-void LR_vdW_Coulomb( reax_system*, control_params*, int, int, real, LR_data* );
-void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
-                                   static_storage*, list**, output_controls* );
-//CUDA functions
-GLOBAL void Cuda_Bond_Energy ( reax_atom *, global_parameters , single_body_parameters *, two_body_parameters *,
-                               simulation_data *, static_storage , list , int , int, real *);
-GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *,
-                                     two_body_parameters *,
-                                     global_parameters ,
-                                     control_params *,
-                                     simulation_data *,
-                                     list ,
-                                     real *, real *, rvec *,
-                                     int , int );
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy ( reax_atom *, control_params *, simulation_data *,
-        list , real *, real *, rvec *,
-        LR_lookup_table *, int , int , int ) ;
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1 ( reax_atom *, control_params *, simulation_data *,
-        list , real *, real *, rvec *,
-        LR_lookup_table *, int , int , int ) ;
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2 ( reax_atom *, control_params *, simulation_data *,
-        list , real *, real *, rvec *,
-        LR_lookup_table *, int , int , int ) ;
-HOST_DEVICE void LR_vdW_Coulomb( global_parameters , two_body_parameters *,
-                                 control_params *, int , int , real , LR_data * , int);
-HOST_DEVICE inline void LR_vdW_Coulomb(    global_parameters g_params, two_body_parameters *tbp,
-        control_params *control,
-        int i, int j, real r_ij, LR_data *lr, int num_atom_types )
-    real p_vdW1 = g_params.l[28];
-    real p_vdW1i = 1.0 / p_vdW1;
-    real powr_vdW1, powgi_vdW1;
-    real tmp, fn13, exp1, exp2;
-    real Tap, dTap, dfn13;
-    real dr3gamij_1, dr3gamij_3;
-    real e_core, de_core;
-    two_body_parameters *twbp;
-    twbp = &(tbp[ index_tbp (i, j, num_atom_types) ]);
-    e_core = 0;
-    de_core = 0;
-    /* calculate taper and its derivative */
-    Tap = control->Tap7 * r_ij + control->Tap6;
-    Tap = Tap * r_ij + control->Tap5;
-    Tap = Tap * r_ij + control->Tap4;
-    Tap = Tap * r_ij + control->Tap3;
-    Tap = Tap * r_ij + control->Tap2;
-    Tap = Tap * r_ij + control->Tap1;
-    Tap = Tap * r_ij + control->Tap0;
-    dTap = 7 * control->Tap7 * r_ij + 6 * control->Tap6;
-    dTap = dTap * r_ij + 5 * control->Tap5;
-    dTap = dTap * r_ij + 4 * control->Tap4;
-    dTap = dTap * r_ij + 3 * control->Tap3;
-    dTap = dTap * r_ij + 2 * control->Tap2;
-    dTap += control->Tap1 / r_ij;
-    /* vdWaals calculations */
-    powr_vdW1 = POW(r_ij, p_vdW1);
-    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
-       Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
-       Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2),
-       powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
-    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
-    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) -
-               Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-    /*vdWaals Calculations*/
-    if (g_params.vdw_type == 1 || g_params.vdw_type == 3)
-    {
-        // shielding
-        powr_vdW1 = POW(r_ij, p_vdW1);
-        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) *
-                POW(r_ij, p_vdW1 - 2.0);
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-    }
-    else  // no shielding
-    {
-        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
-        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
-                   Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
-    }
-    if (g_params.vdw_type == 2 || g_params.vdw_type == 3)
-    {
-        // innner wall
-        e_core = twbp->ecore * EXP(twbp->acore * (1.0 - (r_ij / twbp->rcore)));
-        lr->e_vdW += Tap * e_core;
-        de_core = -(twbp->acore / twbp->rcore) * e_core;
-        lr->CEvd += dTap * e_core + Tap * de_core;
-    }
+void Bond_Energy( reax_system*, control_params*, simulation_data*,
+        static_storage*, list**, output_controls* );
-    /* Coulomb calculations */
-    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+void vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
+        static_storage*, list**, output_controls* );
-    tmp = Tap / dr3gamij_3;
-    lr->H = EV_to_KCALpMOL * tmp;
-    lr->e_ele = C_ele * tmp;
-    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
-       Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
-       i, system->atoms[i].type, j, system->atoms[j].type,
-       twbp->gamma, Tap, dr3gamij_3,
-       system->atoms[i].q, system->atoms[j].q ); */
+void LR_vdW_Coulomb( reax_system*, control_params*, int, int, real, LR_data* );
-    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
-       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
-       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+void Tabulated_vdW_Coulomb_Energy( reax_system*, control_params*, simulation_data*,
+        static_storage*, list**, output_controls* );
-    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
-       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
diff --git a/PuReMD-GPU/src/validation.cu b/PuReMD-GPU/src/validation.cu
index f8261555a595a06dcbdfbffe5d62c7a0375c4c97..21cd2145e689621ee0b3827889b106ed7c05af7f 100644
--- a/PuReMD-GPU/src/validation.cu
+++ b/PuReMD-GPU/src/validation.cu
@@ -18,7 +18,6 @@
 #include "validation.h"
 #include "cuda_utils.h"
@@ -27,33 +26,37 @@
 #include "sort.h"
 #include "index_utils.h"
-bool check_zero (real p1, real p2)
+int check_zero (real p1, real p2)
     if (abs (p1 - p2) >= GPU_TOLERANCE)
-        return true;
+        return TRUE;
-        return false;
+        return FALSE;
-bool check_zero (rvec p1, rvec p2)
+int check_zero (rvec p1, rvec p2)
     if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
             ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) ||
             ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE ))
-        return true;
-    else return false;
+        return TRUE;
+    else return FALSE;
-bool check_same (ivec p1, ivec p2)
+int check_same (ivec p1, ivec p2)
     if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) )
-        return true;
+        return TRUE;
-        return false;
+        return FALSE;
-bool validate_box (simulation_box *host, simulation_box *dev)
+int validate_box (simulation_box *host, simulation_box *dev)
     simulation_box test;
@@ -62,14 +65,15 @@ bool validate_box (simulation_box *host, simulation_box *dev)
     if (memcmp (&test, host, SIMULATION_BOX_SIZE)) {
         fprintf (stderr, " Simulation box is not in synch between host and device \n");
-        return false;
+        return FALSE;
     fprintf (stderr, " Simulation box is in **synch** between host and device \n");
-    return true;
+    return TRUE;
-bool validate_atoms (reax_system *system, list **lists)
+int validate_atoms (reax_system *system, list **lists)
     int start, end, index, count, miscount;
@@ -154,9 +158,10 @@ bool validate_atoms (reax_system *system, list **lists)
     //fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount);
     free (test);
-    return true;
+    return TRUE;
 void Print_Matrix( sparse_matrix *A )
     int i, j;
@@ -170,6 +175,7 @@ void Print_Matrix( sparse_matrix *A )
 void Print_Matrix_L( sparse_matrix *A )
     int i, j;
@@ -184,7 +190,7 @@ void Print_Matrix_L( sparse_matrix *A )
-bool validate_sort_matrix (reax_system *system, static_storage *workspace)
+int validate_sort_matrix (reax_system *system, static_storage *workspace)
     sparse_matrix test;
     int index, count;
@@ -221,7 +227,7 @@ bool validate_sort_matrix (reax_system *system, static_storage *workspace)
-bool validate_sparse_matrix( reax_system *system, static_storage *workspace )
+int validate_sparse_matrix( reax_system *system, static_storage *workspace )
     sparse_matrix test;
     int index, count;
@@ -287,10 +293,10 @@ bool validate_sparse_matrix( reax_system *system, static_storage *workspace )
     free (test.start);
     free (test.end);
     free (test.entries);
-    return true;
+    return TRUE;
-bool validate_lu (static_storage *workspace)
+int validate_lu (static_storage *workspace)
     sparse_matrix test;
     int index, count;
@@ -354,7 +360,7 @@ bool validate_lu (static_storage *workspace)
     //fprintf (stderr, "L and U match on device and host \n");
-    return true;
+    return TRUE;
 void print_sparse_matrix (reax_system *system, static_storage *workspace)
@@ -405,7 +411,7 @@ void print_sparse_matrix (reax_system *system, static_storage *workspace)
-bool validate_bonds (reax_system *system, static_storage *workspace, list **lists)
+int validate_bonds (reax_system *system, static_storage *workspace, list **lists)
     int start, end, index, count, miscount;
     int *d_start, *d_end;
@@ -601,10 +607,10 @@ bool validate_bonds (reax_system *system, static_storage *workspace, list **list
     free (d_start);
     free (d_end);
     free (d_bond_data);
-    return true;
+    return TRUE;
-bool validate_sym_dbond_indices (reax_system *system, static_storage *workspace, list **lists)
+int validate_sym_dbond_indices (reax_system *system, static_storage *workspace, list **lists)
     int start, end, index, count, miscount;
     int *d_start, *d_end;
@@ -660,10 +666,11 @@ bool validate_sym_dbond_indices (reax_system *system, static_storage *workspace,
     free (d_start);
     free (d_end);
     free (d_bond_data);
-    return true;
+    return TRUE;
-bool analyze_hbonds (reax_system *system, static_storage *workspace, list **lists)
+int analyze_hbonds (reax_system *system, static_storage *workspace, list **lists)
     int hindex, nbr_hindex;
     int pj, hj, hb_start_j, hb_end_j, j, nbr;
@@ -748,7 +755,7 @@ bool analyze_hbonds (reax_system *system, static_storage *workspace, list **list
-bool validate_hbonds (reax_system *system, static_storage *workspace, list **lists)
+int validate_hbonds (reax_system *system, static_storage *workspace, list **lists)
     int *hbond_index, count;
     int *d_start, *d_end, index, d_index;
@@ -858,10 +865,10 @@ bool validate_hbonds (reax_system *system, static_storage *workspace, list **lis
     free (d_start);
     free (d_end);
     free (data);
-    return true;
+    return TRUE;
-bool validate_neighbors (reax_system *system, list **lists)
+int validate_neighbors (reax_system *system, list **lists)
     list *far_nbrs = *lists + FAR_NBRS;
     list *d_nbrs = dev_lists + FAR_NBRS;
@@ -989,971 +996,975 @@ bool validate_neighbors (reax_system *system, list **lists)
                     start[i], end[i]);
             exit (10);
+    }
+    //fprintf (stderr, "FAR Neighbors match between device and host \n");
+    free (start);
+    free (end);
+    free (data);
+    return TRUE;
+int validate_workspace (reax_system *system, static_storage *workspace, list **lists) 
+    real *total_bond_order;
+    int count, tcount;
+    total_bond_order = (real *) malloc ( system->N * REAL_SIZE );
+    copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++) {
+        //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){
+        if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){
+            fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->total_bond_order[i], total_bond_order[i]);
+            exit (-1);
+            count ++;
+    }
+    free (total_bond_order);
+    //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count);
-        //fprintf (stderr, "FAR Neighbors match between device and host \n");
-        free (start);
-        free (end);
-        free (data);
-        return true;
+    rvec *dDeltap_self;
+    dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE);
+    copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ )
+    {
+        if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i]))
+        {
+            fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, 
+                    workspace->dDeltap_self[i][0],
+                    workspace->dDeltap_self[i][1],
+                    workspace->dDeltap_self[i][2],
+                    dDeltap_self[3*i+0],
+                    dDeltap_self[3*i+1],
+                    dDeltap_self[3*i+2] );
+            exit (-1);
+            count ++;
+    }
+    free (dDeltap_self);
+    //fprintf (stderr, "dDeltap_self mismatch count %d\n", count);
-        bool validate_workspace (reax_system *system, static_storage *workspace, list **lists) 
+    //exit for init_forces
+    real *test;
+    test = (real *) malloc (system->N * REAL_SIZE);
+    copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ )
+    {
+        if (check_zero (workspace->Deltap[i], test[i]))
-            real *total_bond_order;
-            int count, tcount;
+            fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "Deltap mismatch count %d\n", count);
-            total_bond_order = (real *) malloc ( system->N * REAL_SIZE );
-            copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ )
+    {
+        if (check_zero (workspace->Deltap_boc[i], test[i]))
+        {
+            fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count);
-            count = 0;
-            for (int i = 0; i < system->N; i++) {
+    copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->Delta[i], test[i])) {
+            fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "Delta mismatch count %d\n", count);
-                //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){
-                if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){
-                    fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n",
-                            i, workspace->total_bond_order[i], total_bond_order[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            free (total_bond_order);
-            //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->Delta_e[i], test[i])) {
+            fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "Delta_e mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->vlpex[i], test[i])) {
+            fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "vlpex mismatch count %d\n", count);
-            rvec *dDeltap_self;
-            dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE);
-            copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->nlp[i], test[i])) {
+            fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "nlp mismatch count %d\n", count);
-            count = 0;
-            for (int i = 0; i < system->N; i++ )
-            {
-                if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i]))
-                {
-                    fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, 
-                            workspace->dDeltap_self[i][0],
-                            workspace->dDeltap_self[i][1],
-                            workspace->dDeltap_self[i][2],
-                            dDeltap_self[3*i+0],
-                            dDeltap_self[3*i+1],
-                            dDeltap_self[3*i+2] );
-                    exit (-1);
-                    count ++;
-                }
-            }
-            free (dDeltap_self);
-            //fprintf (stderr, "dDeltap_self mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->Delta_lp[i], test[i])) {
+            fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "Delta_lp mismatch count %d\n", count);
-            //exit for init_forces
+    copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->Clp[i], test[i])) {
+            fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "Clp mismatch count %d\n", count);
-            real *test;
-            test = (real *) malloc (system->N * REAL_SIZE);
+    copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->dDelta_lp[i], test[i])) {
+            fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "dDelta_lp mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ )
-            {
-                if (check_zero (workspace->Deltap[i], test[i]))
-                {
-                    fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "Deltap mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->nlp_temp[i], test[i])) {
+            fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "nlp_temp mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ )
-            {
-                if (check_zero (workspace->Deltap_boc[i], test[i]))
-                {
-                    fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->Delta_lp_temp[i], test[i])) {
+            fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->Delta[i], test[i])) {
-                    fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "Delta mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->dDelta_lp_temp[i], test[i])) {
+            fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->Delta_e[i], test[i])) {
-                    fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "Delta_e mismatch count %d\n", count);
+    //exit for Bond order calculations
-            copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->vlpex[i], test[i])) {
-                    fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "vlpex mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->nlp[i], test[i])) {
-                    fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "nlp mismatch count %d\n", count);
+    copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->CdDelta[i], test[i])) {
+            fprintf (stderr, " CdDelta does NOT match (%f %f) for atom  %d \n", workspace->CdDelta[i], test[i], i);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "CdDelta mismatch count %d\n", count);
+    //exit for Bond Energy calculations
-            copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->Delta_lp[i], test[i])) {
-                    fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "Delta_lp mismatch count %d\n", count);
+    /*
+       copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+       count = 0;
+       for (int i = 0; i < system->N; i++ ) {
+       if (check_zero (workspace->droptol[i], test[i])) {
+       fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]);
+       exit (-1);
+       count ++;
+       }
+       }
+    //fprintf (stderr, "droptol mismatch count %d\n", count);
+     */
-            copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->Clp[i], test[i])) {
-                    fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "Clp mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->dDelta_lp[i], test[i])) {
-                    fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "dDelta_lp mismatch count %d\n", count);
+    //exit for  QEa calculations
+    /*
+       real *t_s;
-            copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->nlp_temp[i], test[i])) {
-                    fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "nlp_temp mismatch count %d\n", count);
+       t_s = (real *) malloc (REAL_SIZE * (system->N * 2) );
+       copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__);
-            copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->Delta_lp_temp[i], test[i])) {
-                    fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count);
+       count = 0;
+       for (int i = 0; i < (system->N * 2); i++ ) {
+       if (check_zero (workspace->b_prm[i], t_s[i])) {
+       fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]);
+       exit (-1);
+       count ++;
+       }
+       }
+    //fprintf (stderr, "b_prm mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->dDelta_lp_temp[i], test[i])) {
-                    fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count);
+    t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
+    copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
-            //exit for Bond order calculations
+    count = 0;
+    for (int i = 0; i < 5*system->N; i++ ) {
+    if (check_zero (workspace->s[i], t_s[i])) {
+    //fprintf (stderr, " (%f %f)  @ index %d \n", workspace->s[i], t_s[i], i);
+    count ++;
+    }
+    }
+    fprintf (stderr, "s mismatch count %d\n", count);
-            copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->CdDelta[i], test[i])) {
-                    fprintf (stderr, " CdDelta does NOT match (%f %f) for atom  %d \n", workspace->CdDelta[i], test[i], i);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "CdDelta mismatch count %d\n", count);
-            //exit for Bond Energy calculations
+    t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
+    copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
-            /*
-               copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-               count = 0;
-               for (int i = 0; i < system->N; i++ ) {
-               if (check_zero (workspace->droptol[i], test[i])) {
-               fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]);
-               exit (-1);
-               count ++;
-               }
-               }
-            //fprintf (stderr, "droptol mismatch count %d\n", count);
-             */
+    count = 0;
+    for (int i = 0; i < 5*system->N; i++ ) {
+    if (check_zero (workspace->t[i], t_s[i])) {
+    //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i);
+    count ++;
+    }
+    }
+    fprintf (stderr, "t mismatch count %d\n", count);
-            //exit for  QEa calculations
-            /*
-               real *t_s;
+    t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N);
+    copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-               t_s = (real *) malloc (REAL_SIZE * (system->N * 2) );
-               copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < (RESTART + 1)*system->N; i++ ) {
+    if (check_zero (workspace->v[i], t_s[i])) {
+    //fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i);
+    count ++;
+    }
+    }
+    fprintf (stderr, "v mismatch count %d\n", count);
-               count = 0;
-               for (int i = 0; i < (system->N * 2); i++ ) {
-               if (check_zero (workspace->b_prm[i], t_s[i])) {
-               fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]);
-               exit (-1);
-               count ++;
-               }
-               }
-            //fprintf (stderr, "b_prm mismatch count %d\n", count);
+    t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+    copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-            t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
-            copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < (RESTART + 1); i++ ) {
+    if (check_zero (workspace->y[i], t_s[i])) {
+    //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]);
+    count ++;
+    }
+    }
+    fprintf (stderr, "y mismatch count %d\n", count);
-            count = 0;
-            for (int i = 0; i < 5*system->N; i++ ) {
-            if (check_zero (workspace->s[i], t_s[i])) {
-            //fprintf (stderr, " (%f %f)  @ index %d \n", workspace->s[i], t_s[i], i);
-            count ++;
-            }
-            }
-            fprintf (stderr, "s mismatch count %d\n", count);
+    t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+    copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < (RESTART + 1); i++ ) {
+    if (check_zero (workspace->hc[i], t_s[i])) {
+        //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]);
+        count ++;
+    }
+    }
+    fprintf (stderr, "hc mismatch count %d\n", count);
-            t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
-            copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
+    t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+    copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < 5*system->N; i++ ) {
-            if (check_zero (workspace->t[i], t_s[i])) {
-            //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i);
+    count = 0;
+    for (int i = 0; i < (RESTART + 1); i++ ) {
+        if (check_zero (workspace->hs[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]);
             count ++;
-            }
-            }
-            fprintf (stderr, "t mismatch count %d\n", count);
+        }
+    }
+    fprintf (stderr, "hs mismatch count %d\n", count);
-            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N);
-            copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+    t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) );
+    copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < (RESTART + 1)*system->N; i++ ) {
-            if (check_zero (workspace->v[i], t_s[i])) {
-            //fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i);
+    count = 0;
+    for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) {
+        if (check_zero (workspace->h[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]);
             count ++;
-            }
-            }
-            fprintf (stderr, "v mismatch count %d\n", count);
+        }
+    }
+    fprintf (stderr, "h mismatch count %d\n", count);
-            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-            copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+    t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+    copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < (RESTART + 1); i++ ) {
-            if (check_zero (workspace->y[i], t_s[i])) {
-            //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]);
+    count = 0;
+    for (int i = 0; i < (RESTART + 1); i++ ) {
+        if (check_zero (workspace->g[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i);
             count ++;
-            }
-            }
-            fprintf (stderr, "y mismatch count %d\n", count);
+        }
+    }
+    fprintf (stderr, "g mismatch count %d\n", count);
+    */
-            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-            copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+        rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N );
+    copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N,  cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < (RESTART + 1); i++ ) {
-            if (check_zero (workspace->hc[i], t_s[i])) {
-                //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]);
-                count ++;
-            }
-            }
-            fprintf (stderr, "hc mismatch count %d\n", count);
+    count = 0;
+    for (int i = 0; i < system->N; i++ ) {
+        if (check_zero (workspace->v_const[i], r_s[i])) {
+            fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", 
+                    workspace->v_const[i][0], 
+                    workspace->v_const[i][1], 
+                    workspace->v_const[i][2], 
+                    r_s[i][0], 
+                    r_s[i][1], 
+                    r_s[i][2], 
+                    i);
+            exit (-1);
+            count ++;
+        }
+    }
+    //fprintf (stderr, "v_const mismatch count %d\n", count);
-            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-            copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+    free (test);
+    free (r_s);
+    return TRUE;
-            count = 0;
-            for (int i = 0; i < (RESTART + 1); i++ ) {
-                if (check_zero (workspace->hs[i], t_s[i])) {
-                    //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]);
-                    count ++;
-                }
-            }
-            fprintf (stderr, "hs mismatch count %d\n", count);
-            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) );
-            copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+int validate_data (reax_system *system, simulation_data *host)
+    simulation_data device;
-            count = 0;
-            for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) {
-                if (check_zero (workspace->h[i], t_s[i])) {
-                    //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]);
-                    count ++;
-                }
-            }
-            fprintf (stderr, "h mismatch count %d\n", count);
+    copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-            copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+    if (check_zero (host->E_BE, device.E_BE)){
+        fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE);
+        exit (-1);
+    }
-            count = 0;
-            for (int i = 0; i < (RESTART + 1); i++ ) {
-                if (check_zero (workspace->g[i], t_s[i])) {
-                    //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i);
-                    count ++;
-                }
-            }
-            fprintf (stderr, "g mismatch count %d\n", count);
-            */
-                rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N );
-            copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N,  cudaMemcpyDeviceToHost, __LINE__);
-            count = 0;
-            for (int i = 0; i < system->N; i++ ) {
-                if (check_zero (workspace->v_const[i], r_s[i])) {
-                    fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", 
-                            workspace->v_const[i][0], 
-                            workspace->v_const[i][1], 
-                            workspace->v_const[i][2], 
-                            r_s[i][0], 
-                            r_s[i][1], 
-                            r_s[i][2], 
-                            i);
-                    exit (-1);
-                    count ++;
-                }
-            }
-            //fprintf (stderr, "v_const mismatch count %d\n", count);
+    if (check_zero (host->E_Lp, device.E_Lp)){
+        fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp);
+        exit (-1);
+    }
-            free (test);
-            free (r_s);
-            return true;
-            }
+    if (check_zero (host->E_Ov, device.E_Ov)){
+        fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov);
+        exit (-1);
+    }
-            bool validate_data (reax_system *system, simulation_data *host)
-            {
-                simulation_data device;
+    if (check_zero (host->E_Un, device.E_Un)){
+        fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un);
+        exit (-1);
+    }
-                copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    if (check_zero (host->E_Tor, device.E_Tor)) {
+        fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor);
+        exit (-1);
+    }
-                if (check_zero (host->E_BE, device.E_BE)){
-                    fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE);
-                    exit (-1);
-                }
+    if (check_zero (host->E_Con, device.E_Con)) {
+        fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con);
+        exit (-1);
+    }
-                if (check_zero (host->E_Lp, device.E_Lp)){
-                    fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp);
-                    exit (-1);
-                }
+    if (check_zero (host->ext_press, device.ext_press)) {
+        fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press);
+        exit (-1);
+    }
-                if (check_zero (host->E_Ov, device.E_Ov)){
-                    fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov);
-                    exit (-1);
-                }
+    if (check_zero (host->E_HB, device.E_HB)) {
+        fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB);
+        exit (-1);
+    }
-                if (check_zero (host->E_Un, device.E_Un)){
-                    fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un);
-                    exit (-1);
-                }
+    if (check_zero (host->E_Ang, device.E_Ang)) {
+        fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang);
+        exit (-1);
+    }
-                if (check_zero (host->E_Tor, device.E_Tor)) {
-                    fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor);
-                    exit (-1);
-                }
+    if (check_zero (host->E_Pen, device.E_Pen)) {
+        fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen);
+        exit (-1);
+    }
-                if (check_zero (host->E_Con, device.E_Con)) {
-                    fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con);
-                    exit (-1);
-                }
+    if (check_zero (host->E_Coa, device.E_Coa)) {
+        fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa);
+        exit (-1);
+    }
-                if (check_zero (host->ext_press, device.ext_press)) {
-                    fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press);
-                    exit (-1);
-                }
+    if (check_zero (host->E_vdW, device.E_vdW)) {
+        fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW);
+        exit (-1);
+    }
-                if (check_zero (host->E_HB, device.E_HB)) {
-                    fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB);
-                    exit (-1);
-                }
+    if (check_zero (host->E_Ele, device.E_Ele)) {
+        fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele);
+        exit (-1);
+    }
-                if (check_zero (host->E_Ang, device.E_Ang)) {
-                    fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang);
-                    exit (-1);
-                }
+    if (check_zero (host->E_Pol, device.E_Pol)) {
+        fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol);
+        exit (-1);
+    }
-                if (check_zero (host->E_Pen, device.E_Pen)) {
-                    fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen);
-                    exit (-1);
-                }
-                if (check_zero (host->E_Coa, device.E_Coa)) {
-                    fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa);
-                    exit (-1);
-                }
+    //fprintf (stderr, "Simulation Data match between host and device \n");
+    return TRUE;
-                if (check_zero (host->E_vdW, device.E_vdW)) {
-                    fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW);
-                    exit (-1);
-                }
-                if (check_zero (host->E_Ele, device.E_Ele)) {
-                    fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele);
-                    exit (-1);
-                }
+void print_bond_data (bond_order_data *s)
+    /*
+       fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", 
+       s->BO, 
+       s->BO_s, 
+       s->BO_pi,
+       s->BO_pi2 );
+     */
+    fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
+    fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
+    fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
-                if (check_zero (host->E_Pol, device.E_Pol)) {
-                    fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol);
-                    exit (-1);
-                }
+void print_bond_list (reax_system *system, static_storage *workspace, list **lists)
+    list *bonds = *lists + BONDS;
-                //fprintf (stderr, "Simulation Data match between host and device \n");
-                return true;
-            }
+    for (int i = 1; i < 2; i++)
+    {
+        fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
+        for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) 
+        {
+            bond_data *data = &bonds->select.bond_list [j];
+            fprintf (stderr, "  %d, ", data->nbr );
+            print_bond_data (&data->bo_data);
+            fprintf (stderr, ")\n");
+        }
+    }
-            void print_bond_data (bond_order_data *s)
-            {
-                /*
-                   fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", 
-                   s->BO, 
-                   s->BO_s, 
-                   s->BO_pi,
-                   s->BO_pi2 );
-                 */
-                fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
-                fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
-                fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
-            }
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
+    list *d_bonds = dev_lists + BONDS;
+    bond_data *d_bond_data;
-            void print_bond_list (reax_system *system, static_storage *workspace, list **lists)
-            {
-                list *bonds = *lists + BONDS;
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-                for (int i = 1; i < 2; i++)
-                {
-                    fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
-                    for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) 
-                    {
-                        bond_data *data = &bonds->select.bond_list [j];
-                        fprintf (stderr, "  %d, ", data->nbr );
-                        print_bond_data (&data->bo_data);
-                        fprintf (stderr, ")\n");
-                    }
-                }
+    copy_host_device ( b_start, d_bonds->index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < 2; i++)
+    {
+        fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
+            bond_data *src = &d_bond_data[j];
+            fprintf (stderr, "  %d, ", src->nbr );
+            print_bond_data (&src->bo_data);
+            fprintf (stderr, ")\n");
+        }
+    }
-                int *b_start = (int *) malloc (INT_SIZE * system->N);
-                int *b_end = (int *) malloc (INT_SIZE * system->N);
-                list *d_bonds = dev_lists + BONDS;
-                bond_data *d_bond_data;
-                d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+void count_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+    list *three = *lists + THREE_BODIES;
+    list *bonds = *lists + BONDS;
-                copy_host_device ( b_start, d_bonds->index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( b_end, d_bonds->end_index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                for (int i = 0; i < 2; i++)
-                {
-                    fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
-                    for (int j = b_start[i]; j < b_end[i]; j ++) {
-                        bond_data *src = &d_bond_data[j];
-                        fprintf (stderr, "  %d, ", src->nbr );
-                        print_bond_data (&src->bo_data);
-                        fprintf (stderr, ")\n");
-                    }
-                }
-            }
+    list *d_three = dev_lists + THREE_BODIES;
+    list *d_bonds = dev_lists + BONDS;
+    bond_data *d_bond_data;
+    real *test;
+    three_body_interaction_data *data = (three_body_interaction_data *) 
+        malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+    int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
+    int count;
+    int hcount, dcount;
+    copy_host_device ( start, d_three->index, 
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( end, d_three->end_index, 
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( data, d_three->select.three_body_list, 
+            sizeof (three_body_interaction_data) * system->num_thbodies, 
+            cudaMemcpyDeviceToHost, __LINE__);
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+    copy_host_device ( b_start, d_bonds->index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-            void count_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-            {
-                list *three = *lists + THREE_BODIES;
-                list *bonds = *lists + BONDS;
-                list *d_three = dev_lists + THREE_BODIES;
-                list *d_bonds = dev_lists + BONDS;
-                bond_data *d_bond_data;
-                real *test;
-                three_body_interaction_data *data = (three_body_interaction_data *) 
-                    malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-                int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-                int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-                int *b_start = (int *) malloc (INT_SIZE * system->N);
-                int *b_end = (int *) malloc (INT_SIZE * system->N);
-                int count;
-                int hcount, dcount;
-                copy_host_device ( start, d_three->index, 
-                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( end, d_three->end_index, 
-                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( data, d_three->select.three_body_list, 
-                        sizeof (three_body_interaction_data) * system->num_thbodies, 
-                        cudaMemcpyDeviceToHost, __LINE__);
-                d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-                copy_host_device ( b_start, d_bonds->index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( b_end, d_bonds->end_index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                count = 0;
-                hcount = dcount = 0;
-                for (int i = 0; i < system->N; i++)
-                {
-                    for (int j = b_start[i]; j < b_end[i]; j ++) {
-                        dcount += end[j] - start[j];
-                    }
-                }
+    count = 0;
+    hcount = dcount = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
+            dcount += end[j] - start[j];
+        }
+    }
-                fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount);
+    fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount);
-                free (data);
-                free (start);
-                free (end);
-                free (b_start);
-                free (b_end);
-                free (d_bond_data);
-            }
+    free (data);
+    free (start);
+    free (end);
+    free (b_start);
+    free (b_end);
+    free (d_bond_data);
+int validate_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+    list *three = *lists + THREE_BODIES;
+    list *bonds = *lists + BONDS;
-            bool validate_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-            {
-                list *three = *lists + THREE_BODIES;
-                list *bonds = *lists + BONDS;
+    list *d_three = dev_lists + THREE_BODIES;
+    list *d_bonds = dev_lists + BONDS;
+    bond_data *d_bond_data;
+    real *test;
-                list *d_three = dev_lists + THREE_BODIES;
-                list *d_bonds = dev_lists + BONDS;
-                bond_data *d_bond_data;
-                real *test;
+    three_body_interaction_data *data = (three_body_interaction_data *) 
+        malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+    int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-                three_body_interaction_data *data = (three_body_interaction_data *) 
-                    malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-                int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-                int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
+    int count;
+    int hcount, dcount;
-                int *b_start = (int *) malloc (INT_SIZE * system->N);
-                int *b_end = (int *) malloc (INT_SIZE * system->N);
-                int count;
-                int hcount, dcount;
+    copy_host_device ( start, d_three->index, 
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( end, d_three->end_index, 
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( data, d_three->select.three_body_list, 
+            sizeof (three_body_interaction_data) * system->num_thbodies, 
+            cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( start, d_three->index, 
-                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( end, d_three->end_index, 
-                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( data, d_three->select.three_body_list, 
-                        sizeof (three_body_interaction_data) * system->num_thbodies, 
-                        cudaMemcpyDeviceToHost, __LINE__);
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-                d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+    copy_host_device ( b_start, d_bonds->index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( b_start, d_bonds->index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( b_end, d_bonds->end_index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    //test = (real *) malloc (REAL_SIZE * system->num_bonds);
+    //memset (test, 0, REAL_SIZE * system->num_bonds);
+    //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                //test = (real *) malloc (REAL_SIZE * system->num_bonds);
-                //memset (test, 0, REAL_SIZE * system->num_bonds);
-                //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++)
-                count = 0;
-                for (int i = 0; i < system->N; i++)
-                {
-                    //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++)
+        hcount = dcount = 0;
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
+            dcount += end[j] - start[j];
+            hcount += Num_Entries (j, three);
-                    hcount = dcount = 0;
-                    for (int j = b_start[i]; j < b_end[i]; j ++) {
-                        dcount += end[j] - start[j];
-                        hcount += Num_Entries (j, three);
+            /*
+               if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three)))
+               {
+               fprintf (stderr, " Three body count does not match between host and device\n");
+               fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three));
+               fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]);
+               }
+             */
+        }
-                        /*
-                           if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three)))
-                           {
-                           fprintf (stderr, " Three body count does not match between host and device\n");
-                           fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three));
-                           fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]);
-                           }
-                         */
-                    }
+        if ((dcount != hcount)) {
-                    if ((dcount != hcount)) {
+            fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); 
-                        fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); 
+            for (int j = b_start[i]; j < b_end[i]; j ++) {
+                bond_order_data *src = &d_bond_data[j].bo_data;
+                dcount = end[j] - start[j];
+                hcount = Num_Entries (j, three);
+                fprintf (stderr, "device \n");
+                print_bond_data (src);
-                        for (int j = b_start[i]; j < b_end[i]; j ++) {
-                            bond_order_data *src = &d_bond_data[j].bo_data;
-                            dcount = end[j] - start[j];
-                            hcount = Num_Entries (j, three);
-                            fprintf (stderr, "device \n");
-                            print_bond_data (src);
+                fprintf (stderr, "\n");
+                src = &bonds->select.bond_list[j].bo_data;
+                fprintf (stderr, "host \n");
+                print_bond_data (src);
+                fprintf (stderr, "\n");
-                            fprintf (stderr, "\n");
-                            src = &bonds->select.bond_list[j].bo_data;
-                            fprintf (stderr, "host \n");
-                            print_bond_data (src);
-                            fprintf (stderr, "\n");
+                //fprintf (stderr, "--- Device bo is %f \n", test[j]);
+                fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],  
+                        Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
+                fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
+                fprintf (stderr, "------\n");
+            }
+            fprintf (stderr, " Three Bodies count does not match between host and device \n");
+            exit (-1);
+        }
+    }
-                            //fprintf (stderr, "--- Device bo is %f \n", test[j]);
-                            fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],  
-                                    Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
-                            fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
-                            fprintf (stderr, "------\n");
-                        }
-                        fprintf (stderr, " Three Bodies count does not match between host and device \n");
-                        exit (-1);
-                    }
-                }
+    //fprintf (stderr, "Three body count on DEVICE %d  HOST %d \n", dcount, hcount);
-                //fprintf (stderr, "Three body count on DEVICE %d  HOST %d \n", dcount, hcount);
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        int x, y, z;
+        for (x = b_start[i]; x < b_end[i]; x++)
+        {
+            int t_start = start[x];
+            int t_end = end[x];
-                count = 0;
-                for (int i = 0; i < system->N; i++)
+            bond_data *dev_bond = &d_bond_data [x];
+            bond_data *host_bond;
+            for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
+            {
+                host_bond = &bonds->select.bond_list [z];
+                if ((dev_bond->nbr == host_bond->nbr) &&
+                        check_same (dev_bond->rel_box, host_bond->rel_box) && 
+                        !check_zero (dev_bond->dvec, host_bond->dvec) &&
+                        !check_zero (dev_bond->d, host_bond->d) )
-                    int x, y, z;
-                    for (x = b_start[i]; x < b_end[i]; x++)
-                    {
-                        int t_start = start[x];
-                        int t_end = end[x];
-                        bond_data *dev_bond = &d_bond_data [x];
-                        bond_data *host_bond;
-                        for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
-                        {
-                            host_bond = &bonds->select.bond_list [z];
-                            if ((dev_bond->nbr == host_bond->nbr) &&
-                                    check_same (dev_bond->rel_box, host_bond->rel_box) && 
-                                    !check_zero (dev_bond->dvec, host_bond->dvec) &&
-                                    !check_zero (dev_bond->d, host_bond->d) )
-                            {
-                                break;
-                            }
-                        }
-                        if (z >= End_Index (i, bonds)){
-                            fprintf (stderr, "Could not find the matching bond on host and device \n");
-                            exit (-1);
-                        }
-                        //find this bond in the bonds on the host side.
-                        for (y = t_start; y < t_end; y++)
-                        {
-                            three_body_interaction_data *device = data + y;
-                            three_body_interaction_data *host;
-                            //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
-                            int xx;    
-                            for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
-                            {
-                                host = &three->select.three_body_list [xx];
-                                //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
-                                //if ((host->thb == device->thb) && (host->pthb == device->pthb))
-                                if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
-                                {
-                                    count ++;
-                                    break;
-                                }
-                            }
-                            if ( xx >= End_Index (z, three) ) {
-                                fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, 
-                                        Start_Index (z, three), End_Index (z, three), start[x], end[x] );
-                                exit (-1);
-                            }// else fprintf (stderr, "----------------- \n");
-                        }
-                    }
+                    break;
-                free (data);
-                free (start);
-                free (end);
-                free (b_start);
-                free (b_end);
-                free (d_bond_data);
-                //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
-                return true;
+            if (z >= End_Index (i, bonds)){
+                fprintf (stderr, "Could not find the matching bond on host and device \n");
+                exit (-1);
+            }
+            //find this bond in the bonds on the host side.
-            bool bin_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+            for (y = t_start; y < t_end; y++)
-                list *d_three = dev_lists + THREE_BODIES;
-                list *d_bonds = dev_lists + BONDS;
-                list *three = *lists + THREE_BODIES;
-                list *bonds = *lists + BONDS;
-                bond_data *d_bond_data;
-                three_body_interaction_data *data = (three_body_interaction_data *) 
-                    malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-                int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-                int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-                int *b_start = (int *) malloc (INT_SIZE * system->N);
-                int *b_end = (int *) malloc (INT_SIZE * system->N);
-                int *a = (int *) malloc (2 * INT_SIZE * system->N );
-                int *b = (int *) malloc (2 * INT_SIZE * system->N );
-                int *c = (int *) malloc (2 * INT_SIZE * system->N );
-                int *d = (int *) malloc (2 * INT_SIZE * system->N );
-                for (int i = 0; i < 2 * system->N; i++)
-                    a[i] = b[i] = c[i] = d[i] = -1;
-                int count;
-                int hcount, dcount;
-                int index_a, index_b, index_c, index_d;
-                index_a = index_b = index_c = index_d = 0;
-                copy_host_device ( start, d_three->index, 
-                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( end, d_three->end_index, 
-                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( data, d_three->select.three_body_list, 
-                        sizeof (three_body_interaction_data) * system->num_thbodies, 
-                        cudaMemcpyDeviceToHost, __LINE__);
-                d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-                copy_host_device ( b_start, d_bonds->index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device ( b_end, d_bonds->end_index, 
-                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-                count = 0;
-                hcount = dcount = 0;
-                /*
-                   for (int i = 0; i < 20; i++)
-                   {
-                   for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
-                   {
-                   for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++)
-                   {
-                   three_body_interaction_data *host = &three->select.three_body_list [k];
-                   fprintf (stderr, " atom %d bond (%d %d) -- %d,  (%d %d)\n", 
-                   i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb );
-                   }
-                   }
-                   }
-                   exit (-1);
-                 */
-                count = 0;
-                for (int i = 0; i < system->N; i++)
-                {
-                    for (int j = b_start[i]; j < b_end[i]; j ++) {
-                        /*
-                           bond_data *src;
-                           src = &d_bond_data[j];
-                           fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr );
-                         */
+                three_body_interaction_data *device = data + y;
+                three_body_interaction_data *host;
-                        for (int x = start[j]; x < end[j]; x ++)
-                        {
-                            three_body_interaction_data *device = data + x;
-                            int center = device->j;
-                            int d_i = device->i;
-                            int d_k = device->k;
-                            //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n", 
-                            //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, device->pthb);
-                            if ((a[system->N + center] != -1)) {
-                                a[d_i] = a[d_k] = 1;
-                                continue;
-                            } else if ((b[system->N + center] != -1)) {
-                                b[d_i] = b[d_k] = 1;
-                                continue;
-                            } else if ((c[system->N + center] != -1)) {
-                                c[d_i] = c[d_k] = 1;
-                                continue;
-                            } else if ((d[system->N + center] != -1)) {
-                                d[d_i] = d[d_k] = 1;
-                                continue;
-                            }
-                            if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) {
-                                a[center] = a[d_i] = a[d_k] = 1;
-                                a[system->N + center] = 1;
-                            } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) {
-                                b[center] =  b[d_i] = b[d_k] = 1;
-                                b[system->N + center] = 1;
-                            } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) {
-                                c[center] =  c[d_i] = c[d_k] = 1;
-                                c[system->N + center] = 1;
-                            } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) {
-                                d[center] =  d[d_i] = d[d_k] = 1;
-                                d[system->N + center]= 1;
-                            }
-                            else {
-                                count ++;
-                                break;
-                                fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n", 
-                                        i, b_start[i], b_end[i], j, center, d_i, d_k);
-                                fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n", 
-                                        a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]);
-                                fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n", 
-                                        b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]);
-                                fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n", 
-                                        c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]);
-                                fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n", 
-                                        d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]);
-                            }
-                        }
-                    }
-                }
-                fprintf (stderr, "Miscount is %d \n", count);
-                exit (-1);
+                //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
-                count = 0;
-                for (int i = 0; i < system->N; i++)
+                int xx;    
+                for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
-                    if (a[system->N + i] != -1) count ++;
-                    if (b[system->N + i] != -1) count ++;
-                    if (c[system->N + i] != -1) count ++;
-                    if (d[system->N + i] != -1) count ++;
+                    host = &three->select.three_body_list [xx];
+                    //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
+                    //if ((host->thb == device->thb) && (host->pthb == device->pthb))
+                    if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
+                    {
+                        count ++;
+                        break;
+                    }
-                fprintf (stderr, "binned so many atoms --> %d \n", count );
+                if ( xx >= End_Index (z, three) ) {
+                    fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, 
+                            Start_Index (z, three), End_Index (z, three), start[x], end[x] );
+                    exit (-1);
+                }// else fprintf (stderr, "----------------- \n");
+        }
+    }
+    free (data);
+    free (start);
+    free (end);
+    free (b_start);
+    free (b_end);
+    free (d_bond_data);
-            bool validate_grid (reax_system *system)
-            {
-                int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2];
-                int count = 0;
+    //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
+    return TRUE;
-                int *dtop = (int *) malloc (INT_SIZE * total );
-                copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__);
-                for (int i = 0; i < total; i++){
-                    if (system->g.top[i] != dtop[i]){
-                        fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i );
-                        exit (-1);
-                    }
-                }
-                free (dtop);
+int bin_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+    list *d_three = dev_lists + THREE_BODIES;
+    list *d_bonds = dev_lists + BONDS;
+    list *three = *lists + THREE_BODIES;
+    list *bonds = *lists + BONDS;
+    bond_data *d_bond_data;
-                int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms);
-                copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__);
-                for (int i = 0; i < total*system->d_g.max_atoms; i++){
-                    if (system->g.atoms[i] != datoms[i]){
-                        fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i );
-                        exit (-1);
-                    }
-                }
-                free (datoms);
+    three_body_interaction_data *data = (three_body_interaction_data *) 
+        malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+    int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+    int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-                ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs);
-                copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
-                for (int i = 0; i < total*system->d_g.max_nbrs; i++){
-                    if (!check_same (system->g.nbrs[i], dnbrs[i])){
-                        fprintf (stderr, " nbrs count does not match @ index %d \n", i );
-                        exit (-1);
-                    }
-                }
-                free (dnbrs);
+    int *b_start = (int *) malloc (INT_SIZE * system->N);
+    int *b_end = (int *) malloc (INT_SIZE * system->N);
-                rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs);
-                copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
-                for (int i = 0; i < total*system->d_g.max_nbrs; i++){
-                    if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){
-                        fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i );
-                        exit (-1);
-                    }
-                }
-                free (dnbrs_cp);
+    int *a = (int *) malloc (2 * INT_SIZE * system->N );
+    int *b = (int *) malloc (2 * INT_SIZE * system->N );
+    int *c = (int *) malloc (2 * INT_SIZE * system->N );
+    int *d = (int *) malloc (2 * INT_SIZE * system->N );
-                //fprintf (stderr, " Grid match between device and host \n");
-                return true;
-            }
+    for (int i = 0; i < 2 * system->N; i++)
+        a[i] = b[i] = c[i] = d[i] = -1;
-            void print_atoms (reax_system *system)
-            {
-                int start, end, index;
+    int count;
+    int hcount, dcount;
+    int index_a, index_b, index_c, index_d;
+    index_a = index_b = index_c = index_d = 0;
-                reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
-                copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+    copy_host_device ( start, d_three->index, 
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( end, d_three->end_index, 
+            INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( data, d_three->select.three_body_list, 
+            sizeof (three_body_interaction_data) * system->num_thbodies, 
+            cudaMemcpyDeviceToHost, __LINE__);
-                //for (int i = 0; i < system->N; i++) 
-                for (int i = 0; i < 10; i++) 
-                {
-                    fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type);
-                    fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] );
-                    fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] );
-                    fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] );
-                    fprintf (stderr, " q(%6.10f) \n", test[i].q );
-                }
-            }
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+    copy_host_device ( b_start, d_bonds->index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( b_end, d_bonds->end_index, 
+            INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+    count = 0;
+    hcount = dcount = 0;
+    /*
+       for (int i = 0; i < 20; i++)
+       {
+       for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
+       {
+       for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++)
+       {
+       three_body_interaction_data *host = &three->select.three_body_list [k];
+       fprintf (stderr, " atom %d bond (%d %d) -- %d,  (%d %d)\n", 
+       i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb );
+       }
+       }
+       }
+       exit (-1);
+     */
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        for (int j = b_start[i]; j < b_end[i]; j ++) {
-            void print_sys_atoms (reax_system *system)
+            /*
+               bond_data *src;
+               src = &d_bond_data[j];
+               fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr );
+             */
+            for (int x = start[j]; x < end[j]; x ++)
-                for (int i = 0; i < 10; i++) 
-                {
-                    fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type);
-                    fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );
-                    fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] );
-                    fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
-                    fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q );
+                three_body_interaction_data *device = data + x;
+                int center = device->j;
+                int d_i = device->i;
+                int d_k = device->k;
+                //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n", 
+                //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, device->pthb);
+                if ((a[system->N + center] != -1)) {
+                    a[d_i] = a[d_k] = 1;
+                    continue;
+                } else if ((b[system->N + center] != -1)) {
+                    b[d_i] = b[d_k] = 1;
+                    continue;
+                } else if ((c[system->N + center] != -1)) {
+                    c[d_i] = c[d_k] = 1;
+                    continue;
+                } else if ((d[system->N + center] != -1)) {
+                    d[d_i] = d[d_k] = 1;
+                    continue;
-            }
+                if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) {
+                    a[center] = a[d_i] = a[d_k] = 1;
+                    a[system->N + center] = 1;
+                } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) {
+                    b[center] =  b[d_i] = b[d_k] = 1;
+                    b[system->N + center] = 1;
+                } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) {
+                    c[center] =  c[d_i] = c[d_k] = 1;
+                    c[system->N + center] = 1;
+                } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) {
+                    d[center] =  d[d_i] = d[d_k] = 1;
+                    d[system->N + center]= 1;
+                }
+                else {
+                    count ++;
+                    break;
+                    fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n", 
+                            i, b_start[i], b_end[i], j, center, d_i, d_k);
+                    fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n", 
+                            a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]);
+                    fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n", 
+                            b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]);
+                    fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n", 
+                            c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]);
+                    fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n", 
+                            d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]);
-            void print_grid (reax_system *system)
-            {
-                int i, j, k, x;
-                grid *g = &system->g;
-                for( i = 0; i < g->ncell[0]; i++ )
-                    for( j = 0; j < g->ncell[1]; j++ )
-                        for( k = 0; k < g->ncell[2]; k++ ){
-                            fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k);
-                            for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){
-                                fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]);
-                            }
-                            fprintf (stderr, ")\n");
-                        }
+                }
+        }
+    }
+    fprintf (stderr, "Miscount is %d \n", count);
+    exit (-1);
+    count = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+        if (a[system->N + i] != -1) count ++;
+        if (b[system->N + i] != -1) count ++;
+        if (c[system->N + i] != -1) count ++;
+        if (d[system->N + i] != -1) count ++;
+    }
+    fprintf (stderr, "binned so many atoms --> %d \n", count );
+int validate_grid (reax_system *system)
+    int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2];
+    int count = 0;
+    int *dtop = (int *) malloc (INT_SIZE * total );
+    copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total; i++){
+        if (system->g.top[i] != dtop[i]){
+            fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i );
+            exit (-1);
+        }
+    }
+    free (dtop);
+    int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms);
+    copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total*system->d_g.max_atoms; i++){
+        if (system->g.atoms[i] != datoms[i]){
+            fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i );
+            exit (-1);
+        }
+    }
+    free (datoms);
+    ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs);
+    copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total*system->d_g.max_nbrs; i++){
+        if (!check_same (system->g.nbrs[i], dnbrs[i])){
+            fprintf (stderr, " nbrs count does not match @ index %d \n", i );
+            exit (-1);
+        }
+    }
+    free (dnbrs);
+    rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs);
+    copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
+    for (int i = 0; i < total*system->d_g.max_nbrs; i++){
+        if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){
+            fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i );
+            exit (-1);
+        }
+    }
+    free (dnbrs_cp);
+    //fprintf (stderr, " Grid match between device and host \n");
+    return TRUE;
+void print_atoms (reax_system *system)
+    int start, end, index;
+    reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
+    copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+    //for (int i = 0; i < system->N; i++) 
+    for (int i = 0; i < 10; i++) 
+    {
+        fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type);
+        fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] );
+        fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] );
+        fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] );
+        fprintf (stderr, " q(%6.10f) \n", test[i].q );
+    }
+void print_sys_atoms (reax_system *system)
+    for (int i = 0; i < 10; i++) 
+    {
+        fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type);
+        fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );
+        fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] );
+        fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
+        fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q );
+    }
+void print_grid (reax_system *system)
+    int i, j, k, x;
+    grid *g = &system->g;
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ){
+                fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k);
+                for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){
+                    fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]);
+                }
+                fprintf (stderr, ")\n");
+            }
diff --git a/PuReMD-GPU/src/validation.h b/PuReMD-GPU/src/validation.h
index 5ef9d2f37393c37eb4430238bdf0c68734842bcc..5eccf7d94f8c7716d45b45305155b0492b70dda2 100644
--- a/PuReMD-GPU/src/validation.h
+++ b/PuReMD-GPU/src/validation.h
@@ -23,35 +23,35 @@
 #include "mytypes.h"
-bool check_zero (real , real );
-bool check_zero (rvec , rvec );
-bool check_same (ivec , ivec );
+int check_zero (real , real );
+int check_zero (rvec , rvec );
+int check_same (ivec , ivec );
-bool validate_box (simulation_box *host, simulation_box *dev);
-bool validate_atoms (reax_system *, list **);
-bool validate_grid (reax_system *);
+int validate_box (simulation_box *host, simulation_box *dev);
+int validate_atoms (reax_system *, list **);
+int validate_grid (reax_system *);
-bool validate_bonds (reax_system *, static_storage *, list **);
-bool validate_hbonds (reax_system *, static_storage *, list **);
-bool validate_sym_dbond_indices (reax_system *, static_storage *, list **);
-bool validate_three_bodies (reax_system *, static_storage *, list **);
+int validate_bonds (reax_system *, static_storage *, list **);
+int validate_hbonds (reax_system *, static_storage *, list **);
+int validate_sym_dbond_indices (reax_system *, static_storage *, list **);
+int validate_three_bodies (reax_system *, static_storage *, list **);
 void count_three_bodies (reax_system *system, static_storage *workspace, list **lists);
-bool bin_three_bodies (reax_system *, static_storage *, list **);
+int bin_three_bodies (reax_system *, static_storage *, list **);
-bool validate_sort_matrix (reax_system *, static_storage *);
-bool validate_sparse_matrix (reax_system *, static_storage *);
-bool validate_lu (static_storage *);
+int validate_sort_matrix (reax_system *, static_storage *);
+int validate_sparse_matrix (reax_system *, static_storage *);
+int validate_lu (static_storage *);
 void print_sparse_matrix (reax_system *, static_storage *);
 void print_bond_list (reax_system *, static_storage *, list **);
-bool validate_workspace (reax_system *, static_storage *, list **);
-bool validate_neighbors (reax_system *, list **lists);
+int validate_workspace (reax_system *, static_storage *, list **);
+int validate_neighbors (reax_system *, list **lists);
-bool validate_data (reax_system *, simulation_data *);
+int validate_data (reax_system *, simulation_data *);
-bool analyze_hbonds (reax_system *, static_storage *, list **);
+int analyze_hbonds (reax_system *, static_storage *, list **);
 void Print_Matrix (sparse_matrix *);
 void Print_Matrix_L (sparse_matrix *);
diff --git a/PuReMD-GPU/src/vector.cu b/PuReMD-GPU/src/vector.c
similarity index 100%
rename from PuReMD-GPU/src/vector.cu
rename to PuReMD-GPU/src/vector.c
diff --git a/PuReMD-GPU/src/vector.h b/PuReMD-GPU/src/vector.h
index 336534784e50fd5552aaf27ff0c9531c3b97a02a..e1111e514928e79fc79197a0f2486d5eefb1cfa3 100644
--- a/PuReMD-GPU/src/vector.h
+++ b/PuReMD-GPU/src/vector.h
@@ -22,8 +22,14 @@
 #define __VECTOR_H_
 #include "mytypes.h"
 #include "random.h"
+#ifdef __cplusplus
+extern "C"  {
 int  Vector_isZero( real*, int );
 void Vector_MakeZero( real*, int );
 void Vector_Copy( real*, real*, int );
@@ -33,8 +39,6 @@ void Vector_Copy( real*, real*, int );
 void Vector_Print( FILE*, char*, real*, int );
 real Norm( real*, int );
-HOST_DEVICE inline real Dot( real*, real*, int );
 void rvec_Sum( rvec, rvec, rvec );
 real rvec_ScaledDot( real, rvec, real, rvec );
 void rvec_Multiply( rvec, rvec, rvec );
@@ -44,19 +48,6 @@ void rvec_Invert( rvec, rvec );
 void rvec_OuterProduct( rtensor, rvec, rvec );
 int  rvec_isZero( rvec );
-HOST_DEVICE inline real rvec_Dot( rvec, rvec );
-HOST_DEVICE inline void rvec_Scale( rvec, real, rvec );
-HOST_DEVICE inline real rvec_Norm_Sqr( rvec );
-HOST_DEVICE inline void rvec_Random( rvec );
-HOST_DEVICE inline void rvec_MakeZero( rvec );
-HOST_DEVICE inline void rvec_Add( rvec, rvec );
-HOST_DEVICE inline void rvec_Copy( rvec, rvec );
-HOST_DEVICE inline void rvec_Cross( rvec, rvec, rvec );
-HOST_DEVICE inline void rvec_ScaledAdd( rvec, real, rvec );
-HOST_DEVICE inline void rvec_ScaledSum( rvec, real, rvec, real, rvec );
-HOST_DEVICE inline void rvec_iMultiply( rvec, ivec, rvec );
-HOST_DEVICE inline real rvec_Norm( rvec );
 void rtensor_MakeZero( rtensor );
 void rtensor_Multiply( rtensor, rtensor, rtensor );
 void rtensor_MatVec( rvec, rtensor, rvec );
@@ -80,16 +71,7 @@ void ivec_MakeZero( ivec );
 void ivec_rScale( ivec, real, rvec );
-HOST_DEVICE inline void ivec_Copy( ivec, ivec );
-HOST_DEVICE inline void ivec_Scale( ivec, real, ivec );
-HOST_DEVICE inline void ivec_Sum( ivec, ivec, ivec );
- * Code which is common to multiple HOST and DEVICE
- *
- */
-HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
+static inline HOST_DEVICE real Dot( real* v1, real* v2, int k )
     real ret = 0;
@@ -100,102 +82,109 @@ HOST_DEVICE inline real Dot( real* v1, real* v2, int k )
 //rvec functions
-HOST_DEVICE inline void rvec_MakeZero( rvec v )
+static inline HOST_DEVICE void rvec_MakeZero( rvec v )
     v[0] = v[1] = v[2] = ZERO;
-HOST_DEVICE inline void rvec_Add( rvec ret, rvec v )
+static inline HOST_DEVICE void rvec_Add( rvec ret, rvec v )
     ret[0] += v[0];
     ret[1] += v[1];
     ret[2] += v[2];
-HOST_DEVICE inline void rvec_Copy( rvec dest, rvec src )
+static inline HOST_DEVICE void rvec_Copy( rvec dest, rvec src )
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-HOST_DEVICE inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_Cross( rvec ret, rvec v1, rvec v2 )
     ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
     ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
     ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
-HOST_DEVICE inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_ScaledAdd( rvec ret, real c, rvec v )
     ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
-HOST_DEVICE inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
+static inline HOST_DEVICE void rvec_ScaledSum( rvec ret, real c1, rvec v1 , real c2, rvec v2 )
     ret[0] = c1 * v1[0] + c2 * v2[0];
     ret[1] = c1 * v1[1] + c2 * v2[1];
     ret[2] = c1 * v1[2] + c2 * v2[2];
-HOST_DEVICE inline void rvec_Random( rvec v )
+static inline HOST_DEVICE void rvec_Random( rvec v )
     v[0] = Random(2.0) - 1.0;
     v[1] = Random(2.0) - 1.0;
     v[2] = Random(2.0) - 1.0;
-HOST_DEVICE inline real rvec_Norm_Sqr( rvec v )
+static inline HOST_DEVICE real rvec_Norm_Sqr( rvec v )
     return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
-HOST_DEVICE inline void rvec_Scale( rvec ret, real c, rvec v )
+static inline HOST_DEVICE void rvec_Scale( rvec ret, real c, rvec v )
     ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
-HOST_DEVICE inline real rvec_Dot( rvec v1, rvec v2 )
+static inline HOST_DEVICE real rvec_Dot( rvec v1, rvec v2 )
     return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2];
-HOST_DEVICE inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
+static inline HOST_DEVICE void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
     r[0] = v1[0] * v2[0];
     r[1] = v1[1] * v2[1];
     r[2] = v1[2] * v2[2];
-HOST_DEVICE inline real rvec_Norm( rvec v )
+static inline HOST_DEVICE real rvec_Norm( rvec v )
     return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
 //ivec functions
-HOST_DEVICE inline void ivec_Copy( ivec dest , ivec src )
+static inline HOST_DEVICE void ivec_Copy( ivec dest , ivec src )
     dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-HOST_DEVICE inline void ivec_Scale( ivec dest, real C, ivec src )
+static inline HOST_DEVICE void ivec_Scale( ivec dest, real C, ivec src )
     dest[0] = C * src[0];
     dest[1] = C * src[1];
     dest[2] = C * src[2];
-HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
+static inline HOST_DEVICE void ivec_Sum( ivec dest, ivec v1, ivec v2 )
     dest[0] = v1[0] + v2[0];
     dest[1] = v1[1] + v2[1];
@@ -203,26 +192,32 @@ HOST_DEVICE inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
 //vector functions
-HOST_DEVICE inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+static inline HOST_DEVICE void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
     for (k--; k >= 0; k--)
         dest[k] = c * v[k] + d * y[k];
-HOST_DEVICE inline void Vector_Scale( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Scale( real* dest, real c, real* v, int k )
     for (k--; k >= 0; k--)
         dest[k] = c * v[k];
-HOST_DEVICE inline void Vector_Add( real* dest, real c, real* v, int k )
+static inline HOST_DEVICE void Vector_Add( real* dest, real c, real* v, int k )
     for (k--; k >= 0; k--)
         dest[k] += c * v[k];
+#ifdef __cplusplus
diff --git a/PuReMD/src/linear_solvers.c b/PuReMD/src/linear_solvers.c
index 835ffa358e4e6ff995e2f69dc694bb1c3f7798c1..d08dfe1ed75780bc90b5aa65bb13462393d9af47 100644
--- a/PuReMD/src/linear_solvers.c
+++ b/PuReMD/src/linear_solvers.c
@@ -36,7 +36,9 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
     real H;
     for ( i = 0; i < N; ++i )
+    {
         b[i][0] = b[i][1] = 0;
+    }
     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -64,7 +66,7 @@ void dual_Sparse_MatVec( sparse_matrix *A, rvec2 *x, rvec2 *b, int N )
 int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
-             rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
+        rvec2 *b, real tol, rvec2 *x, mpi_datatypes* mpi_data, FILE *fout )
     int  i, j, n, N, matvecs, scale;
     rvec2 tmp, alpha, beta;
@@ -86,13 +88,17 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         t_start = Get_Time( );
     Dist( system, mpi_data, x, mpi_data->mpi_rvec2, scale, rvec2_packer );
     dual_Sparse_MatVec( H, x, workspace->q2, N );
     // tryQEq
     Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
     for ( j = 0; j < system->n; ++j )
@@ -126,6 +132,7 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
     //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
         Update_Timing_Info( &t_start, &dot_time );
@@ -137,9 +144,12 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         dual_Sparse_MatVec( H, workspace->d2, workspace->q2, N );
         // tryQEq
         Coll(system, mpi_data, workspace->q2, mpi_data->mpi_rvec2, scale, rvec2_unpacker);
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
         /* dot product: d.q */
@@ -174,12 +184,18 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
         sig_old[1] = sig_new[1];
         MPI_Allreduce( &my_dot, &sig_new, 2, MPI_DOUBLE, MPI_SUM, comm );
         //fprintf( stderr, "sig_new: %f %f\n", sig_new[0], sig_new[1] );
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
         if ( sqrt(sig_new[0]) / b_norm[0] <= tol || sqrt(sig_new[1]) / b_norm[1] <= tol )
+        {
+        }
         beta[0] = sig_new[0] / sig_old[0];
         beta[1] = sig_new[1] / sig_old[1];
@@ -194,30 +210,41 @@ int dual_CG( reax_system *system, storage *workspace, sparse_matrix *H,
     if ( sqrt(sig_new[0]) / b_norm[0] <= tol )
         for ( j = 0; j < n; ++j )
+        {
             workspace->t[j] = workspace->x[j][1];
-        matvecs = CG( system, workspace, H, workspace->b_t, tol, workspace->t,
-                      mpi_data, fout );
+        }
+        matvecs = CG( system, workspace, H, workspace->b_t, tol,
+                workspace->t,mpi_data, fout );
         for ( j = 0; j < n; ++j )
+        {
             workspace->x[j][1] = workspace->t[j];
+        }
     else if ( sqrt(sig_new[1]) / b_norm[1] <= tol )
         for ( j = 0; j < n; ++j )
+        {
             workspace->s[j] = workspace->x[j][0];
+        }
         matvecs = CG( system, workspace, H, workspace->b_s, tol, workspace->s,
-                      mpi_data, fout );
+                mpi_data, fout );
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->x[j][0] = workspace->s[j];
+        }
     if ( i >= 300 )
+    {
         fprintf( stderr, "CG convergence failed!\n" );
+    }
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
-        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n",
-                 i + 1, matvecs, matvec_time, dot_time );
+    {
+        fprintf( fout, "QEq %d + %d iters. matvecs: %f  dot: %f\n", i + 1,
+                matvecs, matvec_time, dot_time );
+    }
     return (i + 1) + matvecs;
@@ -230,7 +257,9 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
     real H;
     for ( i = 0; i < N; ++i )
+    {
         b[i] = 0;
+    }
     /* perform multiplication */
     for ( i = 0; i < A->n; ++i )
@@ -249,8 +278,8 @@ void Sparse_MatVec( sparse_matrix *A, real *x, real *b, int N )
-int CG( reax_system *system, storage *workspace, sparse_matrix *H,
-        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+int CG( reax_system *system, storage *workspace, sparse_matrix *H, real *b,
+        real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;
@@ -269,21 +298,29 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
     Sparse_MatVec( H, x, workspace->q, system->N );
     // tryQEq
     Coll( system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker );
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &matvec_time );
+    }
     Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, system->n );
     for ( j = 0; j < system->n; ++j )
+    {
         workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j]; //pre-condition
+    }
     b_norm = Parallel_Norm( b, system->n, mpi_data->world );
     sig_new = Parallel_Dot(workspace->r, workspace->d, system->n, mpi_data->world);
     sig0 = sig_new;
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
+    {
         Update_Timing_Info( &t_start, &dot_time );
+    }
     for ( i = 1; i < 300 && sqrt(sig_new) / b_norm > tol; ++i )
@@ -292,9 +329,12 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Sparse_MatVec( H, workspace->d, workspace->q, system->N );
         Coll(system, mpi_data, workspace->q, MPI_DOUBLE, scale, real_unpacker);
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &matvec_time );
+        }
         tmp = Parallel_Dot(workspace->d, workspace->q, system->n, mpi_data->world);
@@ -303,15 +343,20 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
         Vector_Add( workspace->r, -alpha, workspace->q, system->n );
         /* pre-conditioning */
         for ( j = 0; j < system->n; ++j )
+        {
             workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+        }
         sig_old = sig_new;
         sig_new = Parallel_Dot(workspace->r, workspace->p, system->n, mpi_data->world);
         beta = sig_new / sig_old;
         Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, system->n );
 #if defined(CG_PERFORMANCE)
         if ( system->my_rank == MASTER_NODE )
+        {
             Update_Timing_Info( &t_start, &dot_time );
+        }
@@ -323,8 +368,10 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
 #if defined(CG_PERFORMANCE)
     if ( system->my_rank == MASTER_NODE )
-        fprintf( fout, "QEq %d iters. matvecs: %f  dot: %f\n",
-                 i, matvec_time, dot_time );
+    {
+        fprintf( fout, "QEq %d iters. matvecs: %f  dot: %f\n", i, matvec_time,
+                dot_time );
+    }
     return i;
@@ -332,7 +379,7 @@ int CG( reax_system *system, storage *workspace, sparse_matrix *H,
 int CG_test( reax_system *system, storage *workspace, sparse_matrix *H,
-             real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
+        real *b, real tol, real *x, mpi_datatypes* mpi_data, FILE *fout )
     int  i, j, scale;
     real tmp, alpha, beta, b_norm;