From bb006b442445353331dbd0968bf83fe7bf0ae746 Mon Sep 17 00:00:00 2001
From: "Kurt A. O'Hearn" <ohearnku@cse.msu.edu>
Date: Mon, 20 Jun 2016 20:59:17 -0400
Subject: [PATCH] More code formatting.

---
 PG-PuReMD/src/center_mass.cu               | 1000 ++--
 PG-PuReMD/src/cuda_bond_orders.cu          | 1564 +++---
 PG-PuReMD/src/cuda_bonds.cu                |  220 +-
 PG-PuReMD/src/cuda_copy.cu                 |  316 +-
 PG-PuReMD/src/cuda_environment.cu          |   64 +-
 PG-PuReMD/src/cuda_forces.cu               | 3113 ++++++------
 PG-PuReMD/src/cuda_hydrogen_bonds.cu       | 1212 ++---
 PG-PuReMD/src/cuda_init_md.cu              |    4 +-
 PG-PuReMD/src/cuda_integrate.cu            |  122 +-
 PG-PuReMD/src/cuda_linear_solvers.cu       |  360 +-
 PG-PuReMD/src/cuda_lookup.cu               |  132 +-
 PG-PuReMD/src/cuda_multi_body.cu           |  518 +-
 PG-PuReMD/src/cuda_neighbors.cu            | 1318 ++---
 PG-PuReMD/src/cuda_nonbonded.cu            | 1000 ++--
 PG-PuReMD/src/cuda_post_evolve.cu          |   38 +-
 PG-PuReMD/src/cuda_qEq.cu                  |  190 +-
 PG-PuReMD/src/cuda_reset_tools.cu          |  286 +-
 PG-PuReMD/src/cuda_torsion_angles.cu       | 1160 ++---
 PG-PuReMD/src/cuda_utils.cu                |  144 +-
 PG-PuReMD/src/cuda_valence_angles.cu       | 1094 ++---
 PG-PuReMD/src/dev_alloc.cu                 |  794 +--
 PG-PuReMD/src/dev_list.cu                  |  138 +-
 PG-PuReMD/src/dev_system_props.cu          |  444 +-
 PG-PuReMD/src/dual_matvec.cu               |  208 +-
 PG-PuReMD/src/matvec.cu                    |  100 +-
 PG-PuReMD/src/reduction.cu                 |  756 +--
 PG-PuReMD/src/validation.cu                | 3104 ++++++------
 PG-PuReMD/src/vector.cu                    |  772 +--
 PuReMD-GPU/src/GMRES.cu                    | 1940 ++++----
 PuReMD-GPU/src/QEq.cu                      | 1446 +++---
 PuReMD-GPU/src/allocate.cu                 |  898 ++--
 PuReMD-GPU/src/bond_orders.cu              | 3104 ++++++------
 PuReMD-GPU/src/box.cu                      |  746 +--
 PuReMD-GPU/src/center_mass.cu              |  436 +-
 PuReMD-GPU/src/cuda_copy.cu                |  224 +-
 PuReMD-GPU/src/cuda_init.cu                |  412 +-
 PuReMD-GPU/src/cuda_utils.cu               |  156 +-
 PuReMD-GPU/src/forces.cu                   | 5168 ++++++++++----------
 PuReMD-GPU/src/four_body_interactions.cu   | 2538 +++++-----
 PuReMD-GPU/src/grid.cu                     |  698 +--
 PuReMD-GPU/src/helpers.cu                  |   14 +-
 PuReMD-GPU/src/init_md.cu                  | 2202 ++++-----
 PuReMD-GPU/src/integrate.cu                | 1804 +++----
 PuReMD-GPU/src/list.cu                     |  404 +-
 PuReMD-GPU/src/lookup.cu                   | 1424 +++---
 PuReMD-GPU/src/matvec.cu                   |  102 +-
 PuReMD-GPU/src/neighbors.cu                | 2602 +++++-----
 PuReMD-GPU/src/reduction.cu                |  338 +-
 PuReMD-GPU/src/reset_utils.cu              |  290 +-
 PuReMD-GPU/src/single_body_interactions.cu | 1524 +++---
 PuReMD-GPU/src/system_props.cu             | 1130 ++---
 PuReMD-GPU/src/testmd.cu                   |  570 +--
 PuReMD-GPU/src/three_body_interactions.cu  | 4340 ++++++++--------
 PuReMD-GPU/src/traj.cu                     |  818 ++--
 PuReMD-GPU/src/two_body_interactions.cu    | 2700 +++++-----
 PuReMD-GPU/src/validation.cu               | 3704 +++++++-------
 PuReMD-GPU/src/vector.cu                   |  294 +-
 57 files changed, 31106 insertions(+), 31091 deletions(-)

diff --git a/PG-PuReMD/src/center_mass.cu b/PG-PuReMD/src/center_mass.cu
index 16d34141..725cafbb 100644
--- a/PG-PuReMD/src/center_mass.cu
+++ b/PG-PuReMD/src/center_mass.cu
@@ -3,549 +3,549 @@
 #include "cuda_shuffle.h"
 
 CUDA_GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-		rvec *res_xcm, 
-		rvec *res_vcm, 
-		rvec *res_amcm, 
-		size_t n)
+        rvec *res_xcm, 
+        rvec *res_vcm, 
+        rvec *res_amcm, 
+        size_t n)
 {
-	extern __shared__ rvec xcm[];
-	extern __shared__ rvec vcm[];
-	extern __shared__ rvec amcm[];
-
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	//unsigned int xcm_id = threadIdx.x;
-	unsigned int vcm_id = blockDim.x;
-	unsigned int amcm_id = 2 *(blockDim.x);
-
-	unsigned int index = 0;
-	rvec tmp;
-	real m;
-
-	rvec_MakeZero (xcm [threadIdx.x]);
-	rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
-	rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
-	rvec_MakeZero (tmp);
-
-	if (i < n){
-		m = sbp [ atoms[i].type ].mass;
-		rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
-		rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
-		rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-		rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
-	}
-	__syncthreads ();
-
-	for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-		if ((threadIdx.x < offset)) {
-			index = threadIdx.x + offset;
-			rvec_Add (xcm [threadIdx.x], xcm[index]);
-			rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
-			rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
-		} 
-		__syncthreads ();
-	}
-
-	if ((threadIdx.x == 0)){
-		rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
-		rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
-		rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
-	}
+    extern __shared__ rvec xcm[];
+    extern __shared__ rvec vcm[];
+    extern __shared__ rvec amcm[];
+    // Indexed below as three consecutive slices of blockDim.x rvecs each: xcm, then vcm, then amcm.
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    // i is this thread's global atom index; vcm_id / amcm_id are the slice offsets.
+    //unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 *(blockDim.x);
+    // index = reduction partner slot, tmp = cross-product scratch, m = mass looked up by atom type.
+    unsigned int index = 0;
+    rvec tmp;
+    real m;
+    // Clear this thread's three slots first; only threads with i < n add to them.
+    rvec_MakeZero (xcm [threadIdx.x]);
+    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
+    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
+    rvec_MakeZero (tmp);
+    // Per-atom terms: m * x, m * v and m * (x cross v).
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
+        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
+    }
+    __syncthreads ();
+    // Pairwise reduction, halving the active range each pass (presumably expects a power-of-two blockDim.x -- TODO confirm at the launch site).
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (xcm [threadIdx.x], xcm[index]);
+            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
+            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+    // Thread 0 publishes this block's three partial sums, one output entry per block.
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
+    }
 }
 
 #if defined( __SM_35__)
 CUDA_GLOBAL void center_of_mass_blocks_xcm (single_body_parameters *sbp, reax_atom *atoms,
-		rvec *res_xcm,
-		size_t n)
+        rvec *res_xcm,
+        size_t n)
 {
-	extern __shared__ rvec my_xcm[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int xcm_id = threadIdx.x;
-	unsigned int index = 0;
-	rvec xcm;
-	real m;
-
-	rvec_MakeZero (xcm);
-
-	if (i < n){
-		m = sbp [ atoms[i].type ].mass;
-		rvec_ScaledAdd (xcm , m, atoms [i].x);
-	}
-	__syncthreads ();
-
-	for (int z = 16; z >= 1; z /= 2){
-		xcm[0] += shfl( xcm[0], z);
-		xcm[1] += shfl( xcm[1], z);
-		xcm[2] += shfl( xcm[2], z);
-	}
-	__syncthreads ();
-
-	if (threadIdx.x % 32 == 0)
-		rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
-	__syncthreads ();
-
-	for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-		if ((threadIdx.x < offset)) {
-			index = threadIdx.x + offset;
-			rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
-		}
-		__syncthreads ();
-	}
-
-	if ((threadIdx.x == 0))
-		rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
+    extern __shared__ rvec my_xcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int xcm_id = threadIdx.x; // not referenced again in this kernel
+    unsigned int index = 0;
+    rvec xcm;
+    real m;
+    // xcm is this thread's private accumulator; it is only added to when i < n.
+    rvec_MakeZero (xcm);
+    // Mass-weighted position of atom i.
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm , m, atoms [i].x);
+    }
+    __syncthreads ();
+    // Combine per component via shfl() (from cuda_shuffle.h; presumably a warp shuffle-down -- TODO confirm) at lane distances 16, 8, 4, 2, 1.
+    for (int z = 16; z >= 1; z /= 2){
+        xcm[0] += shfl( xcm[0], z);
+        xcm[1] += shfl( xcm[1], z);
+        xcm[2] += shfl( xcm[2], z);
+    }
+    __syncthreads ();
+    // The first thread of every 32 stores its value in shared slot threadIdx.x / 32.
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_xcm[ threadIdx.x >> 5], xcm );
+    __syncthreads ();
+    // Pairwise reduction over the blockDim.x / 32 stored slots, starting at half that count.
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_xcm [threadIdx.x], my_xcm[index]);
+        }
+        __syncthreads ();
+    }
+    // Thread 0 writes the block total to this block's output entry.
+    if ((threadIdx.x == 0))
+        rvec_Copy (res_xcm[blockIdx.x], my_xcm[0]);
 }
 
 CUDA_GLOBAL void center_of_mass_blocks_vcm (single_body_parameters *sbp, reax_atom *atoms,
-		rvec *res_vcm,
-		size_t n)
+        rvec *res_vcm,
+        size_t n)
 {
-	extern __shared__ rvec my_vcm[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-	rvec vcm;
-	real m;
-
-	rvec_MakeZero (vcm);
-
-	if (i < n){
-		m = sbp [ atoms[i].type ].mass;
-		rvec_ScaledAdd (vcm , m, atoms [i].v);
-	}
-	__syncthreads ();
-
-	for (int z = 16; z >= 1; z /= 2){
-		vcm[0] += shfl( vcm[0], z);
-		vcm[1] += shfl( vcm[1], z);
-		vcm[2] += shfl( vcm[2], z);
-	}
-	__syncthreads ();
-
-	if (threadIdx.x % 32 == 0)
-		rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
-	__syncthreads ();
-
-	for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-		if ((threadIdx.x < offset)) {
-			index = threadIdx.x + offset;
-			rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
-		}
-		__syncthreads ();
-	}
-
-	if ((threadIdx.x == 0))
-		rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
+    extern __shared__ rvec my_vcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    rvec vcm;
+    real m;
+    // vcm is this thread's private accumulator; it is only added to when i < n.
+    rvec_MakeZero (vcm);
+    // Mass-weighted velocity of atom i.
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (vcm , m, atoms [i].v);
+    }
+    __syncthreads ();
+    // Combine per component via shfl() (from cuda_shuffle.h; presumably a warp shuffle-down -- TODO confirm) at lane distances 16, 8, 4, 2, 1.
+    for (int z = 16; z >= 1; z /= 2){
+        vcm[0] += shfl( vcm[0], z);
+        vcm[1] += shfl( vcm[1], z);
+        vcm[2] += shfl( vcm[2], z);
+    }
+    __syncthreads ();
+    // The first thread of every 32 stores its value in shared slot threadIdx.x / 32.
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_vcm[ threadIdx.x >> 5], vcm );
+    __syncthreads ();
+    // Pairwise reduction over the blockDim.x / 32 stored slots, starting at half that count.
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_vcm [threadIdx.x], my_vcm[index]);
+        }
+        __syncthreads ();
+    }
+    // Thread 0 writes the block total to this block's output entry.
+    if ((threadIdx.x == 0))
+        rvec_Copy (res_vcm[blockIdx.x], my_vcm[0]);
 }
 
 CUDA_GLOBAL void center_of_mass_blocks_amcm (single_body_parameters *sbp, reax_atom *atoms,
-		rvec *res_amcm,
-		size_t n)
+        rvec *res_amcm,
+        size_t n)
 {
-	extern __shared__ rvec my_amcm[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-	rvec amcm;
-	real m;
-	rvec tmp;
-
-	rvec_MakeZero (amcm);
-	rvec_MakeZero( tmp );
-
-	if (i < n){
-		m = sbp [ atoms[i].type ].mass;
-		rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-		rvec_ScaledAdd (amcm, m, tmp);
-	}
-	__syncthreads ();
-
-	for (int z = 16; z >= 1; z /= 2){
-		amcm[0] += shfl( amcm[0], z);
-		amcm[1] += shfl( amcm[1], z);
-		amcm[2] += shfl( amcm[2], z);
-	}
-	__syncthreads ();
-
-	if (threadIdx.x % 32 == 0)
-		rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
-	__syncthreads ();
-
-
-	for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
-
-		if ((threadIdx.x < offset)) {
-			index = threadIdx.x + offset;
-			rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
-		}
-		__syncthreads ();
-	}
-
-	if ((threadIdx.x == 0)){
-		rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
-	}
+    extern __shared__ rvec my_amcm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    rvec amcm;
+    real m;
+    rvec tmp;
+    // amcm is this thread's private accumulator; tmp holds the cross product of x and v.
+    rvec_MakeZero (amcm);
+    rvec_MakeZero( tmp );
+    // Per-atom term m * (x cross v), added only when i < n.
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm, m, tmp);
+    }
+    __syncthreads ();
+    // Combine per component via shfl() (from cuda_shuffle.h; presumably a warp shuffle-down -- TODO confirm) at lane distances 16, 8, 4, 2, 1.
+    for (int z = 16; z >= 1; z /= 2){
+        amcm[0] += shfl( amcm[0], z);
+        amcm[1] += shfl( amcm[1], z);
+        amcm[2] += shfl( amcm[2], z);
+    }
+    __syncthreads ();
+    // The first thread of every 32 stores its value in shared slot threadIdx.x / 32.
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_amcm[ threadIdx.x >> 5], amcm );
+    __syncthreads ();
+
+    // Pairwise reduction over the blockDim.x / 32 stored slots, starting at half that count.
+    for( int offset = blockDim.x >> 6; offset > 0; offset >>= 1 ) {
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (my_amcm[threadIdx.x], my_amcm[index]);
+        }
+        __syncthreads ();
+    }
+    // Thread 0 writes the block total to this block's output entry.
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_amcm[blockIdx.x], my_amcm[0]);
+    }
 }
 
 #endif
 
 
 CUDA_GLOBAL void center_of_mass (rvec *xcm, 
-		rvec *vcm, 
-		rvec *amcm, 
-		rvec *res_xcm,
-		rvec *res_vcm,
-		rvec *res_amcm,
-		size_t n)
+        rvec *vcm, 
+        rvec *amcm, 
+        rvec *res_xcm,
+        rvec *res_vcm,
+        rvec *res_amcm,
+        size_t n)
 {
-	extern __shared__ rvec sh_xcm[];
-	extern __shared__ rvec sh_vcm[];
-	extern __shared__ rvec sh_amcm[];
-
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	unsigned int xcm_id = threadIdx.x;
-	unsigned int vcm_id = blockDim.x;
-	unsigned int amcm_id = 2 * (blockDim.x);
-
-	unsigned int index = 0;
-	rvec t_xcm, t_vcm, t_amcm;
-
-	rvec_MakeZero (t_xcm);
-	rvec_MakeZero (t_vcm);
-	rvec_MakeZero (t_amcm);
-
-	if (i < n){
-		rvec_Copy ( t_xcm, xcm[threadIdx.x]);
-		rvec_Copy ( t_vcm, vcm[threadIdx.x]);
-		rvec_Copy ( t_amcm, amcm[threadIdx.x]);
-	}
-
-	rvec_Copy (sh_xcm[xcm_id], t_xcm);
-	rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
-	rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
-
-	__syncthreads ();
-
-	for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-		if (threadIdx.x < offset) {
-			index = threadIdx.x + offset;
-			rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
-			rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
-			rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
-		} 
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0){
-		rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
-		rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
-		rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
-	}
+    extern __shared__ rvec sh_xcm[];
+    extern __shared__ rvec sh_vcm[];
+    extern __shared__ rvec sh_amcm[];
+    // Sums n entries of xcm / vcm / amcm; shared memory is indexed as three blockDim.x-entry slices.
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    // i = global entry index; xcm_id / vcm_id / amcm_id locate this thread's three shared slots.
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 * (blockDim.x);
+    // index = reduction partner slot; t_* are only loaded when i < n.
+    unsigned int index = 0;
+    rvec t_xcm, t_vcm, t_amcm;
+
+    rvec_MakeZero (t_xcm);
+    rvec_MakeZero (t_vcm);
+    rvec_MakeZero (t_amcm);
+    // Load with the same index the bounds check uses (was threadIdx.x, which equals i only in block 0).
+    if (i < n){
+        rvec_Copy ( t_xcm, xcm[i]);
+        rvec_Copy ( t_vcm, vcm[i]);
+        rvec_Copy ( t_amcm, amcm[i]);
+    }
+
+    rvec_Copy (sh_xcm[xcm_id], t_xcm);
+    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
+    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
+
+    __syncthreads ();
+    // Pairwise reduction, halving the active range each pass (presumably expects a power-of-two blockDim.x -- TODO confirm at the launch site).
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if (threadIdx.x < offset) {
+            index = threadIdx.x + offset;
+            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
+            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
+            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+    // Thread 0 publishes this block's three sums, one output entry per block.
+    if (threadIdx.x == 0){
+        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
+    }
 }
 
 CUDA_GLOBAL void compute_center_mass (single_body_parameters *sbp, 
-		reax_atom *atoms,
-		real *results, 
-		real xcm0, real xcm1, real xcm2,
-		size_t n)
+        reax_atom *atoms,
+        real *results, 
+        real xcm0, real xcm1, real xcm2,
+        size_t n)
 {
-	extern __shared__ real xx[];
-	extern __shared__ real xy[];
-	extern __shared__ real xz[];
-	extern __shared__ real yy[];
-	extern __shared__ real yz[];
-	extern __shared__ real zz[];
-
-	unsigned int xx_i = threadIdx.x;
-	unsigned int xy_i = blockDim.x;
-	unsigned int xz_i = 2 * blockDim.x;
-	unsigned int yy_i = 3 * blockDim.x;
-	unsigned int yz_i = 4 * blockDim.x;
-	unsigned int zz_i = 5 * blockDim.x;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-
-	rvec diff, xcm;
-	real m = 0;
-	rvec_MakeZero (diff);
-	xcm[0] = xcm0;
-	xcm[1] = xcm1;
-	xcm[2] = xcm2;
-
-
-	xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-		yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-	if (i < n){
-		m = sbp[ atoms[i].type ].mass;
-		rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-		xx[ xx_i ] = diff[0] * diff[0] * m;
-		xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
-		xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
-		yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
-		yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
-		zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset){
-			index = threadIdx.x + offset;
-			xx[ threadIdx.x ] += xx[ index ];
-			xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-			xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-			yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-			yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-			zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0) {
-		results [ blockIdx.x*6 ] = xx [ 0 ];
-		results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
-		results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
-		results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
-		results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
-		results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
-	}
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+    // Indexed below as six consecutive blockDim.x-entry slices; xx_i is this thread's slot, the other *_i are slice offsets.
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    // diff = atom position minus xcm; xcm is rebuilt from the three scalar arguments.
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    // Zero this thread's six slots so threads with i >= n contribute nothing.
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+    // Per-atom mass-weighted products of the components of diff.
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx[ xx_i ] = diff[0] * diff[0] * m;
+        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
+        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
+        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
+        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
+        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
+    }
+    __syncthreads ();
+    // Pairwise reduction of all six slices, halving the active range each pass.
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            xx[ threadIdx.x ] += xx[ index ];
+            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+    // Thread 0 writes this block's six sums as one 6-value record at results[blockIdx.x * 6].
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
+        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
+        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
+        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
+        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
+    }
 }
 
 CUDA_GLOBAL void compute_center_mass (real *input, real *output, size_t n)
 {
-	extern __shared__ real xx[];
-	extern __shared__ real xy[];
-	extern __shared__ real xz[];
-	extern __shared__ real yy[];
-	extern __shared__ real yz[];
-	extern __shared__ real zz[];
-
-	unsigned int xx_i = threadIdx.x;
-	unsigned int xy_i = blockDim.x;
-	unsigned int xz_i = 2 * blockDim.x;
-	unsigned int yy_i = 3 * blockDim.x;
-	unsigned int yz_i = 4 * blockDim.x;
-	unsigned int zz_i = 5 * blockDim.x;
-
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-
-	xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-		yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-	if (i < n)
-	{
-		xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
-		xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
-		xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
-		yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
-		yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
-		zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if (threadIdx.x < offset )
-		{
-			index = threadIdx.x + offset;
-			xx [ threadIdx.x ] += xx [ index ];
-			xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-			xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-			yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-			yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-			zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0)
-	{
-		output[0] = xx[0];
-		output[1] = xy[xy_i];
-		output[2] = xz[xz_i];
-		output[3] = xz[yy_i];
-		output[4] = xz[yz_i];
-		output[5] = xz[zz_i];
-	}
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+    // Indexed below as six consecutive blockDim.x-entry slices; xx_i is this thread's slot, the other *_i are slice offsets.
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    // Zero this thread's six slots so threads with i >= n contribute nothing.
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+    // Each input record holds six values (xx, xy, xz, yy, yz, zz); records are read by thread index.
+    if (i < n)
+    {
+        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
+        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
+        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
+        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
+        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
+        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
+    }
+    __syncthreads ();
+    // Pairwise reduction of all six slices, halving the active range each pass.
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset )
+        {
+            index = threadIdx.x + offset;
+            xx [ threadIdx.x ] += xx [ index ];
+            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+    // Every block's thread 0 writes output[0..5], so this presumably runs as a single block -- TODO confirm at the launch site.
+    if (threadIdx.x == 0)
+    {
+        output[0] = xx[0];
+        output[1] = xy[xy_i];
+        output[2] = xz[xz_i];
+        output[3] = yy[yy_i]; // was xz[yy_i]: same address (the arrays alias), now named for what it holds
+        output[4] = yz[yz_i]; // was xz[yz_i]
+        output[5] = zz[zz_i]; // was xz[zz_i]
+    }
 }
 
 #if defined( __SM_35__)
 
 CUDA_GLOBAL void compute_center_mass_xx_xy (single_body_parameters *sbp,
-		reax_atom *atoms,
-		real *results,
-		real xcm0, real xcm1, real xcm2,
-		size_t n)
+        reax_atom *atoms,
+        real *results,
+        real xcm0, real xcm1, real xcm2,
+        size_t n)
 {
-	extern __shared__ real my_results_xx[];
-	extern __shared__ real my_results_xy[];
-
-	unsigned int xx_i = threadIdx.x;
-	unsigned int xy_i = blockDim.x;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-	real xx = 0;
-	real xy = 0;
-
-	rvec diff, xcm;
-	real m = 0;
-	rvec_MakeZero (diff);
-	xcm[0] = xcm0;
-	xcm[1] = xcm1;
-	xcm[2] = xcm2;
-
-
-	if (i < n){
-		m = sbp[ atoms[i].type ].mass;
-		rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-		xx = diff[0] * diff[0] * m;
-		xy = diff[0] * diff[1] * m;
-	}
-	__syncthreads ();
-
-	for (int z = 16; z <= 1; z++){
-		xx += shfl( xx, z);
-		xy += shfl( xy, z);
-	}
-	__syncthreads ();
-
-	if (threadIdx.x % 32 == 0){
-		my_results_xx[threadIdx.x >> 5] = xx;	
-		my_results_xy[threadIdx.x >> 5] = xy;	
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset){
-			index = threadIdx.x + offset;
-			my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
-			my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0) {
-		results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
-		results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
-	}
+    extern __shared__ real my_results_xx[];
+    extern __shared__ real my_results_xy[];
+    // Both names index one dynamic shared buffer: xx partials start at 0, xy partials start at xy_i.
+    unsigned int xx_i = threadIdx.x; // not referenced again in this kernel
+    unsigned int xy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xx = 0;
+    real xy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    // Per-atom terms m*d0*d0 and m*d0*d1 with d = atom position - xcm; threads with i >= n keep 0.
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx = diff[0] * diff[0] * m;
+        xy = diff[0] * diff[1] * m;
+    }
+    __syncthreads ();
+    // Combine via shfl() at lane distances 16..1, as the center_of_mass_blocks_* kernels do; the old "z <= 1; z++" loop never ran.
+    for (int z = 16; z >= 1; z /= 2){
+        xx += shfl( xx, z);
+        xy += shfl( xy, z);
+    }
+    __syncthreads ();
+    // First thread of every 32 stores both values; xy goes to its own slice so it no longer overwrites xx.
+    if (threadIdx.x % 32 == 0){
+        my_results_xx[threadIdx.x >> 5] = xx;    
+        my_results_xy[xy_i + (threadIdx.x >> 5)] = xy;    
+    }
+    __syncthreads ();
+    // Pairwise reduction over the blockDim.x / 32 stored slots of each slice.
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xx[ threadIdx.x ] += my_results_xx[ index ];
+            my_results_xy[ xy_i + threadIdx.x ] += my_results_xy [ xy_i + index ];
+        }
+        __syncthreads ();
+    }
+    // Thread 0 fills entries 0 and 1 of this block's 6-value record.
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = my_results_xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = my_results_xy [ xy_i + 0 ];
+    }
 }
 
 CUDA_GLOBAL void compute_center_mass_xz_yy (single_body_parameters *sbp,
-		reax_atom *atoms,
-		real *results,
-		real xcm0, real xcm1, real xcm2,
-		size_t n)
+        reax_atom *atoms,
+        real *results,
+        real xcm0, real xcm1, real xcm2,
+        size_t n)
 {
-	extern __shared__ real my_results_xz[];
-	extern __shared__ real my_results_yy[];
-
-	unsigned int yy_i = blockDim.x;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-	real xz = 0;
-	real yy = 0;
-
-	rvec diff, xcm;
-	real m = 0;
-	rvec_MakeZero (diff);
-	xcm[0] = xcm0;
-	xcm[1] = xcm1;
-	xcm[2] = xcm2;
-
-	if (i < n){
-		m = sbp[ atoms[i].type ].mass;
-		rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-		xz = diff[0] * diff[2] * m;
-		yy = diff[1] * diff[1] * m;
-	}
-	__syncthreads ();
-
-	for (int z = 16; z <= 1; z++){
-		xz += shfl( xz, z);
-		yy += shfl( yy, z);
-	}
-	__syncthreads ();
-
-	if (threadIdx.x % 32 == 0){
-		my_results_xz[threadIdx.x >> 5] = xz;	
-		my_results_yy[threadIdx.x >> 5] = yy;	
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset){
-			index = threadIdx.x + offset;
-			my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
-			my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0) {
-		results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
-		results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
-	}
+    extern __shared__ real my_results_xz[];
+    extern __shared__ real my_results_yy[];
+    // Both names index one dynamic shared buffer: xz partials start at 0, yy partials start at yy_i.
+    unsigned int yy_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real xz = 0;
+    real yy = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+    // Per-atom terms m*d0*d2 and m*d1*d1 with d = atom position - xcm; threads with i >= n keep 0.
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xz = diff[0] * diff[2] * m;
+        yy = diff[1] * diff[1] * m;
+    }
+    __syncthreads ();
+    // Combine via shfl() at lane distances 16..1, as the center_of_mass_blocks_* kernels do; the old "z <= 1; z++" loop never ran.
+    for (int z = 16; z >= 1; z /= 2){
+        xz += shfl( xz, z);
+        yy += shfl( yy, z);
+    }
+    __syncthreads ();
+    // First thread of every 32 stores both values; yy goes to its own slice so it no longer overwrites xz.
+    if (threadIdx.x % 32 == 0){
+        my_results_xz[threadIdx.x >> 5] = xz;    
+        my_results_yy[yy_i + (threadIdx.x >> 5)] = yy;    
+    }
+    __syncthreads ();
+    // Pairwise reduction over the blockDim.x / 32 stored slots of each slice.
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_xz[ threadIdx.x ] += my_results_xz [ index ];
+            my_results_yy[ yy_i + threadIdx.x ] += my_results_yy [ yy_i + index ];
+        }
+        __syncthreads ();
+    }
+    // Thread 0 fills entries 2 and 3 of this block's 6-value record.
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 2 ] = my_results_xz [ 0 ];
+        results [ blockIdx.x*6 + 3 ] = my_results_yy [ yy_i + 0 ];
+    }
 }
 
 CUDA_GLOBAL void compute_center_mass_yz_zz (single_body_parameters *sbp,
-		reax_atom *atoms,
-		real *results,
-		real xcm0, real xcm1, real xcm2,
-		size_t n)
+        reax_atom *atoms,
+        real *results,
+        real xcm0, real xcm1, real xcm2,
+        size_t n)
 {
-	extern __shared__ real my_results_yz[];
-	extern __shared__ real my_results_zz[];
-
-	unsigned int zz_i = blockDim.x;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-	real yz = 0;
-	real zz = 0;
-
-	rvec diff, xcm;
-	real m = 0;
-	rvec_MakeZero (diff);
-	xcm[0] = xcm0;
-	xcm[1] = xcm1;
-	xcm[2] = xcm2;
-
-
-	if (i < n){
-		m = sbp[ atoms[i].type ].mass;
-		rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-		yz = diff[1] * diff[2] * m;
-		zz = diff[2] * diff[2] * m;
-	}
-	__syncthreads ();
-
-	for (int z = 16; z <= 1; z++){
-		yz += shfl( yz, z);
-		zz += shfl( zz, z);
-	}
-	__syncthreads ();
-
-	if (threadIdx.x % 32 == 0){
-		my_results_yz[threadIdx.x >> 5] = yz;	
-		my_results_zz[threadIdx.x >> 5] = zz;	
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset){
-			index = threadIdx.x + offset;
-			my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
-			my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0) {
-		results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
-		results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
-	}
+    extern __shared__ real my_results_yz[];
+    extern __shared__ real my_results_zz[];
+    // Both names index one dynamic shared buffer: yz partials start at 0, zz partials start at zz_i.
+    unsigned int zz_i = blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+    real yz = 0;
+    real zz = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+    // Per-atom terms m*d1*d2 and m*d2*d2 with d = atom position - xcm; threads with i >= n keep 0.
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        yz = diff[1] * diff[2] * m;
+        zz = diff[2] * diff[2] * m;
+    }
+    __syncthreads ();
+    // Combine via shfl() at lane distances 16..1, as the center_of_mass_blocks_* kernels do; the old "z <= 1; z++" loop never ran.
+    for (int z = 16; z >= 1; z /= 2){
+        yz += shfl( yz, z);
+        zz += shfl( zz, z);
+    }
+    __syncthreads ();
+    // First thread of every 32 stores both values; zz goes to its own slice so it no longer overwrites yz.
+    if (threadIdx.x % 32 == 0){
+        my_results_yz[threadIdx.x >> 5] = yz;    
+        my_results_zz[zz_i + (threadIdx.x >> 5)] = zz;    
+    }
+    __syncthreads ();
+    // Pairwise reduction over the blockDim.x / 32 stored slots of each slice.
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            my_results_yz[ threadIdx.x ] += my_results_yz [ index ];
+            my_results_zz[ zz_i + threadIdx.x ] += my_results_zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+    // Thread 0 fills entries 4 and 5 of this block's 6-value record.
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 + 4 ] = my_results_yz [ 0 ];
+        results [ blockIdx.x*6 + 5 ] = my_results_zz [ zz_i + 0 ];
+    }
 }
 
 #endif
diff --git a/PG-PuReMD/src/cuda_bond_orders.cu b/PG-PuReMD/src/cuda_bond_orders.cu
index 3a208f44..05257c94 100644
--- a/PG-PuReMD/src/cuda_bond_orders.cu
+++ b/PG-PuReMD/src/cuda_bond_orders.cu
@@ -8,813 +8,813 @@
 #include "reduction.h"
 
 CUDA_GLOBAL void Cuda_Calculate_BO_init (  reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		storage p_workspace, 
-		int N )
+        single_body_parameters *sbp, 
+        storage p_workspace, 
+        int N )
 {
-	int i, type_i;
-	single_body_parameters *sbp_i;
+    int i, type_i;
+    single_body_parameters *sbp_i;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    i = blockIdx.x * blockDim.x + threadIdx.x; /* flat 1D global thread index, used as the atom index */
+    if (i >= N) return; /* threads past the last of the N entries have nothing to do */
 
-	storage *workspace = & (p_workspace);
+    storage *workspace = & (p_workspace); /* alias the by-value argument so the body can use -> */
 
-	/* Calculate Deltaprime, Deltaprime_boc values */
-	type_i = my_atoms[i].type;
-	sbp_i = &(sbp[type_i]);
-	workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-	workspace->Deltap_boc[i] = 
-		workspace->total_bond_order[i] - sbp_i->valency_val;
-	workspace->total_bond_order[i] = 0; 
+    /* Deltap[i] and Deltap_boc[i]: total_bond_order[i] minus the valency / valency_val of atom i's type */
+    type_i = my_atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
+    workspace->Deltap_boc[i] = 
+        workspace->total_bond_order[i] - sbp_i->valency_val;
+    workspace->total_bond_order[i] = 0; /* zero the accumulator once it has been consumed above */
 }
 
 CUDA_GLOBAL void Cuda_Calculate_BO (  reax_atom *my_atoms, global_parameters gp, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		storage p_workspace, reax_list p_bonds, 
-		int num_atom_types, int N )
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        storage p_workspace, reax_list p_bonds, 
+        int num_atom_types, int N )
 {
-	int i, j, pj, type_i, type_j;
-	int start_i, end_i, sym_index, num_bonds;
-	real val_i, Deltap_i, Deltap_boc_i;
-	real val_j, Deltap_j, Deltap_boc_j;
-	real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
-	real exp_p1i,   exp_p2i, exp_p1j, exp_p2j;
-	real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-	real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji
-	real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-	real explp1, p_boc1, p_boc2;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_order_data *bo_ij, *bo_ji;
-
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	storage *workspace = & (p_workspace);
-	reax_list *bonds = &(p_bonds);
-
-	num_bonds = 0; 
-	p_boc1 = gp.l[0];
-	p_boc2 = gp.l[1];
-
-	/* Calculate Deltaprime, Deltaprime_boc values */
-	/*
-	//for( i = 0; i < system->N; ++i ) {
-	type_i = my_atoms[i].type;
-	sbp_i = &(sbp[type_i]);
-	workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-	workspace->Deltap_boc[i] = 
-	workspace->total_bond_order[i] - sbp_i->valency_val;
-
-	//fprintf( stdout, "%d(%d) %24.15f\n", 
-	//     i, workspace->bond_mark[i], workspace->total_bond_order[i] );
-	workspace->total_bond_order[i] = 0; 
-	//}
-	 */
-
-	/* Corrected Bond Order calculations */
-	//for( i = 0; i < system->N; ++i ) {
-	type_i = my_atoms[i].type;
-	sbp_i = &(sbp[type_i]);
-	val_i = sbp_i->valency;
-	Deltap_i = workspace->Deltap[i];
-	Deltap_boc_i = workspace->Deltap_boc[i];
-	start_i = Dev_Start_Index(i, bonds);
-	end_i = Dev_End_Index(i, bonds);
-	// fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
-	//       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
-	for( pj = start_i; pj < end_i; ++pj ) {
-		j = bonds->select.bond_list[pj].nbr;
-		type_j = my_atoms[j].type;
-		bo_ij = &( bonds->select.bond_list[pj].bo_data );
-		// fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
-
-		//TODO
-		//TODO
-		//TODO
-		//TODO
-		//TODO
-		//if( i < j || workspace->bond_mark[j] > 3 ) {
-		if( i < j ) {
-			twbp = &( tbp[ index_tbp (type_i, type_j, num_atom_types)] );
+    int i, j, pj, type_i, type_j;
+    int start_i, end_i, sym_index, num_bonds; // sym_index: only in commented-out code; num_bonds: set, never read
+    real val_i, Deltap_i, Deltap_boc_i;
+    real val_j, Deltap_j, Deltap_boc_j;
+    real f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+    real exp_p1i,   exp_p2i, exp_p1j, exp_p2j;
+    real temp, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+    real Cf45_ij, Cf45_ji, p_lp1; //u_ij, u_ji (p_lp1 is unused in this kernel)
+    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+    real explp1, p_boc1, p_boc2; // explp1 is unused in this kernel
+    single_body_parameters *sbp_i, *sbp_j; // sbp_j is never assigned or read here
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij, *bo_ji; // bo_ji appears only in commented-out code
+
+
+    i = blockIdx.x * blockDim.x + threadIdx.x; // flat 1D global thread index, used as the atom index
+    if (i >= N) return; // threads past the last of the N entries have nothing to do
+
+    storage *workspace = & (p_workspace); // alias the by-value arguments so the body can use ->
+    reax_list *bonds = &(p_bonds);
+
+    num_bonds = 0; 
+    p_boc1 = gp.l[0]; // p_boc1 / p_boc2 feed the overcoordination correction below
+    p_boc2 = gp.l[1];
+
+    /* Deltap / Deltap_boc are only read in this kernel; the code that computed them here is disabled: */
+    /*
+    //for( i = 0; i < system->N; ++i ) {
+    type_i = my_atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
+    workspace->Deltap_boc[i] = 
+    workspace->total_bond_order[i] - sbp_i->valency_val;
+
+    //fprintf( stdout, "%d(%d) %24.15f\n", 
+    //     i, workspace->bond_mark[i], workspace->total_bond_order[i] );
+    workspace->total_bond_order[i] = 0; 
+    //}
+     */
+
+    /* Corrected bond orders for every bond-list entry (i, j) of atom i with i < j */
+    //for( i = 0; i < system->N; ++i ) {
+    type_i = my_atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    val_i = sbp_i->valency;
+    Deltap_i = workspace->Deltap[i];
+    Deltap_boc_i = workspace->Deltap_boc[i];
+    start_i = Dev_Start_Index(i, bonds);
+    end_i = Dev_End_Index(i, bonds);
+    // fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
+    //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
+    for( pj = start_i; pj < end_i; ++pj ) {
+        j = bonds->select.bond_list[pj].nbr;
+        type_j = my_atoms[j].type;
+        bo_ij = &( bonds->select.bond_list[pj].bo_data );
+        // fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
+
+        //TODO
+        //TODO
+        //TODO
+        //TODO
+        //TODO (review): the bond_mark variant of the test is disabled below -- confirm whether it must be restored
+        //if( i < j || workspace->bond_mark[j] > 3 ) {
+        if( i < j ) { // entries with i >= j are left untouched by this kernel
+            twbp = &( tbp[ index_tbp (type_i, type_j, num_atom_types)] );
 
 #ifdef TEST_FORCES
-			Set_Start_Index( pj, top_dbo, dBOs );
-			/* fprintf( stderr, "%6d%6d%12.6f%12.6f%12.6f\n", 
-			   workspace->reverse_map[i], workspace->reverse_map[j],
-			   twbp->ovc, twbp->v13cor, bo_ij->BO ); */
+            Set_Start_Index( pj, top_dbo, dBOs ); /* NOTE(review): top_dbo, dBOs, pdbo are not declared in this kernel -- confirm TEST_FORCES still builds */
+            /* fprintf( stderr, "%6d%6d%12.6f%12.6f%12.6f\n", 
+               workspace->reverse_map[i], workspace->reverse_map[j],
+               twbp->ovc, twbp->v13cor, bo_ij->BO ); */
 #endif
 
-			if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
-				/* There is no correction to bond orders nor to derivatives
-				   of bond order prime! So we leave bond orders unchanged and
-				   set derivative of bond order coefficients such that 
-				   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
-				bo_ij->C1dbo = 1.000000;
-				bo_ij->C2dbo = 0.000000;
-				bo_ij->C3dbo = 0.000000;
-
-				bo_ij->C1dbopi = bo_ij->BO_pi;
-				bo_ij->C2dbopi = 0.000000;
-				bo_ij->C3dbopi = 0.000000;
-				bo_ij->C4dbopi = 0.000000;
-
-				bo_ij->C1dbopi2 = bo_ij->BO_pi2;
-				bo_ij->C2dbopi2 = 0.000000;
-				bo_ij->C3dbopi2 = 0.000000;
-				bo_ij->C4dbopi2 = 0.000000;
+            if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
+                /* There is no correction to bond orders nor to derivatives
+                   of bond order prime! So we leave bond orders unchanged and
+                   set derivative of bond order coefficients such that 
+                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
+                bo_ij->C1dbo = 1.000000;
+                bo_ij->C2dbo = 0.000000;
+                bo_ij->C3dbo = 0.000000;
+
+                bo_ij->C1dbopi = bo_ij->BO_pi;
+                bo_ij->C2dbopi = 0.000000;
+                bo_ij->C3dbopi = 0.000000;
+                bo_ij->C4dbopi = 0.000000;
+
+                bo_ij->C1dbopi2 = bo_ij->BO_pi2;
+                bo_ij->C2dbopi2 = 0.000000;
+                bo_ij->C3dbopi2 = 0.000000;
+                bo_ij->C4dbopi2 = 0.000000;
 
 #ifdef TEST_FORCES
-				pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
-
-				// compute dBO_ij/dr_i
-				pdbo->wrt = i;
-				rvec_Copy( pdbo->dBO, bo_ij->dBOp );
-				rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-				rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
-
-				// compute dBO_ij/dr_j
-				pdbo++;
-				pdbo->wrt = j;
-				rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp );
-				rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-				rvec_Scale(pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
-
-				top_dbo += 2;
+                pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
+
+                // compute dBO_ij/dr_i
+                pdbo->wrt = i;
+                rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
+
+                // compute dBO_ij/dr_j
+                pdbo++;
+                pdbo->wrt = j;
+                rvec_Scale( pdbo->dBO, -1.0, bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi, -bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                rvec_Scale(pdbo->dBOpi2, -bo_ij->BO_pi2, bo_ij->dln_BOp_pi2);
+
+                top_dbo += 2;
 #endif
-			}
-			else {
-				val_j = sbp[type_j].valency;
-				Deltap_j = workspace->Deltap[j];
-				Deltap_boc_j = workspace->Deltap_boc[j];
-
-				/* on page 1 */
-				if( twbp->ovc >= 0.001 ) {
-					/* Correction for overcoordination */
-					exp_p1i = EXP( -p_boc1 * Deltap_i );
-					exp_p2i = EXP( -p_boc2 * Deltap_i );
-					exp_p1j = EXP( -p_boc1 * Deltap_j );
-					exp_p2j = EXP( -p_boc2 * Deltap_j );
-
-					f2 = exp_p1i + exp_p1j;
-					f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
-					f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) +
-							( val_j + f2 )/( val_j + f2 + f3 ) );
-
-
-					/*fprintf( stderr,"%d %d\t%g %g   j:%g %g  p_boc:%g %g\n"
-					  "\tf:%g  %g  %g, exp:%g %g %g %g\n", 
-					  i+1, j+1, 
-					  val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2,
-					  f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
-
-					/* Now come the derivates */
-					/* Bond Order pages 5-7, derivative of f1 */
-					temp = f2 + f3;
-					u1_ij = val_i + temp;
-					u1_ji = val_j + temp;
-					Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) +
-							1.0 / SQR( u1_ji ));
-					Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
-							( u1_ji - f3 ) / SQR( u1_ji ));
-
-					//Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
-					//          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
-					Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
-							((val_i+f2) / SQR(u1_ij)) *
-							( -p_boc1 * exp_p1i +
-							  exp_p2i / ( exp_p2i + exp_p2j ) ) +
-							-p_boc1 * exp_p1i / u1_ji -
-							((val_j+f2) / SQR(u1_ji)) *
-							( -p_boc1 * exp_p1i +
-							  exp_p2i / ( exp_p2i + exp_p2j ) ));
-
-
-					Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
-						Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
-
-					//fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
-				}
-				else {
-					/* No overcoordination correction! */
-					f1 = 1.0;
-					Cf1_ij = Cf1_ji = 0.0;
-				}
-
-				if( twbp->v13cor >= 0.001 ) {
-					/* Correction for 1-3 bond orders */
-					exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
-								Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
-					exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
-								Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
-
-					f4 = 1. / (1. + exp_f4);
-					f5 = 1. / (1. + exp_f5);
-					f4f5 = f4 * f5;
-
-					/* Bond Order pages 8-9, derivative of f4 and f5 */
-					/*temp = twbp->p_boc5 - 
-					  twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-					  u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-					  u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-					  Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-					  Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
-					Cf45_ij = -f4 * exp_f4;
-					Cf45_ji = -f5 * exp_f5;
-				}
-				else {
-					f4 = f5 = f4f5 = 1.0;
-					Cf45_ij = Cf45_ji = 0.0;
-				}
-
-				/* Bond Order page 10, derivative of total bond order */
-				A0_ij = f1 * f4f5;
-				A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
-					(Cf45_ij + Cf45_ji);
-				A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
-				A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
-				A3_ij = A2_ij + Cf1_ij / f1;
-				A3_ji = A2_ji + Cf1_ji / f1;
-
-				/*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f" 
-				  "A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
-				  bo_ij->BO, 
-				  A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
-
-
-				/* find corrected bond orders and their derivative coef */
-				bo_ij->BO    = bo_ij->BO    * A0_ij;
-				bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
-				bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
-				bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
-
-				bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
-				bo_ij->C2dbo = bo_ij->BO * A2_ij;
-				bo_ij->C3dbo = bo_ij->BO * A2_ji;
-
-				bo_ij->C1dbopi = f1*f1*f4*f5;
-				bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
-				bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
-				bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
-
-				bo_ij->C1dbopi2 = f1*f1*f4*f5;
-				bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
-				bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
-				bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
-
-				//CHANGE ORIGINAL
-			}
-			//CHANGE ORIGINAL
-
-			/* neglect bonds that are < 1e-10 */
-			if( bo_ij->BO < 1e-10 )
-				bo_ij->BO = 0.0;
-			if( bo_ij->BO_s < 1e-10 )
-				bo_ij->BO_s = 0.0;
-			if( bo_ij->BO_pi < 1e-10 )
-				bo_ij->BO_pi = 0.0;
-			if( bo_ij->BO_pi2 < 1e-10 )
-				bo_ij->BO_pi2 = 0.0;
-
-			workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO
-
-
-			/* fprintf( stderr, "%d %d\t%g %g %g %g\n"
-			   "Cdbo:\t%g %g %g\n"
-			   "Cdbopi:\t%g %g %g %g\n"
-			   "Cdbopi2:%g %g %g %g\n\n", 
-			   i+1, j+1, 
-			   bonds->select.bond_list[ pj ].d, 
-			   bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, 
-			   bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
-			   bo_ij->C1dbopi, bo_ij->C2dbopi, 
-			   bo_ij->C3dbopi, bo_ij->C4dbopi,
-			   bo_ij->C1dbopi2,bo_ij->C2dbopi2, 
-			   bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
-
-			/* fprintf( stderr, "%d %d  BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
-			   i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 );*/
+            }
+            else {
+                val_j = sbp[type_j].valency;
+                Deltap_j = workspace->Deltap[j];
+                Deltap_boc_j = workspace->Deltap_boc[j];
+
+                /* on page 1 */
+                if( twbp->ovc >= 0.001 ) {
+                    /* Correction for overcoordination */
+                    exp_p1i = EXP( -p_boc1 * Deltap_i );
+                    exp_p2i = EXP( -p_boc2 * Deltap_i );
+                    exp_p1j = EXP( -p_boc1 * Deltap_j );
+                    exp_p2j = EXP( -p_boc2 * Deltap_j );
+
+                    f2 = exp_p1i + exp_p1j;
+                    f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+                    f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) +
+                            ( val_j + f2 )/( val_j + f2 + f3 ) );
+
+
+                    /*fprintf( stderr,"%d %d\t%g %g   j:%g %g  p_boc:%g %g\n"
+                      "\tf:%g  %g  %g, exp:%g %g %g %g\n", 
+                      i+1, j+1, 
+                      val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2,
+                      f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
+
+                    /* Now come the derivatives */
+                    /* Bond Order pages 5-7, derivative of f1 */
+                    temp = f2 + f3;
+                    u1_ij = val_i + temp;
+                    u1_ji = val_j + temp;
+                    Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) +
+                            1.0 / SQR( u1_ji ));
+                    Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) +
+                            ( u1_ji - f3 ) / SQR( u1_ji ));
+
+                    //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
+                    //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                    Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij -
+                            ((val_i+f2) / SQR(u1_ij)) *
+                            ( -p_boc1 * exp_p1i +
+                              exp_p2i / ( exp_p2i + exp_p2j ) ) +
+                            -p_boc1 * exp_p1i / u1_ji -
+                            ((val_j+f2) / SQR(u1_ji)) *
+                            ( -p_boc1 * exp_p1i +
+                              exp_p2i / ( exp_p2i + exp_p2j ) ));
+
+
+                    Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j +
+                        Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
+
+                    //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
+                }
+                else {
+                    /* No overcoordination correction! */
+                    f1 = 1.0;
+                    Cf1_ij = Cf1_ji = 0.0;
+                }
+
+                if( twbp->v13cor >= 0.001 ) {
+                    /* Correction for 1-3 bond orders */
+                    exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                    exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) -
+                                Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+
+                    f4 = 1. / (1. + exp_f4);
+                    f5 = 1. / (1. + exp_f5);
+                    f4f5 = f4 * f5;
+
+                    /* Bond Order pages 8-9, derivative of f4 and f5 */
+                    /*temp = twbp->p_boc5 - 
+                      twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
+                      u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
+                      u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
+                      Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
+                      Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
+                    Cf45_ij = -f4 * exp_f4;
+                    Cf45_ji = -f5 * exp_f5;
+                }
+                else {
+                    f4 = f5 = f4f5 = 1.0;
+                    Cf45_ij = Cf45_ji = 0.0;
+                }
+
+                /* Bond Order page 10, derivative of total bond order */
+                A0_ij = f1 * f4f5; /* factor applied to the uncorrected BO below */
+                A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO *
+                    (Cf45_ij + Cf45_ji);
+                A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                A3_ij = A2_ij + Cf1_ij / f1;
+                A3_ji = A2_ji + Cf1_ji / f1;
+
+                /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f" 
+                  "A2_ij: %f A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
+                  bo_ij->BO, 
+                  A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
+
+
+                /* find corrected bond orders and their derivative coef */
+                bo_ij->BO    = bo_ij->BO    * A0_ij;
+                bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1; /* net factor f1 * f1 * f4f5 */
+                bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1; /* net factor f1 * f1 * f4f5 */
+                bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 ); /* remainder after both pi terms */
+
+                bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                bo_ij->C3dbo = bo_ij->BO * A2_ji;
+
+                bo_ij->C1dbopi = f1*f1*f4*f5;
+                bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+
+                bo_ij->C1dbopi2 = f1*f1*f4*f5;
+                bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+
+                //CHANGE ORIGINAL
+            }
+            //CHANGE ORIGINAL
+
+            /* clamp every bond-order component that is < 1e-10 to exactly zero */
+            if( bo_ij->BO < 1e-10 )
+                bo_ij->BO = 0.0;
+            if( bo_ij->BO_s < 1e-10 )
+                bo_ij->BO_s = 0.0;
+            if( bo_ij->BO_pi < 1e-10 )
+                bo_ij->BO_pi = 0.0;
+            if( bo_ij->BO_pi2 < 1e-10 )
+                bo_ij->BO_pi2 = 0.0;
+
+            workspace->total_bond_order[i] += bo_ij->BO; //now keeps total_BO (only i < j entries are added here)
+
+
+            /* fprintf( stderr, "%d %d\t%g %g %g %g\n"
+               "Cdbo:\t%g %g %g\n"
+               "Cdbopi:\t%g %g %g %g\n"
+               "Cdbopi2:%g %g %g %g\n\n", 
+               i+1, j+1, 
+               bonds->select.bond_list[ pj ].d, 
+               bo_ij->BO,bo_ij->BO_pi, bo_ij->BO_pi2, 
+               bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo,
+               bo_ij->C1dbopi, bo_ij->C2dbopi, 
+               bo_ij->C3dbopi, bo_ij->C4dbopi,
+               bo_ij->C1dbopi2,bo_ij->C2dbopi2, 
+               bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
+
+            /* fprintf( stderr, "%d %d  BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
+               i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 );*/
 
 #ifdef TEST_FORCES
-			Set_End_Index( pj, top_dbo, dBOs );
-			Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+            Set_End_Index( pj, top_dbo, dBOs );
+            Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
 #endif
-			//CHANGE ORIGINAL
-			//}
-			//CHANGE ORIGINAL
-			/*
-			   else {
-			// We only need to update bond orders from bo_ji
-			//   everything else is set in uncorrected_bo calculations
-			sym_index = bonds->select.bond_list[pj].sym_index;
-			bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
-			bo_ij->BO = bo_ji->BO;
-			bo_ij->BO_s = bo_ji->BO_s;
-			bo_ij->BO_pi = bo_ji->BO_pi;
-			bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-			workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
+            //CHANGE ORIGINAL
+            //}
+            //CHANGE ORIGINAL
+            /*
+               else {
+            // We only need to update bond orders from bo_ji
+            //   everything else is set in uncorrected_bo calculations
+            sym_index = bonds->select.bond_list[pj].sym_index;
+            bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+            bo_ij->BO = bo_ji->BO;
+            bo_ij->BO_s = bo_ji->BO_s;
+            bo_ij->BO_pi = bo_ji->BO_pi;
+            bo_ij->BO_pi2 = bo_ji->BO_pi2;
+
+            workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
 #ifdef TEST_FORCES
 Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
 #endif
 }
-			 */
-			}
+             */
+            } /* end of if( i < j ) */
 }
 //} COMMENTED FOR CUDA KERNEL
 }
 
 CUDA_GLOBAL void Cuda_Update_Uncorrected_BO (  storage p_workspace, reax_list p_bonds, int N )
 {
-	int i, j, pj;
-	int start_i, end_i;
-	int sym_index;
-	storage *workspace = &( p_workspace );
-	reax_list *bonds = &( p_bonds );
-
-	bond_order_data *bo_ij, *bo_ji;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	start_i = Dev_Start_Index(i, bonds);
-	end_i = Dev_End_Index(i, bonds);
-
-	for( pj = start_i; pj < end_i; ++pj ) {
-
-		j = bonds->select.bond_list[pj].nbr;
-		bo_ij = &( bonds->select.bond_list[pj].bo_data );
-
-		//if( (i >= j)  || (workspace->bond_mark [i] <= 3)) {
-		if( (i >= j) ) {
-
-			/* We only need to update bond orders from bo_ji
-			   everything else is set in uncorrected_bo calculations */
-			sym_index = bonds->select.bond_list[pj].sym_index;
-			bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
-			bo_ij->BO = bo_ji->BO;
-			bo_ij->BO_s = bo_ji->BO_s;
-			bo_ij->BO_pi = bo_ji->BO_pi;
-			bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-			workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
-		}
-	}
-	}
-
-	CUDA_GLOBAL void Cuda_Update_Workspace_After_BO ( reax_atom *my_atoms, global_parameters gp, 
-			single_body_parameters *sbp, storage p_workspace, 
-			int N)
-	{
-		int j, type_j;
-		real explp1;
-		real p_lp1;
-		single_body_parameters *sbp_i, *sbp_j;
-		storage *workspace = &( p_workspace );
-
-		j = blockIdx.x * blockDim.x + threadIdx.x;
-		if (j >= N) return;
-
-		p_lp1 = gp.l[15];
-		/* Calculate some helper variables that are  used at many places
-		   throughout force calculations */
-		//for( j = 0; j < system->N; ++j ){
-		type_j = my_atoms[j].type;
-		sbp_j = &(sbp[ type_j ]);
-
-		workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
-		workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
-		workspace->Delta_boc[j] = workspace->total_bond_order[j] -
-			sbp_j->valency_boc;
-
-		workspace->vlpex[j] = workspace->Delta_e[j] -
-			2.0 * (int)(workspace->Delta_e[j]/2.0);
-		explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
-		workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
-		workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
-		workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
-		/* Adri uses different dDelta_lp values than the ones in notes... */
-		workspace->dDelta_lp[j] = workspace->Clp[j];
-		//workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-		//((fabs(workspace->Delta_e[j]/2.0 -
-		//       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-
-		if( sbp_j->mass > 21.0 ) {
-			workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-			workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-			workspace->dDelta_lp_temp[j] = 0.;
-		}
-		else {
-			workspace->nlp_temp[j] = workspace->nlp[j];
-			workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-			workspace->dDelta_lp_temp[j] = workspace->Clp[j];
-		}
-		//} Commented for Cuda
-	}
-
-
-	CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data,
-			storage *workspace, reax_list *bonds, rvec data_ext_press)
-	{
-		bond_data *nbr_j, *nbr_k;
-		bond_order_data *bo_ij, *bo_ji;
-		dbond_coefficients coef;
-		rvec temp, ext_press;
-		ivec rel_box;
-		int pk, k, j;
-		rvec tf_f;
-
-		/* Initializations */
-		nbr_j = &(bonds->select.bond_list[pj]);
-		j = nbr_j->nbr;
-
-		//bo_ij = &(nbr_j->bo_data);
-		//bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-		if (i < j) {
-			bo_ij = &(nbr_j->bo_data);
-			bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-		} else {
-			bo_ji = &(nbr_j->bo_data);
-			bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-		}
-
-		coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-		coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-		coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-		coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-		coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-		coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-		coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-		coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-		coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-		coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-		coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-		coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-		coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-		coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-
-		/************************************
-		 * forces related to atom i          *
-		 * first neighbors of atom i         *
-		 ************************************/
-		if (i < j) {
-			for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-				nbr_k = &(bonds->select.bond_list[pk]);
-				k = nbr_k->nbr;
-
-				rvec_MakeZero (nbr_k->tf_f);
-
-				rvec_Scale(temp, -coef.C2dbo, nbr_k->bo_data.dBOp);       /*2nd, dBO*/
-				rvec_ScaledAdd(temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
-				rvec_ScaledAdd(temp, -coef.C3dbopi, nbr_k->bo_data.dBOp); /*3rd, dBOpi*/
-				rvec_ScaledAdd(temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);/*3rd, dBOpi2*/
-
-				/* force */
-				rvec_Add( nbr_k->tf_f, temp );
-				/* pressure */
-				rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
-				rvec_Add( data_ext_press, ext_press );
-
-				/* if( !ivec_isZero( nbr_k->rel_box ) )
-				   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
-				   "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-				   i+1, system->my_atoms[i].x[0], 
-				   system->my_atoms[i].x[1], system->my_atoms[i].x[2], 
-				   j+1, k+1, system->my_atoms[k].x[0], 
-				   system->my_atoms[k].x[1], system->my_atoms[k].x[2],
-				   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
-				   nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2],
-				   temp[0], temp[1], temp[2] ); */
-			}
-
-			/* then atom i itself  */
-			rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st,dBO*/
-			rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd,dBO*/
-			rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st,dBO*/
-			rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/
-			rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
-			rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
-			rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/
-
-			rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );  /*1st,dBO_pi2*/
-			rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );         /*2nd,dBO_pi2*/
-			rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/
-
-			/* force */
-			rvec_Add( workspace->f[i], temp );
-			/* ext pressure due to i is dropped, counting force on j will be enough */
-		}
-		else {
-
-			/******************************************************
-			 * forces and pressure related to atom j               * 
-			 * first neighbors of atom j                           *
-			 ******************************************************/
-			for( pk = Dev_Start_Index(j, bonds); pk < Dev_End_Index(j, bonds); ++pk ) {
-				nbr_k = &(bonds->select.bond_list[pk]);
-				k = nbr_k->nbr;
-
-				rvec_MakeZero (nbr_k->tf_f);
-
-				rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );      /*3rd,dBO*/
-				rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
-				rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/
-				rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/
-
-				/* force */
-				rvec_Add( nbr_k->tf_f, temp );
-				/* pressure */
-				if( k != i ) {
-					ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i)
-					rvec_iMultiply( ext_press, rel_box, temp );
-					rvec_Add( data_ext_press, ext_press );
-
-					/* if( !ivec_isZero( rel_box ) )
-					   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
-					   "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-					   i+1, j+1, system->my_atoms[j].x[0], 
-					   system->my_atoms[j].x[1], system->my_atoms[j].x[2], 
-					   k+1, system->my_atoms[k].x[0], 
-					   system->my_atoms[k].x[1], system->my_atoms[k].x[2],
-					   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
-					   rel_box[0], rel_box[1], rel_box[2],
-					   temp[0], temp[1], temp[2] ); */
-				}
-			}
-
-			/* then atom j itself */
-			rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                    /*1st, dBO*/
-			rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );  /*2nd, dBO*/
-			rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );             /*1st, dBO*/
-			rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/
-
-			rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );       /*1st,dBOpi*/
-			rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );             /*2nd,dBOpi*/
-			rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/
-
-			rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );    /*1st,dBOpi2*/
-			rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );           /*2nd,dBOpi2*/
-			rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
-
-			/* force */
-			rvec_Add( workspace->f[j], temp );
-			/* pressure */
-			rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
-			rvec_Add( data->my_ext_press, ext_press );
-
-			/* if( !ivec_isZero( nbr_j->rel_box ) )
-			   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" 
-			   "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-			   i+1, system->my_atoms[i].x[0], system->my_atoms[i].x[1], 
-			   system->my_atoms[i].x[2], 
-			   j+1,system->my_atoms[j].x[0], system->my_atoms[j].x[1], 
-			   system->my_atoms[j].x[2],
-			   j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2],
-			   nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2],
-			   temp[0], temp[1], temp[2] ); */
-		}
-	}
-
-	CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
-			storage *workspace, reax_list *bonds )
-	{
-		bond_data *nbr_j, *nbr_k;
-		bond_order_data *bo_ij, *bo_ji;
-		dbond_coefficients coef;
-		int pk, k, j;
-
-		rvec tf_f;
-		rvec_MakeZero (tf_f);
-
-		/* Initializations */
-		nbr_j = &(bonds->select.bond_list[pj]);
-		j = nbr_j->nbr;
-		//bo_ij = &(nbr_j->bo_data);
-		//bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-
-		if (i < j) {
-			bo_ij = &(nbr_j->bo_data);
-			bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-		} else {
-			bo_ji = &(nbr_j->bo_data);
-			bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-		}
-
-		coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-		coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-		coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-		coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-		coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-		coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-		coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-		coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-		coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-		coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-		coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-		coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-		coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-		coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-		if (i < j) {
-			for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-				nbr_k = &(bonds->select.bond_list[pk]);
-				k = nbr_k->nbr;
-				rvec_MakeZero (tf_f);
-
-				/*2nd,dBO*/
-				rvec_ScaledAdd( tf_f, -coef.C2dbo, nbr_k->bo_data.dBOp );
-				/*dDelta*/
-				rvec_ScaledAdd( tf_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
-				/*3rd, dBOpi*/
-				rvec_ScaledAdd( tf_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
-				/*3rd, dBOpi2*/
-				rvec_ScaledAdd( tf_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
-
-				//Temp storage
-				rvec_Add (nbr_k->tf_f, tf_f);
-			}
-			/*1st, dBO*/
-			rvec_ScaledAdd( workspace->f[i], coef.C1dbo, bo_ij->dBOp );
-			/*2nd, dBO*/
-			rvec_ScaledAdd( workspace->f[i], coef.C2dbo, workspace->dDeltap_self[i] );
-
-			/*1st, dBO*/
-			rvec_ScaledAdd( workspace->f[i], coef.C1dDelta, bo_ij->dBOp );
-			/*2nd, dBO*/
-			rvec_ScaledAdd( workspace->f[i], coef.C2dDelta, workspace->dDeltap_self[i] );
-
-			/*1st, dBOpi*/
-			rvec_ScaledAdd( workspace->f[i], coef.C1dbopi, bo_ij->dln_BOp_pi );
-			/*2nd, dBOpi*/
-			rvec_ScaledAdd( workspace->f[i], coef.C2dbopi, bo_ij->dBOp );
-			/*3rd, dBOpi*/
-			rvec_ScaledAdd( workspace->f[i], coef.C3dbopi, workspace->dDeltap_self[i] );
-
-			/*1st, dBO_pi2*/
-			rvec_ScaledAdd( workspace->f[i], coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-			/*2nd, dBO_pi2*/
-			rvec_ScaledAdd( workspace->f[i], coef.C2dbopi2, bo_ij->dBOp );
-			/*3rd, dBO_pi2*/
-			rvec_ScaledAdd( workspace->f[i], coef.C3dbopi2, workspace->dDeltap_self[i] );
-
-		} else {
-
-			for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-				nbr_k = &(bonds->select.bond_list[pk]);
-				k = nbr_k->nbr;
-				rvec_MakeZero (tf_f);
-
-				/*3rd, dBO*/
-				rvec_ScaledAdd( tf_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
-				/*dDelta*/
-				rvec_ScaledAdd( tf_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
-				/*4th, dBOpi*/
-				rvec_ScaledAdd( tf_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
-				/*4th, dBOpi2*/
-				rvec_ScaledAdd( tf_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
-
-				//Temp Storage
-				rvec_Add (nbr_k->tf_f, tf_f);
-			}
-
-			/*1st,dBO*/
-			rvec_ScaledAdd( workspace->f[i], -coef.C1dbo, bo_ij->dBOp );
-			/*2nd,dBO*/
-			rvec_ScaledAdd( workspace->f[i], coef.C3dbo, workspace->dDeltap_self[i] );
-
-			/*1st, dBO*/
-			rvec_ScaledAdd( workspace->f[i], -coef.C1dDelta, bo_ij->dBOp );
-			/*2nd, dBO*/
-			rvec_ScaledAdd( workspace->f[i], coef.C3dDelta, workspace->dDeltap_self[i] );
-
-			/*1st, dBOpi*/
-			rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi, bo_ij->dln_BOp_pi );
-			/*2nd, dBOpi*/
-			rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi, bo_ij->dBOp );
-			/*3rd, dBOpi*/
-			rvec_ScaledAdd( workspace->f[i], coef.C4dbopi, workspace->dDeltap_self[i] );
-
-			/*1st, dBOpi2*/
-			rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-			/*2nd, dBOpi2*/
-			rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi2, bo_ij->dBOp );
-			/*3rd, dBOpi2*/
-			rvec_ScaledAdd( workspace->f[i], coef.C4dbopi2, workspace->dDeltap_self[i] );
-		}
-	}
-
-	CUDA_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, reax_list *bonds, storage *workspace)
-	{
-		int pk;
-		bond_data *nbr_k, *nbr_k_sym;
-
-		for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
-			nbr_k = &(bonds->select.bond_list[pk]);
-			nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] );
-
-			//rvec_Add (atoms[i].f, nbr_k_sym->tf_f);
-			rvec_Add (workspace->f[i], nbr_k_sym->tf_f);
-		}
-	}
-
-	CUDA_GLOBAL void ker_total_forces_postprocess (reax_atom *my_atoms, reax_list p_bonds, storage p_workspace,  int N)
-	{
-		int i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		reax_list *bonds = &( p_bonds );
-		storage *workspace = &( p_workspace );
-		Cuda_dbond_to_Forces_postprocess (i, my_atoms, bonds, workspace );
-	}
-
-	CUDA_GLOBAL void ker_total_forces (storage p_workspace, reax_list p_bonds, 
-			control_params *control,
-			simulation_data *data, 
-			rvec *data_ext_press,
-			int N )
-	{
-		int i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		int pj;
-		reax_list *bonds = &( p_bonds );
-		storage *workspace = &( p_workspace );
-
-		for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
-			//if( i < bonds->select.bond_list[pj].nbr ) {
-			if( control->virial == 0 )
-				Cuda_Add_dBond_to_Forces( i, pj, workspace, bonds);
-			else 
-				Cuda_Add_dBond_to_Forces_NPT( i, pj, data, workspace, bonds, data_ext_press [i]);
-		//}  
-	}
-
-	void Cuda_Total_Forces (reax_system *system, control_params *control, 
-			simulation_data *data, storage *workspace)
-	{
-		int blocks;
-		rvec *spad_rvec = (rvec *) scratch;
-		cuda_memset (spad_rvec, 0, system->N * 2 * sizeof (rvec), "total_forces:ext_press");
-
-		blocks = system->N / DEF_BLOCK_SIZE + 
-			((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-		ker_total_forces <<< blocks, DEF_BLOCK_SIZE >>>
-			( *dev_workspace, *(*dev_lists + BONDS), 
-			  (control_params *) control->d_control_params, 
-			  (simulation_data *)data->d_simulation_data, 
-			  spad_rvec, system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		if (control->virial != 0) 
-		{
-			//do the reduction here for ext press
-			k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> 
-				( spad_rvec, spad_rvec + system->N, system->N);
-			cudaThreadSynchronize (); 
-			cudaCheckError (); 
-
-			k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
-				( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks);
-			cudaThreadSynchronize (); 
-			cudaCheckError (); 
-		}
-
-		//do the post processing for the atomic forces here
-		ker_total_forces_postprocess  <<< blocks, DEF_BLOCK_SIZE >>>
-			(system->d_my_atoms, *(*dev_lists + BONDS), *dev_workspace, system->N);
-		cudaThreadSynchronize (); 
-		cudaCheckError (); 
-	}
-
-	CUDA_GLOBAL void ker_total_forces_pure (reax_atom *my_atoms, int n, 
-			storage p_workspace)
-	{
-		int i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= n) return;
-
-		storage *workspace = &( p_workspace );
-
-		rvec_Copy (my_atoms[i].f, workspace->f[i]);
-	}
-
-	void Cuda_Total_Forces_PURE (reax_system *system, storage *workspace)
-	{
-		int blocks;
-
-		blocks = system->n / DEF_BLOCK_SIZE + 
-			((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-		ker_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>>
-			( system->d_my_atoms, system->n, *dev_workspace);
-		cudaThreadSynchronize (); 
-		cudaCheckError (); 
-	}
+    int i, j, pj;
+    int start_i, end_i;
+    int sym_index;
+    storage *workspace = &( p_workspace );
+    reax_list *bonds = &( p_bonds );
+
+    bond_order_data *bo_ij, *bo_ji;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    start_i = Dev_Start_Index(i, bonds);
+    end_i = Dev_End_Index(i, bonds);
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+
+        j = bonds->select.bond_list[pj].nbr;
+        bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+        //if( (i >= j)  || (workspace->bond_mark [i] <= 3)) {
+        if( (i >= j) ) {
+
+            /* We only need to update bond orders from bo_ji
+               everything else is set in uncorrected_bo calculations */
+            sym_index = bonds->select.bond_list[pj].sym_index;
+            bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+            bo_ij->BO = bo_ji->BO;
+            bo_ij->BO_s = bo_ji->BO_s;
+            bo_ij->BO_pi = bo_ji->BO_pi;
+            bo_ij->BO_pi2 = bo_ji->BO_pi2;
+
+            workspace->total_bond_order[i] += bo_ij->BO;// now keeps total_BO
+        }
+    }
+    }
+
+    CUDA_GLOBAL void Cuda_Update_Workspace_After_BO ( reax_atom *my_atoms, global_parameters gp, 
+            single_body_parameters *sbp, storage p_workspace, 
+            int N)
+    {
+        int j, type_j;
+        real explp1;
+        real p_lp1;
+        single_body_parameters *sbp_j;
+        storage *workspace = &( p_workspace );
+        /* one thread per atom j; surplus threads (j >= N) exit at once */
+        j = blockIdx.x * blockDim.x + threadIdx.x;
+        if (j >= N) return;
+
+        p_lp1 = gp.l[15];
+        /* Derive, from total_bond_order[j] and the single-body parameters of
+           atom j's type, the per-atom helper values (Delta*, vlpex, nlp, Clp,
+           dDelta_lp and the *_temp variants) stored in the workspace below. */
+        type_j = my_atoms[j].type;
+        sbp_j = &(sbp[ type_j ]);
+
+        workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
+        workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
+        workspace->Delta_boc[j] = workspace->total_bond_order[j] -
+            sbp_j->valency_boc;
+
+        workspace->vlpex[j] = workspace->Delta_e[j] -
+            2.0 * (int)(workspace->Delta_e[j]/2.0); /* (int) truncates toward zero */
+        explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
+        workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
+        workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
+        workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
+        /* Adri uses different dDelta_lp values than the ones in notes... */
+        workspace->dDelta_lp[j] = workspace->Clp[j];
+        //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
+        //((fabs(workspace->Delta_e[j]/2.0 -
+        //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
+        /* above the 21.0 mass threshold nlp_temp depends only on the valency
+         * parameters and its derivative is zero; otherwise it mirrors nlp/Clp */
+        if( sbp_j->mass > 21.0 ) {
+            workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+            workspace->dDelta_lp_temp[j] = 0.;
+        }
+        else {
+            workspace->nlp_temp[j] = workspace->nlp[j];
+            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+            workspace->dDelta_lp_temp[j] = workspace->Clp[j];
+        }
+    }
+
+
+    CUDA_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, simulation_data *data,
+            storage *workspace, reax_list *bonds, rvec data_ext_press)
+    {
+        bond_data *nbr_j, *nbr_k;
+        bond_order_data *bo_ij, *bo_ji;
+        dbond_coefficients coef;
+        rvec temp, ext_press;
+        ivec rel_box;
+        int pk, k, j;
+        /* Adds the dBond force terms for entry pj of atom i's bond list and
+         * accumulates the matching pressure terms into data_ext_press. */
+        /* Initializations */
+        nbr_j = &(bonds->select.bond_list[pj]);
+        j = nbr_j->nbr;
+
+        /* bo_ij is this entry's record when i < j and the sym_index entry's
+         * record otherwise; bo_ji is whichever of the two bo_ij is not. */
+        if (i < j) {
+            bo_ij = &(nbr_j->bo_data);
+            bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+        } else {
+            bo_ji = &(nbr_j->bo_data);
+            bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+        }
+
+        coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+        coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+        coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+        coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+        coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+        coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+        coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+        coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+        coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+        coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+        coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+        coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+        coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+        coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+        /* NOTE(review): the i >= j branch below writes f[j] and tf_f entries of
+         * j's bond list from the thread handling i -- confirm this cannot race. */
+        /************************************
+         * forces related to atom i          *
+         * first neighbors of atom i         *
+         ************************************/
+        if (i < j) {
+            for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
+                nbr_k = &(bonds->select.bond_list[pk]);
+                k = nbr_k->nbr;
+                /* NOTE(review): zeroing discards tf_f from earlier pj calls; the non-NPT routine accumulates -- confirm */
+                rvec_MakeZero (nbr_k->tf_f);
+
+                rvec_Scale(temp, -coef.C2dbo, nbr_k->bo_data.dBOp);       /*2nd, dBO*/
+                rvec_ScaledAdd(temp, -coef.C2dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
+                rvec_ScaledAdd(temp, -coef.C3dbopi, nbr_k->bo_data.dBOp); /*3rd, dBOpi*/
+                rvec_ScaledAdd(temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp);/*3rd, dBOpi2*/
+
+                /* force */
+                rvec_Add( nbr_k->tf_f, temp );
+                /* pressure */
+                rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
+                rvec_Add( data_ext_press, ext_press );
+
+                /* if( !ivec_isZero( nbr_k->rel_box ) )
+                   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
+                   "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+                   i+1, system->my_atoms[i].x[0], 
+                   system->my_atoms[i].x[1], system->my_atoms[i].x[2], 
+                   j+1, k+1, system->my_atoms[k].x[0], 
+                   system->my_atoms[k].x[1], system->my_atoms[k].x[2],
+                   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
+                   nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2],
+                   temp[0], temp[1], temp[2] ); */
+            }
+
+            /* then atom i itself  */
+            rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st,dBO*/
+            rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd,dBO*/
+            rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st,dBO*/
+            rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd,dBO*/
+            rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
+            rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
+            rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i]);/*3rd,dBOpi*/
+
+            rvec_ScaledAdd( temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );  /*1st,dBO_pi2*/
+            rvec_ScaledAdd( temp, coef.C2dbopi2, bo_ij->dBOp );         /*2nd,dBO_pi2*/
+            rvec_ScaledAdd( temp, coef.C3dbopi2, workspace->dDeltap_self[i] );/*3rd*/
+
+            /* force */
+            rvec_Add( workspace->f[i], temp );
+            /* ext pressure due to i is dropped, counting force on j will be enough */
+        }
+        else {
+
+            /******************************************************
+             * forces and pressure related to atom j               * 
+             * first neighbors of atom j                           *
+             ******************************************************/
+            for( pk = Dev_Start_Index(j, bonds); pk < Dev_End_Index(j, bonds); ++pk ) {
+                nbr_k = &(bonds->select.bond_list[pk]);
+                k = nbr_k->nbr;
+
+                rvec_MakeZero (nbr_k->tf_f);
+
+                rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );      /*3rd,dBO*/
+                rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp);/*dDelta*/
+                rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp); /*4th,dBOpi*/
+                rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp);/*4th,dBOpi2*/
+
+                /* force */
+                rvec_Add( nbr_k->tf_f, temp );
+                /* pressure */
+                if( k != i ) {
+                    ivec_Sum( rel_box, nbr_k->rel_box, nbr_j->rel_box ); //rel_box(k, i)
+                    rvec_iMultiply( ext_press, rel_box, temp );
+                    rvec_Add( data_ext_press, ext_press );
+
+                    /* if( !ivec_isZero( rel_box ) )
+                       fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]"
+                       "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+                       i+1, j+1, system->my_atoms[j].x[0], 
+                       system->my_atoms[j].x[1], system->my_atoms[j].x[2], 
+                       k+1, system->my_atoms[k].x[0], 
+                       system->my_atoms[k].x[1], system->my_atoms[k].x[2],
+                       nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
+                       rel_box[0], rel_box[1], rel_box[2],
+                       temp[0], temp[1], temp[2] ); */
+                }
+            }
+
+            /* then atom j itself */
+            rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                    /*1st, dBO*/
+            rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );  /*2nd, dBO*/
+            rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );             /*1st, dBO*/
+            rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j]);/*2nd, dBO*/
+
+            rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );       /*1st,dBOpi*/
+            rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );             /*2nd,dBOpi*/
+            rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j]);/*3rd,dBOpi*/
+
+            rvec_ScaledAdd( temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );    /*1st,dBOpi2*/
+            rvec_ScaledAdd( temp, -coef.C2dbopi2, bo_ij->dBOp );           /*2nd,dBOpi2*/
+            rvec_ScaledAdd( temp,coef.C4dbopi2,workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
+
+            /* force */
+            rvec_Add( workspace->f[j], temp );
+            /* pressure: accumulate into data_ext_press, as at the neighbor sites above */
+            rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
+            rvec_Add( data_ext_press, ext_press );
+            /* (adding to data->my_ext_press here would be an unsynchronized update of one rvec shared by all threads) */
+            /* if( !ivec_isZero( nbr_j->rel_box ) )
+               fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f]" 
+               "ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+               i+1, system->my_atoms[i].x[0], system->my_atoms[i].x[1], 
+               system->my_atoms[i].x[2], 
+               j+1,system->my_atoms[j].x[0], system->my_atoms[j].x[1], 
+               system->my_atoms[j].x[2],
+               j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2],
+               nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2],
+               temp[0], temp[1], temp[2] ); */
+        }
+    }
+
+    CUDA_DEVICE void Cuda_Add_dBond_to_Forces( int i, int pj,
+            storage *workspace, reax_list *bonds )
+    {
+        bond_data *nbr_j, *nbr_k;
+        bond_order_data *bo_ij, *bo_ji;
+        dbond_coefficients coef;
+        int pk, j;
+        /* Adds the dBond force terms of bond entry pj to f[i] and to tf_f of i's bond entries. */
+        rvec tf_f;
+        rvec_MakeZero (tf_f);
+
+        /* Initializations */
+        nbr_j = &(bonds->select.bond_list[pj]);
+        j = nbr_j->nbr;
+        /* bo_ij is this entry's record when i < j and the sym_index entry's
+         * record otherwise; bo_ji is whichever of the two bo_ij is not. */
+
+        if (i < j) {
+            bo_ij = &(nbr_j->bo_data);
+            bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+        } else {
+            bo_ji = &(nbr_j->bo_data);
+            bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+        }
+
+        coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+        coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+        coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+        coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+        coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+        coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+        coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+        coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+        coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+        coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+        coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+        coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+        coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+        coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+        if (i < j) {
+            for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
+                nbr_k = &(bonds->select.bond_list[pk]);
+                /* build this neighbor's contribution from zero */
+                rvec_MakeZero (tf_f);
+
+                /*2nd,dBO*/
+                rvec_ScaledAdd( tf_f, -coef.C2dbo, nbr_k->bo_data.dBOp );
+                /*dDelta*/
+                rvec_ScaledAdd( tf_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
+                /*3rd, dBOpi*/
+                rvec_ScaledAdd( tf_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
+                /*3rd, dBOpi2*/
+                rvec_ScaledAdd( tf_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
+
+                /* park it in the bond entry; Cuda_dbond_to_Forces_postprocess reads tf_f via sym_index */
+                rvec_Add (nbr_k->tf_f, tf_f);
+            }
+            /*1st, dBO*/
+            rvec_ScaledAdd( workspace->f[i], coef.C1dbo, bo_ij->dBOp );
+            /*2nd, dBO*/
+            rvec_ScaledAdd( workspace->f[i], coef.C2dbo, workspace->dDeltap_self[i] );
+
+            /*1st, dBO*/
+            rvec_ScaledAdd( workspace->f[i], coef.C1dDelta, bo_ij->dBOp );
+            /*2nd, dBO*/
+            rvec_ScaledAdd( workspace->f[i], coef.C2dDelta, workspace->dDeltap_self[i] );
+
+            /*1st, dBOpi*/
+            rvec_ScaledAdd( workspace->f[i], coef.C1dbopi, bo_ij->dln_BOp_pi );
+            /*2nd, dBOpi*/
+            rvec_ScaledAdd( workspace->f[i], coef.C2dbopi, bo_ij->dBOp );
+            /*3rd, dBOpi*/
+            rvec_ScaledAdd( workspace->f[i], coef.C3dbopi, workspace->dDeltap_self[i] );
+
+            /*1st, dBO_pi2*/
+            rvec_ScaledAdd( workspace->f[i], coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+            /*2nd, dBO_pi2*/
+            rvec_ScaledAdd( workspace->f[i], coef.C2dbopi2, bo_ij->dBOp );
+            /*3rd, dBO_pi2*/
+            rvec_ScaledAdd( workspace->f[i], coef.C3dbopi2, workspace->dDeltap_self[i] );
+
+        } else {
+
+            for( pk = Dev_Start_Index(i, bonds); pk < Dev_End_Index(i, bonds); ++pk ) {
+                nbr_k = &(bonds->select.bond_list[pk]);
+                /* build this neighbor's contribution from zero */
+                rvec_MakeZero (tf_f);
+
+                /*3rd, dBO*/
+                rvec_ScaledAdd( tf_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
+                /*dDelta*/
+                rvec_ScaledAdd( tf_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
+                /*4th, dBOpi*/
+                rvec_ScaledAdd( tf_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
+                /*4th, dBOpi2*/
+                rvec_ScaledAdd( tf_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
+
+                /* park it in the bond entry; Cuda_dbond_to_Forces_postprocess reads tf_f via sym_index */
+                rvec_Add (nbr_k->tf_f, tf_f);
+            }
+
+            /*1st,dBO*/
+            rvec_ScaledAdd( workspace->f[i], -coef.C1dbo, bo_ij->dBOp );
+            /*2nd,dBO*/
+            rvec_ScaledAdd( workspace->f[i], coef.C3dbo, workspace->dDeltap_self[i] );
+
+            /*1st, dBO*/
+            rvec_ScaledAdd( workspace->f[i], -coef.C1dDelta, bo_ij->dBOp );
+            /*2nd, dBO*/
+            rvec_ScaledAdd( workspace->f[i], coef.C3dDelta, workspace->dDeltap_self[i] );
+
+            /*1st, dBOpi*/
+            rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi, bo_ij->dln_BOp_pi );
+            /*2nd, dBOpi*/
+            rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi, bo_ij->dBOp );
+            /*3rd, dBOpi*/
+            rvec_ScaledAdd( workspace->f[i], coef.C4dbopi, workspace->dDeltap_self[i] );
+
+            /*1st, dBOpi2*/
+            rvec_ScaledAdd( workspace->f[i], -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+            /*2nd, dBOpi2*/
+            rvec_ScaledAdd( workspace->f[i], -coef.C2dbopi2, bo_ij->dBOp );
+            /*3rd, dBOpi2*/
+            rvec_ScaledAdd( workspace->f[i], coef.C4dbopi2, workspace->dDeltap_self[i] );
+        }
+    }
+
+    CUDA_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, reax_list *bonds, storage *workspace)
+    {
+        /* For every entry in atom i's bond-list range, add to f[i] the tf_f
+         * stored in the entry that this one points to through sym_index. */
+        int slot;
+
+        for( slot = Dev_Start_Index(i, bonds); slot < Dev_End_Index(i, bonds); ++slot ) {
+            /* presumably the same bond as recorded in the neighbor's list -- confirm */
+            const int mirror = bonds->select.bond_list[slot].sym_index;
+
+            rvec_Add (workspace->f[i], bonds->select.bond_list[mirror].tf_f);
+        }
+    }
+
+    CUDA_GLOBAL void ker_total_forces_postprocess (reax_atom *my_atoms, reax_list p_bonds, storage p_workspace,  int N)
+    {
+        /* One thread per atom: threads whose index is below N run the tf_f
+         * gather for their atom; the rest do nothing. */
+        const int atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (atom < N)
+            Cuda_dbond_to_Forces_postprocess (atom, my_atoms, &p_bonds, &p_workspace);
+    }
+
+    CUDA_GLOBAL void ker_total_forces (storage p_workspace, reax_list p_bonds, 
+            control_params *control,
+            simulation_data *data, 
+            rvec *data_ext_press,
+            int N )
+    {
+        /* One thread per atom. Each in-range thread walks its atom's bond-list
+         * range; control->virial == 0 selects the plain force routine, any
+         * other value the NPT routine, which also receives this thread's own
+         * element of data_ext_press. */
+        const int atom = blockIdx.x * blockDim.x + threadIdx.x;
+        reax_list *bond_list = &p_bonds;
+        int slot;
+        if (atom >= N) return;
+        for( slot = Dev_Start_Index(atom, bond_list); slot < Dev_End_Index(atom, bond_list); ++slot ) {
+            if( control->virial != 0 )
+                Cuda_Add_dBond_to_Forces_NPT( atom, slot, data, &p_workspace, bond_list, data_ext_press [atom]);
+            else
+                Cuda_Add_dBond_to_Forces( atom, slot, &p_workspace, bond_list);
+        }
+    }
+
+    void Cuda_Total_Forces (reax_system *system, control_params *control, 
+            simulation_data *data, storage *workspace) /* workspace is not read; kernels get *dev_workspace */
+    {
+        int blocks; /* grid size: system->N / DEF_BLOCK_SIZE, rounded up */
+        rvec *spad_rvec = (rvec *) scratch; /* scratch viewed as 2 * N rvecs, zeroed below */
+        cuda_memset (spad_rvec, 0, system->N * 2 * sizeof (rvec), "total_forces:ext_press");
+        /* Host driver: per-atom force kernel, then (virial != 0 only) a two-launch ext_press reduction, then the tf_f gather. */
+        blocks = system->N / DEF_BLOCK_SIZE + 
+            ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+        ker_total_forces <<< blocks, DEF_BLOCK_SIZE >>>
+            ( *dev_workspace, *(*dev_lists + BONDS), 
+              (control_params *) control->d_control_params, 
+              (simulation_data *)data->d_simulation_data, 
+              spad_rvec, system->N );
+        cudaDeviceSynchronize (); /* replaces the deprecated, identical cudaThreadSynchronize */
+        cudaCheckError ();
+
+        if (control->virial != 0) 
+        {
+            /* first launch: reduce the N per-atom entries into the second half of the scratch area */
+            k_reduction_rvec <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>> 
+                ( spad_rvec, spad_rvec + system->N, system->N);
+            cudaDeviceSynchronize ();
+            cudaCheckError (); 
+            /* second launch: reduce those `blocks` entries into the device-side my_ext_press */
+            k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
+                ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, blocks);
+            cudaDeviceSynchronize ();
+            cudaCheckError (); 
+        }
+
+        /* gather the tf_f partial forces parked in the bond list into workspace->f */
+        ker_total_forces_postprocess  <<< blocks, DEF_BLOCK_SIZE >>>
+            (system->d_my_atoms, *(*dev_lists + BONDS), *dev_workspace, system->N);
+        cudaDeviceSynchronize ();
+        cudaCheckError (); 
+    }
+
+    CUDA_GLOBAL void ker_total_forces_pure (reax_atom *my_atoms, int n, 
+            storage p_workspace)
+    {
+        /* One thread per atom: copy the workspace force of each atom below n
+         * into that atom's own f field; threads past n do nothing. */
+        const int atom = blockIdx.x * blockDim.x + threadIdx.x;
+
+        if (atom < n)
+            rvec_Copy (my_atoms[atom].f, p_workspace.f[atom]);
+    }
+
+    void Cuda_Total_Forces_PURE (reax_system *system, storage *workspace)
+    {
+        int blocks;
+        /* Launch one thread per atom (system->n of them) to copy dev_workspace forces into d_my_atoms; workspace is not read. */
+        blocks = system->n / DEF_BLOCK_SIZE + 
+            ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+        ker_total_forces_pure <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->n, *dev_workspace);
+        cudaDeviceSynchronize (); /* replaces the deprecated, identical cudaThreadSynchronize */
+        cudaCheckError (); 
+    }
diff --git a/PG-PuReMD/src/cuda_bonds.cu b/PG-PuReMD/src/cuda_bonds.cu
index bce63602..90f1480b 100644
--- a/PG-PuReMD/src/cuda_bonds.cu
+++ b/PG-PuReMD/src/cuda_bonds.cu
@@ -26,124 +26,124 @@
 
 
 CUDA_GLOBAL void Cuda_Bonds( reax_atom *my_atoms, 
-		global_parameters gp, 
-		single_body_parameters *sbp, 
-		two_body_parameters *tbp, 
-		storage p_workspace, 
-		reax_list p_bonds, 
-		int n, int num_atom_types, 
-		real *e_bond
-		)
+        global_parameters gp, 
+        single_body_parameters *sbp, 
+        two_body_parameters *tbp, 
+        storage p_workspace, 
+        reax_list p_bonds, 
+        int n, int num_atom_types, 
+        real *e_bond
+        )
 {
-	int i, j, pj, natoms;
-	int start_i, end_i;
-	int type_i, type_j;
-	real ebond, pow_BOs_be2, exp_be12, CEbo;
-	real gp3, gp4, gp7, gp10, gp37;
-	real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
-	real decobdbo, decobdboua, decobdboub;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_order_data *bo_ij;
-	reax_list *bonds;
-	storage *workspace;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
-
-	bonds = &( p_bonds);
-	workspace = &( p_workspace );
-	gp3 = gp.l[3];
-	gp4 = gp.l[4];
-	gp7 = gp.l[7];
-	gp10 = gp.l[10];
-	gp37 = (int) gp.l[37];
-
-	//for( i = 0; i < natoms; ++i ) {
-	start_i = Dev_Start_Index(i, bonds);
-	end_i = Dev_End_Index(i, bonds);
-
-	for( pj = start_i; pj < end_i; ++pj ) {
-		j = bonds->select.bond_list[pj].nbr;
-
-		if( my_atoms[i].orig_id <= my_atoms[j].orig_id ) {
-			/* set the pointers */
-			type_i = my_atoms[i].type;
-			type_j = my_atoms[j].type;
-			sbp_i = &( sbp[type_i] );
-			sbp_j = &( sbp[type_j] );
-
-			twbp = &( tbp[ index_tbp (type_i,type_j, num_atom_types) ] );
-			bo_ij = &( bonds->select.bond_list[pj].bo_data );
-
-			/* calculate the constants */
-			pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
-			exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-			CEbo = -twbp->De_s * exp_be12 * 
-				( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
-
-			/* calculate the Bond Energy */
-			e_bond[ i ] += ebond = 
-				-twbp->De_s * bo_ij->BO_s * exp_be12 
-				-twbp->De_p * bo_ij->BO_pi 
-				-twbp->De_pp * bo_ij->BO_pi2;
-
-			/* calculate derivatives of Bond Orders */
-			bo_ij->Cdbo += CEbo;
-			bo_ij->Cdbopi -= (CEbo + twbp->De_p);
-			bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+    int i, j, pj, natoms;
+    int start_i, end_i;
+    int type_i, type_j;
+    real ebond, pow_BOs_be2, exp_be12, CEbo;
+    real gp3, gp4, gp7, gp10, gp37;
+    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+    real decobdbo, decobdboua, decobdboub;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij;
+    reax_list *bonds;
+    storage *workspace;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    bonds = &( p_bonds);
+    workspace = &( p_workspace );
+    gp3 = gp.l[3];
+    gp4 = gp.l[4];
+    gp7 = gp.l[7];
+    gp10 = gp.l[10];
+    gp37 = (int) gp.l[37];
+
+    //for( i = 0; i < natoms; ++i ) {
+    start_i = Dev_Start_Index(i, bonds);
+    end_i = Dev_End_Index(i, bonds);
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+        j = bonds->select.bond_list[pj].nbr;
+
+        if( my_atoms[i].orig_id <= my_atoms[j].orig_id ) {
+            /* set the pointers */
+            type_i = my_atoms[i].type;
+            type_j = my_atoms[j].type;
+            sbp_i = &( sbp[type_i] );
+            sbp_j = &( sbp[type_j] );
+
+            twbp = &( tbp[ index_tbp (type_i,type_j, num_atom_types) ] );
+            bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+            /* calculate the constants */
+            pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
+            exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+            CEbo = -twbp->De_s * exp_be12 * 
+                ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+
+            /* calculate the Bond Energy */
+            e_bond[ i ] += ebond = 
+                -twbp->De_s * bo_ij->BO_s * exp_be12 
+                -twbp->De_p * bo_ij->BO_pi 
+                -twbp->De_pp * bo_ij->BO_pi2;
+
+            /* calculate derivatives of Bond Orders */
+            bo_ij->Cdbo += CEbo;
+            bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+            bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
 
 #ifdef TEST_ENERGY
-			//fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e%24.15e\n",
-			fprintf( out_control->ebond, "%6d%6d%12.4f%12.4f%12.4f\n",
-					system->my_atoms[i].orig_id, 
-					system->my_atoms[j].orig_id, 
-					bo_ij->BO, ebond, data->my_en.e_bond );
+            //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e%24.15e\n",
+            fprintf( out_control->ebond, "%6d%6d%12.4f%12.4f%12.4f\n",
+                    system->my_atoms[i].orig_id, 
+                    system->my_atoms[j].orig_id, 
+                    bo_ij->BO, ebond, data->my_en.e_bond );
 #endif
 #ifdef TEST_FORCES
-			Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-			Add_dBOpinpi2( system, lists, i, pj, 
-					-(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-					workspace->f_be, workspace->f_be );
+            Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
+            Add_dBOpinpi2( system, lists, i, pj, 
+                    -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
+                    workspace->f_be, workspace->f_be );
 #endif
-			/* Stabilisation terminal triple bond */
-			if( bo_ij->BO >= 1.00 ) {
-				if( gp37 == 2 ||
-						(sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-						(sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
-					exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
-					exphua1 = EXP(-gp3 * (workspace->total_bond_order[i]-bo_ij->BO));
-					exphub1 = EXP(-gp3 * (workspace->total_bond_order[j]-bo_ij->BO));
-					exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
-					hulpov = 1.0 / (1.0 + 25.0 * exphuov);
-
-					estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-					e_bond [i] += estriph;
-
-					decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
-						( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-					decobdboua = -gp10 * exphu * hulpov *
-						(gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-					decobdboub = -gp10 * exphu * hulpov *
-						(gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-
-					bo_ij->Cdbo += decobdbo;
-					workspace->CdDelta[i] += decobdboua;
-					workspace->CdDelta[j] += decobdboub;
+            /* Stabilisation terminal triple bond */
+            if( bo_ij->BO >= 1.00 ) {
+                if( gp37 == 2 ||
+                        (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
+                        (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                    exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                    exphua1 = EXP(-gp3 * (workspace->total_bond_order[i]-bo_ij->BO));
+                    exphub1 = EXP(-gp3 * (workspace->total_bond_order[j]-bo_ij->BO));
+                    exphuov = EXP(gp4 * (workspace->Delta[i] + workspace->Delta[j]));
+                    hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+
+                    estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                    e_bond [i] += estriph;
+
+                    decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) *
+                        ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+                    decobdboua = -gp10 * exphu * hulpov *
+                        (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                    decobdboub = -gp10 * exphu * hulpov *
+                        (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+
+                    bo_ij->Cdbo += decobdbo;
+                    workspace->CdDelta[i] += decobdboua;
+                    workspace->CdDelta[j] += decobdboub;
 #ifdef TEST_ENERGY
-					//fprintf( out_control->ebond, 
-					//  "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-					//  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-					//  estriph, decobdbo, decobdboua, decobdboub );
+                    //fprintf( out_control->ebond, 
+                    //  "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                    //  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                    //  estriph, decobdbo, decobdboua, decobdboub );
 #endif
 #ifdef TEST_FORCES
-					Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-					Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-					Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                    Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                    Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                    Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
 #endif
-				}
-			}
-		}
-	}
-	//  }
+                }
+            }
+        }
+    }
+    //  }
 }
diff --git a/PG-PuReMD/src/cuda_copy.cu b/PG-PuReMD/src/cuda_copy.cu
index 4172fc35..a40a5b90 100644
--- a/PG-PuReMD/src/cuda_copy.cu
+++ b/PG-PuReMD/src/cuda_copy.cu
@@ -13,151 +13,151 @@ extern "C" void Delete_List( reax_list*);
 
 void Sync_Grid (grid *host, grid *device)
 {
-	int total;
-	grid_cell local_cell;
-	total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-
-	ivec_Copy (device->ncells, host->ncells);
-	rvec_Copy (device->cell_len, host->cell_len);
-	rvec_Copy (device->inv_len, host->inv_len);
-
-	ivec_Copy (device->bond_span, host->bond_span );
-	ivec_Copy (device->nonb_span, host->nonb_span );
-	ivec_Copy (device->vlist_span, host->vlist_span );
-
-	ivec_Copy (device->native_cells, host->native_cells );
-	ivec_Copy (device->native_str, host->native_str );
-	ivec_Copy (device->native_end, host->native_end );
-
-	device->ghost_cut = host->ghost_cut;
-	ivec_Copy (device->ghost_span, host->ghost_span );
-	ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span );
-	ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span );
-	ivec_Copy (device->ghost_bond_span, host->ghost_bond_span );
-
-	copy_host_device (host->str, device->str, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:str");
-	copy_host_device (host->end, device->end, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:end");
-	copy_host_device (host->cutoff, device->cutoff, sizeof (real) * total, cudaMemcpyHostToDevice, "grid:cutoff");
-	copy_host_device (host->nbrs_x, device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
-	copy_host_device (host->nbrs_cp, device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
-
-	copy_host_device (host->rel_box, device->rel_box, sizeof (ivec) * total, cudaMemcpyHostToDevice, "grid:rel_box");
-
-	device->max_nbrs = host->max_nbrs;
-
-	/*
-	   for (int i = 0; i < total; i++) {
-
-	   copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-cuda_copy");
-
-	//fprintf (stderr, " Atoms address %ld (%d) \n", local_cell.atoms, host->max_atoms );
-	//cuda_memset (local_cell.atoms, 0, sizeof (int) * host->max_atoms, "grid:cell:atoms-memset");
-	//fprintf (stderr, "host native atoms -> %d %d \n", host->native_str[0], host->native_end[0]);
-	//fprintf (stderr, "host atoms -> %d \n", host->cells[i].atoms[i]);
-	//fprintf (stderr, "Host Max atoms : %d \n", host->max_atoms ); 
-	//copy_host_device (host->cells[i].atoms, 
-	//		(local_cell.atoms), sizeof (int) * host->max_atoms, cudaMemcpyHostToDevice, "grid:cell:atoms");
-
-	////////////////////////////////////////////
-	//No need to copy atoms from the cells from host to device. 
-	// str and end has positions in the d_my_atoms list, which are just indexes into this list
-	// this index is used in the cuda_neighbors to compute the neighbors. 
-	// This is the only place where atoms is used. 
-	////////////////////////////////////////////////
-
-	//fprintf (stderr, " cells:nbrs_x %ld \n", local_cell.nbrs_x);
-	copy_host_device (host->cells[i].nbrs_x, 
-	local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
-
-	//fprintf (stderr, " Atoms address %ld \n", local_cell.nbrs_cp);
-	copy_host_device (host->cells[i].nbrs_cp, 
-	local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
-
-	//no need to copy pointers for device->cells[i].nbrs. 
-	// we can extract the pointer by nbrs_x (ivec) into the cells array. 
-	// This makes nbrs member redundant on the device
-
-	local_cell.cutoff = host->cells[i].cutoff;
-	rvec_Copy (local_cell.min, host->cells[i].min);
-	rvec_Copy (local_cell.max, host->cells[i].max);
-	ivec_Copy (local_cell.rel_box, host->cells[i].rel_box);
-
-	local_cell.mark = host->cells[i].mark;
-	local_cell.type = host->cells[i].type;
-	local_cell.str = host->cells[i].str;
-	local_cell.end = host->cells[i].end;
-	local_cell.top = host->cells[i].top;
-
-	copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), 
-	cudaMemcpyHostToDevice, "grid:cell-cuda_copy");
-	}
-	 */
+    int total;
+    grid_cell local_cell;
+    total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+
+    ivec_Copy (device->ncells, host->ncells);
+    rvec_Copy (device->cell_len, host->cell_len);
+    rvec_Copy (device->inv_len, host->inv_len);
+
+    ivec_Copy (device->bond_span, host->bond_span );
+    ivec_Copy (device->nonb_span, host->nonb_span );
+    ivec_Copy (device->vlist_span, host->vlist_span );
+
+    ivec_Copy (device->native_cells, host->native_cells );
+    ivec_Copy (device->native_str, host->native_str );
+    ivec_Copy (device->native_end, host->native_end );
+
+    device->ghost_cut = host->ghost_cut;
+    ivec_Copy (device->ghost_span, host->ghost_span );
+    ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span );
+    ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span );
+    ivec_Copy (device->ghost_bond_span, host->ghost_bond_span );
+
+    copy_host_device (host->str, device->str, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:str");
+    copy_host_device (host->end, device->end, sizeof (int) * total, cudaMemcpyHostToDevice, "grid:end");
+    copy_host_device (host->cutoff, device->cutoff, sizeof (real) * total, cudaMemcpyHostToDevice, "grid:cutoff");
+    copy_host_device (host->nbrs_x, device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
+    copy_host_device (host->nbrs_cp, device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
+
+    copy_host_device (host->rel_box, device->rel_box, sizeof (ivec) * total, cudaMemcpyHostToDevice, "grid:rel_box");
+
+    device->max_nbrs = host->max_nbrs;
+
+    /*
+       for (int i = 0; i < total; i++) {
+
+       copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-cuda_copy");
+
+    //fprintf (stderr, " Atoms address %ld (%d) \n", local_cell.atoms, host->max_atoms );
+    //cuda_memset (local_cell.atoms, 0, sizeof (int) * host->max_atoms, "grid:cell:atoms-memset");
+    //fprintf (stderr, "host native atoms -> %d %d \n", host->native_str[0], host->native_end[0]);
+    //fprintf (stderr, "host atoms -> %d \n", host->cells[i].atoms[i]);
+    //fprintf (stderr, "Host Max atoms : %d \n", host->max_atoms ); 
+    //copy_host_device (host->cells[i].atoms, 
+    //        (local_cell.atoms), sizeof (int) * host->max_atoms, cudaMemcpyHostToDevice, "grid:cell:atoms");
+
+    ////////////////////////////////////////////
+    //No need to copy atoms from the cells from host to device. 
+    // str and end has positions in the d_my_atoms list, which are just indexes into this list
+    // this index is used in the cuda_neighbors to compute the neighbors. 
+    // This is the only place where atoms is used. 
+    ////////////////////////////////////////////////
+
+    //fprintf (stderr, " cells:nbrs_x %ld \n", local_cell.nbrs_x);
+    copy_host_device (host->cells[i].nbrs_x, 
+    local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_x");
+
+    //fprintf (stderr, " Atoms address %ld \n", local_cell.nbrs_cp);
+    copy_host_device (host->cells[i].nbrs_cp, 
+    local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, cudaMemcpyHostToDevice, "grid:nbrs_cp");
+
+    //no need to copy pointers for device->cells[i].nbrs. 
+    // we can extract the pointer by nbrs_x (ivec) into the cells array. 
+    // This makes nbrs member redundant on the device
+
+    local_cell.cutoff = host->cells[i].cutoff;
+    rvec_Copy (local_cell.min, host->cells[i].min);
+    rvec_Copy (local_cell.max, host->cells[i].max);
+    ivec_Copy (local_cell.rel_box, host->cells[i].rel_box);
+
+    local_cell.mark = host->cells[i].mark;
+    local_cell.type = host->cells[i].type;
+    local_cell.str = host->cells[i].str;
+    local_cell.end = host->cells[i].end;
+    local_cell.top = host->cells[i].top;
+
+    copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), 
+    cudaMemcpyHostToDevice, "grid:cell-cuda_copy");
+    }
+     */
 }
 
 void Sync_Atoms (reax_system *sys)
 {
-	//TODO
-	//TODO METIN FIX, coredump on his machine
-	//TODO
-	//TODO
-	//copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->total_cap, cudaMemcpyHostToDevice, "system:my_atoms");
+    //TODO
+    //TODO METIN FIX, coredump on his machine
+    //TODO
+    //TODO
+    //copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->total_cap, cudaMemcpyHostToDevice, "system:my_atoms");
 #if defined(__CUDA_DEBUG_LOG__)
-	fprintf (stderr, "p:%d - Synching atoms: n: %d N: %d, total_cap: %d \n", 
-			sys->my_rank, sys->n, sys->N, sys->total_cap);
+    fprintf (stderr, "p:%d - Synching atoms: n: %d N: %d, total_cap: %d \n", 
+            sys->my_rank, sys->n, sys->N, sys->total_cap);
 #endif
-	copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->N, cudaMemcpyHostToDevice, "system:my_atoms");
-	//TODO
-	//TODO METIN FIX, coredump on his machine
-	//TODO
-	//TODO
+    copy_host_device (sys->my_atoms, sys->d_my_atoms, sizeof (reax_atom) * sys->N, cudaMemcpyHostToDevice, "system:my_atoms");
+    //TODO
+    //TODO METIN FIX, coredump on his machine
+    //TODO
+    //TODO
 }
 
 void Sync_System (reax_system *sys)
 {
-	//fprintf (stderr, "p:%d - trying to copy atoms : %d \n", sys->my_rank, sys->local_cap);
-	Sync_Atoms (sys);
-
-	copy_host_device (&(sys->my_box), sys->d_my_box, 
-			sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_box");
-
-	copy_host_device (&(sys->my_ext_box), sys->d_my_ext_box, 
-			sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_ext_box");
-
-	copy_host_device (sys->reax_param.sbp, sys->reax_param.d_sbp, 
-			sizeof (single_body_parameters) * sys->reax_param.num_atom_types, cudaMemcpyHostToDevice, "system:sbp");
-	copy_host_device (sys->reax_param.tbp, sys->reax_param.d_tbp, 
-			sizeof (two_body_parameters) * pow (sys->reax_param.num_atom_types, 2), cudaMemcpyHostToDevice, "system:tbp");
-	copy_host_device (sys->reax_param.thbp, sys->reax_param.d_thbp, 
-			sizeof (three_body_header) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:thbh");
-	copy_host_device (sys->reax_param.hbp, sys->reax_param.d_hbp, 
-			sizeof (hbond_parameters) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:hbond");
-	copy_host_device (sys->reax_param.fbp, sys->reax_param.d_fbp, 
-			sizeof (four_body_header) * pow (sys->reax_param.num_atom_types, 4), cudaMemcpyHostToDevice, "system:four_header");
-
-	copy_host_device (sys->reax_param.gp.l, sys->reax_param.d_gp.l, 
-			sizeof (real) * sys->reax_param.gp.n_global, cudaMemcpyHostToDevice, "system:global_parameters");
-
-	sys->reax_param.d_gp.n_global = sys->reax_param.gp.n_global; 
-	sys->reax_param.d_gp.vdw_type = sys->reax_param.gp.vdw_type; 
+    //fprintf (stderr, "p:%d - trying to copy atoms : %d \n", sys->my_rank, sys->local_cap);
+    Sync_Atoms (sys);
+
+    copy_host_device (&(sys->my_box), sys->d_my_box, 
+            sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_box");
+
+    copy_host_device (&(sys->my_ext_box), sys->d_my_ext_box, 
+            sizeof (simulation_box), cudaMemcpyHostToDevice, "system:my_ext_box");
+
+    copy_host_device (sys->reax_param.sbp, sys->reax_param.d_sbp, 
+            sizeof (single_body_parameters) * sys->reax_param.num_atom_types, cudaMemcpyHostToDevice, "system:sbp");
+    copy_host_device (sys->reax_param.tbp, sys->reax_param.d_tbp, 
+            sizeof (two_body_parameters) * pow (sys->reax_param.num_atom_types, 2), cudaMemcpyHostToDevice, "system:tbp");
+    copy_host_device (sys->reax_param.thbp, sys->reax_param.d_thbp, 
+            sizeof (three_body_header) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:thbh");
+    copy_host_device (sys->reax_param.hbp, sys->reax_param.d_hbp, 
+            sizeof (hbond_parameters) * pow (sys->reax_param.num_atom_types, 3), cudaMemcpyHostToDevice, "system:hbond");
+    copy_host_device (sys->reax_param.fbp, sys->reax_param.d_fbp, 
+            sizeof (four_body_header) * pow (sys->reax_param.num_atom_types, 4), cudaMemcpyHostToDevice, "system:four_header");
+
+    copy_host_device (sys->reax_param.gp.l, sys->reax_param.d_gp.l, 
+            sizeof (real) * sys->reax_param.gp.n_global, cudaMemcpyHostToDevice, "system:global_parameters");
+
+    sys->reax_param.d_gp.n_global = sys->reax_param.gp.n_global; 
+    sys->reax_param.d_gp.vdw_type = sys->reax_param.gp.vdw_type; 
 }
 
 void Output_Sync_Atoms (reax_system *sys)
 {
-	//TODO changed this from sys->n to sys->N
-	copy_host_device (sys->my_atoms, sys->d_my_atoms, 
-			sizeof (reax_atom) * sys->total_cap, cudaMemcpyDeviceToHost, "system:my_atoms");
+    //TODO changed this from sys->n to sys->N
+    copy_host_device (sys->my_atoms, sys->d_my_atoms, 
+            sizeof (reax_atom) * sys->total_cap, cudaMemcpyDeviceToHost, "system:my_atoms");
 }
 
 void Output_Sync_Simulation_Data (simulation_data *host, simulation_data *dev)
 {
-	copy_host_device (&host->my_en, &dev->my_en, sizeof (energy_data), 
-			cudaMemcpyDeviceToHost, "simulation_data:energy_data");
-	copy_host_device (&host->kin_press, &dev->kin_press, sizeof (real), 
-			cudaMemcpyDeviceToHost, "simulation_data:kin_press");
-	copy_host_device (host->int_press, dev->int_press, sizeof (rvec), 
-			cudaMemcpyDeviceToHost, "simulation_data:int_press");
-	copy_host_device (host->ext_press, dev->ext_press, sizeof (rvec), 
-			cudaMemcpyDeviceToHost, "simulation_data:ext_press");
+    copy_host_device (&host->my_en, &dev->my_en, sizeof (energy_data), 
+            cudaMemcpyDeviceToHost, "simulation_data:energy_data");
+    copy_host_device (&host->kin_press, &dev->kin_press, sizeof (real), 
+            cudaMemcpyDeviceToHost, "simulation_data:kin_press");
+    copy_host_device (host->int_press, dev->int_press, sizeof (rvec), 
+            cudaMemcpyDeviceToHost, "simulation_data:int_press");
+    copy_host_device (host->ext_press, dev->ext_press, sizeof (rvec), 
+            cudaMemcpyDeviceToHost, "simulation_data:ext_press");
 }
 
 void Sync_Workspace (storage *workspace, enum cudaMemcpyKind dir)
@@ -183,37 +183,37 @@ void Prep_Device_For_Output (reax_system *system, simulation_data *data )
 
 void Output_Sync_Lists (reax_list *host, reax_list *device, int type)
 {
-	//fprintf (stderr, " Trying to copy *%d* list from device to host \n", type);
-
-	//list is already allocated -- discard it first
-	//if (host->n > 0)
-	//if (host->allocated > 0)
-	//  Delete_List (host);
-
-	//memory is allocated on the host
-	//Make_List(device->n, device->num_intrs, type, host);
-
-	//memcpy the entries from device to host
-	copy_host_device (host->index, device->index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync_list:list:index");
-	copy_host_device (host->end_index, device->end_index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync:list:end_index");
-
-	switch (type)
-	{   
-		case TYP_BOND:
-			copy_host_device (host->select.bond_list, device->select.bond_list, 
-					sizeof (bond_data) * device->num_intrs, cudaMemcpyDeviceToHost, "bond_list");
-			break;
-
-		case TYP_THREE_BODY:
-			copy_host_device (host->select.three_body_list, device->select.three_body_list, 
-					sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, "three_body_list");
-			break;
-
-		default:
-			fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type );
-			exit (1);
-			break;
-	}  
+    //fprintf (stderr, " Trying to copy *%d* list from device to host \n", type);
+
+    //list is already allocated -- discard it first
+    //if (host->n > 0)
+    //if (host->allocated > 0)
+    //  Delete_List (host);
+
+    //memory is allocated on the host
+    //Make_List(device->n, device->num_intrs, type, host);
+
+    //memcpy the entries from device to host
+    copy_host_device (host->index, device->index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync_list:list:index");
+    copy_host_device (host->end_index, device->end_index, sizeof (int) * device->n, cudaMemcpyDeviceToHost, "output_sync:list:end_index");
+
+    switch (type)
+    {   
+        case TYP_BOND:
+            copy_host_device (host->select.bond_list, device->select.bond_list, 
+                    sizeof (bond_data) * device->num_intrs, cudaMemcpyDeviceToHost, "bond_list");
+            break;
+
+        case TYP_THREE_BODY:
+            copy_host_device (host->select.three_body_list, device->select.three_body_list, 
+                    sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, "three_body_list");
+            break;
+
+        default:
+            fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type );
+            exit (1);
+            break;
+    }  
 }
 
 //#ifdef __cplusplus
diff --git a/PG-PuReMD/src/cuda_environment.cu b/PG-PuReMD/src/cuda_environment.cu
index 2583a97b..dbbaba9b 100644
--- a/PG-PuReMD/src/cuda_environment.cu
+++ b/PG-PuReMD/src/cuda_environment.cu
@@ -6,41 +6,41 @@
 extern "C" void Setup_Cuda_Environment (int rank, int nprocs, int gpus_per_node)
 {
 
-	int deviceCount = 0;
-	cudaGetDeviceCount (&deviceCount);
-
-	//Calculate the # of GPUs per processor
-	//and assign the GPU for each process
-
-	//hpcc changes
-	//if (gpus_per_node == 2) {
-	cudaSetDevice ( (rank % (deviceCount)) );
-	//cudaSetDevice( 1 );
-	fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
-	//} else {
-	//	cudaSetDevice ( 0 );
-	//}
-
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	// CHANGE ORIGINAL/////////////////////////////
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	//cudaDeviceSetLimit ( cudaLimitStackSize, 8192 );
-	//cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 );
-	//cudaCheckError ();
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
-	///////////////////////////////////////////////
+    int deviceCount = 0;
+    cudaGetDeviceCount (&deviceCount);
+
+    //Calculate the # of GPUs per processor
+    //and assign the GPU for each process
+
+    //hpcc changes
+    //if (gpus_per_node == 2) {
+    cudaSetDevice ( (rank % (deviceCount)) );
+    //cudaSetDevice( 1 );
+    fprintf( stderr, "p:%d is using GPU: %d \n", rank, (rank % deviceCount));
+    //} else {
+    //    cudaSetDevice ( 0 );
+    //}
+
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    // CHANGE ORIGINAL/////////////////////////////
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    //cudaDeviceSetLimit ( cudaLimitStackSize, 8192 );
+    //cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 );
+    //cudaCheckError ();
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
+    ///////////////////////////////////////////////
 
 }
 
 extern "C" void Cleanup_Cuda_Environment ()
 {
-	cudaDeviceReset ();
-	cudaDeviceSynchronize ();
+    cudaDeviceReset ();
+    cudaDeviceSynchronize ();
 }
diff --git a/PG-PuReMD/src/cuda_forces.cu b/PG-PuReMD/src/cuda_forces.cu
index 0e40cc2f..063554bf 100644
--- a/PG-PuReMD/src/cuda_forces.cu
+++ b/PG-PuReMD/src/cuda_forces.cu
@@ -31,1580 +31,1595 @@ extern "C" void Delete_List( reax_list*);
 
 
 CUDA_GLOBAL void ker_estimate_storages (reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		two_body_parameters *tbp,
-		control_params *control,
-		reax_list far_nbrs, 
-		int num_atom_types,
-		int n, int N, 
-		int Hcap, int total_cap,
-		int *Htop, int *num_3body,
-		int *bond_top, int *hb_top
-		)
+        single_body_parameters *sbp, 
+        two_body_parameters *tbp,
+        control_params *control,
+        reax_list far_nbrs, 
+        int num_atom_types,
+        int n, int N, 
+        int Hcap, int total_cap,
+        int *Htop, int *num_3body,
+        int *bond_top, int *hb_top
+        )
 {
-	int i, j, pj; 
-	int start_i, end_i;
-	int type_i, type_j;
-	int ihb, jhb;
-	int local;
-	real cutoff;
-	real r_ij, r2; 
-	real C12, C34, C56;
-	real BO, BO_s, BO_pi, BO_pi2;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	reax_atom *atom_i, *atom_j;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-
-	//Commented in CUDA_KERNEL
-	//for( i = 0; i < N; ++i ) { 
-	atom_i = &(my_atoms[i]);
-	type_i  = atom_i->type;
-	start_i = Dev_Start_Index(i, &far_nbrs);
-	end_i   = Dev_End_Index(i, &far_nbrs);
-	sbp_i = &(sbp[type_i]);
-
-	if( i < n ) { 
-		local = 1;
-		cutoff = control->nonb_cut;
-		//++(*Htop);
-		atomicAdd (Htop, 1);
-		ihb = sbp_i->p_hbond;
-	}   
-	else {
-		local = 0;
-		cutoff = control->bond_cut;
-		ihb = -1; 
-	} 
-
-	for( pj = start_i; pj < end_i; ++pj ) { 
-		nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-		j = nbr_pj->nbr;
-		atom_j = &(my_atoms[j]);
-
-		if (nbr_pj->d <= control->nonb_cut) {
-			type_j = my_atoms[j].type;
-			sbp_j = &(sbp[type_j]);
-			ihb = sbp_i->p_hbond;
-			jhb = sbp_j->p_hbond;
-			if ((control->hbond_cut > 0.1) 
-					&& (nbr_pj->d <= control->hbond_cut) 
-					&& (ihb == 2) 
-					&& (jhb == 1) 
-					&& (j < n)
-					&& (i > n)
-			   )
-				atomicAdd (&hb_top [i], 1);
-
-			if (i >= n) ihb = -1;
-		}
-
-
-
-		if(nbr_pj->d <= cutoff) {
-			type_j = my_atoms[j].type;
-			r_ij = nbr_pj->d;
-			sbp_j = &(sbp[type_j]);
-			twbp = &(tbp[index_tbp (type_i,type_j,num_atom_types)]);
-
-			if( local ) {
-				//if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
-				if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
-					//++(*Htop);
-					atomicAdd (Htop, 1);
-				else if( j < n || atom_i->orig_id > atom_j->orig_id ) //tryQEq ||1
-					//++(*Htop);
-					atomicAdd (Htop, 1);
-
-				if( control->hbond_cut > 0.1 && (ihb==1 || ihb==2) &&
-						nbr_pj->d <= control->hbond_cut 
-				  ) {
-					jhb = sbp_j->p_hbond;
-					if( (ihb == 1) && (jhb == 2))
-						//++hb_top[i];
-						atomicAdd (&hb_top[i], 1);
-					//else if( j < n && ihb == 2 && jhb == 1 )
-					//else if( ihb == 2 && jhb == 1 && j < n)
-					else if( ihb == 2 && jhb == 1 && j < n)
-						//++hb_top[j];
-						atomicAdd (&hb_top[i], 1);
-				}
-			}
-
-			// uncorrected bond orders 
-			if( nbr_pj->d <= control->bond_cut ) {
-				r2 = SQR(r_ij);
-
-				if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-					C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-					BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-				}
-				else BO_s = C12 = 0.0;
-
-				if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-					C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-					BO_pi = EXP( C34 );
-				}
-				else BO_pi = C34 = 0.0;
-
-				if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-					C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
-					BO_pi2= EXP( C56 );
-				}
-				else BO_pi2 = C56 = 0.0;
-
-				// Initially BO values are the uncorrected ones, page 1 
-				BO = BO_s + BO_pi + BO_pi2;
-
-				if( BO >= control->bo_cut ) {
-					//++bond_top[i];
-					//++bond_top[j];
-					atomicAdd (&bond_top [i], 1);
-					//atomicAdd (&bond_top [j], 1);
-				}
-			}
-		}
-	}
-	//} -- Commented in CUDA_KERNEL
+    int i, j, pj; 
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    int local;
+    real cutoff;
+    real r_ij, r2; 
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+
+    //Commented in CUDA_KERNEL
+    //for( i = 0; i < N; ++i ) { 
+    atom_i = &(my_atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Dev_Start_Index(i, &far_nbrs);
+    end_i   = Dev_End_Index(i, &far_nbrs);
+    sbp_i = &(sbp[type_i]);
+
+    if( i < n ) { 
+        local = 1;
+        cutoff = control->nonb_cut;
+        //++(*Htop);
+        atomicAdd (Htop, 1);
+        ihb = sbp_i->p_hbond;
+    }   
+    else {
+        local = 0;
+        cutoff = control->bond_cut;
+        ihb = -1; 
+    } 
+
+    for( pj = start_i; pj < end_i; ++pj ) { 
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(my_atoms[j]);
+
+        if (nbr_pj->d <= control->nonb_cut) {
+            type_j = my_atoms[j].type;
+            sbp_j = &(sbp[type_j]);
+            ihb = sbp_i->p_hbond;
+            jhb = sbp_j->p_hbond;
+            if ((control->hbond_cut > 0.1) 
+                    && (nbr_pj->d <= control->hbond_cut) 
+                    && (ihb == 2) 
+                    && (jhb == 1) 
+                    && (j < n)
+                    && (i > n)
+               )
+                atomicAdd (&hb_top [i], 1);
+
+            if (i >= n) ihb = -1;
+        }
+
+
+
+        if(nbr_pj->d <= cutoff) {
+            type_j = my_atoms[j].type;
+            r_ij = nbr_pj->d;
+            sbp_j = &(sbp[type_j]);
+            twbp = &(tbp[index_tbp (type_i,type_j,num_atom_types)]);
+
+            if( local ) {
+                //if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
+                if( j < n || atom_i->orig_id < atom_j->orig_id ) //tryQEq ||1
+                    //++(*Htop);
+                    atomicAdd (Htop, 1);
+                else if( j < n || atom_i->orig_id > atom_j->orig_id ) //tryQEq ||1
+                    //++(*Htop);
+                    atomicAdd (Htop, 1);
+
+                if( control->hbond_cut > 0.1 && (ihb==1 || ihb==2) &&
+                        nbr_pj->d <= control->hbond_cut 
+                  ) {
+                    jhb = sbp_j->p_hbond;
+                    if( (ihb == 1) && (jhb == 2))
+                        //++hb_top[i];
+                        atomicAdd (&hb_top[i], 1);
+                    //else if( j < n && ihb == 2 && jhb == 1 )
+                    //else if( ihb == 2 && jhb == 1 && j < n)
+                    else if( ihb == 2 && jhb == 1 && j < n)
+                        //++hb_top[j];
+                        atomicAdd (&hb_top[i], 1);
+                }
+            }
+
+            // uncorrected bond orders 
+            if( nbr_pj->d <= control->bond_cut ) {
+                r2 = SQR(r_ij);
+
+                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else BO_s = C12 = 0.0;
+
+                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else BO_pi = C34 = 0.0;
+
+                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );
+                    BO_pi2= EXP( C56 );
+                }
+                else BO_pi2 = C56 = 0.0;
+
+                // Initially BO values are the uncorrected ones, page 1 
+                BO = BO_s + BO_pi + BO_pi2;
+
+                if( BO >= control->bo_cut ) {
+                    //++bond_top[i];
+                    //++bond_top[j];
+                    atomicAdd (&bond_top [i], 1);
+                    //atomicAdd (&bond_top [j], 1);
+                }
+            }
+        }
+    }
+    //} -- Commented in CUDA_KERNEL
 }
 
+
 CUDA_GLOBAL void ker_init_system_atoms(reax_atom *my_atoms, int N, 
-		int *hb_top, int *bond_top)
+        int *hb_top, int *bond_top)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
 
-	my_atoms[i].num_bonds = bond_top [i];
-	my_atoms[i].num_hbonds = hb_top [i];
+    my_atoms[i].num_bonds = bond_top [i];
+    my_atoms[i].num_hbonds = hb_top [i];
 }
 
+
 void Cuda_Estimate_Storages(reax_system *system, control_params *control, 
-		reax_list **lists, int local_cap, int total_cap,
-		int *Htop, int *hb_top, 
-		int *bond_top, int *num_3body)
+        reax_list **lists, int local_cap, int total_cap,
+        int *Htop, int *hb_top, 
+        int *bond_top, int *num_3body)
 {
-	int blocks = 0;
-	int *l_Htop, *l_hb_top, *l_bond_top, *l_num_3body;
-	int *tmp = (int *)scratch;
-
-	*Htop = 0;
-	//memset( hb_top, 0, sizeof(int) * local_cap);
-	memset( hb_top, 0, sizeof(int) * total_cap);
-	memset( bond_top, 0, sizeof(int) * total_cap );
-	*num_3body = 0;
-
-	//cuda_memset (tmp, 0, 1 + 1 + sizeof (int) * (local_cap+ total_cap), "Cuda_Estimate_Storages");
-	cuda_memset (tmp, 0, sizeof (int) * (1 + 1 + total_cap+ total_cap), "Cuda_Estimate_Storages");
-
-	l_Htop = tmp; 
-	l_num_3body = l_Htop + 1;
-	l_hb_top = l_num_3body + 1;
-	//l_bond_top = l_hb_top + local_cap;
-	l_bond_top = l_hb_top + total_cap;
-
-	blocks = system->N / ST_BLOCK_SIZE + 
-		((system->N % ST_BLOCK_SIZE == 0) ? 0 : 1);
-
-	ker_estimate_storages <<< blocks, ST_BLOCK_SIZE>>>
-		(system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
-		 (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types,
-		 system->n, system->N, system->Hcap, system->total_cap, 
-		 l_Htop, l_num_3body, l_bond_top, l_hb_top );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device( Htop, l_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop");
-	copy_host_device( num_3body, l_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body");
-	//copy_host_device( hb_top, l_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top");
-	copy_host_device( hb_top, l_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top");
-	copy_host_device( bond_top, l_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top");
-
-
-	int bond_count = 0;
-	int hbond_count = 0;
-	int max_bonds = 0, min_bonds = 999999;
-	int max_hbonds = 0, min_hbonds = 999999;
-
-	for (int i = 0; i < system->N; i++) {
-		if (bond_top[i] >= max_bonds) max_bonds = bond_top[i];
-		if (bond_top[i] <= min_bonds) min_bonds = bond_top[i];
-		bond_count += bond_top[i];
-	}
-	system->max_bonds = max_bonds * SAFER_ZONE;
-	//for (int i = 0; i < system->n; i++)
-	for (int i = 0; i < system->N; i++){
-		if (hb_top[i] >= max_hbonds) max_hbonds = hb_top[i];
-		if (hb_top[i] <= min_hbonds) min_hbonds = hb_top[i];
-		hbond_count += hb_top [i];
-	}
-	system->max_hbonds = max_hbonds * SAFER_ZONE;
-	//fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count);
-	//fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count);
-	//fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop);
-	fprintf (stderr, "p:%d --> Bonds(%d, %d) HBonds (%d, %d) *******\n", 
-			system->my_rank, min_bonds, max_bonds, min_hbonds, max_hbonds);
-
-	ker_init_system_atoms <<<blocks, ST_BLOCK_SIZE>>>
-		(system->d_my_atoms, system->N, l_hb_top, l_bond_top );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks = 0;
+    int *l_Htop, *l_hb_top, *l_bond_top, *l_num_3body;
+    int *tmp = (int *)scratch;
+
+    *Htop = 0;
+    //memset( hb_top, 0, sizeof(int) * local_cap);
+    memset( hb_top, 0, sizeof(int) * total_cap);
+    memset( bond_top, 0, sizeof(int) * total_cap );
+    *num_3body = 0;
+
+    //cuda_memset (tmp, 0, 1 + 1 + sizeof (int) * (local_cap+ total_cap), "Cuda_Estimate_Storages");
+    cuda_memset (tmp, 0, sizeof (int) * (1 + 1 + total_cap+ total_cap), "Cuda_Estimate_Storages");
+
+    l_Htop = tmp; 
+    l_num_3body = l_Htop + 1;
+    l_hb_top = l_num_3body + 1;
+    //l_bond_top = l_hb_top + local_cap;
+    l_bond_top = l_hb_top + total_cap;
+
+    blocks = system->N / ST_BLOCK_SIZE + 
+        ((system->N % ST_BLOCK_SIZE == 0) ? 0 : 1);
+
+    ker_estimate_storages <<< blocks, ST_BLOCK_SIZE>>>
+        (system->d_my_atoms, system->reax_param.d_sbp, system->reax_param.d_tbp, 
+         (control_params *)control->d_control_params, *(*dev_lists + FAR_NBRS), system->reax_param.num_atom_types,
+         system->n, system->N, system->Hcap, system->total_cap, 
+         l_Htop, l_num_3body, l_bond_top, l_hb_top );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device( Htop, l_Htop, sizeof (int), cudaMemcpyDeviceToHost, "Htop");
+    copy_host_device( num_3body, l_num_3body, sizeof (int), cudaMemcpyDeviceToHost, "num_3body");
+    //copy_host_device( hb_top, l_hb_top, sizeof (int) * local_cap, cudaMemcpyDeviceToHost, "hb_top");
+    copy_host_device( hb_top, l_hb_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "hb_top");
+    copy_host_device( bond_top, l_bond_top, sizeof (int) * total_cap, cudaMemcpyDeviceToHost, "bond_top");
+
+
+    int bond_count = 0;
+    int hbond_count = 0;
+    int max_bonds = 0, min_bonds = 999999;
+    int max_hbonds = 0, min_hbonds = 999999;
+
+    for (int i = 0; i < system->N; i++) {
+        if (bond_top[i] >= max_bonds) max_bonds = bond_top[i];
+        if (bond_top[i] <= min_bonds) min_bonds = bond_top[i];
+        bond_count += bond_top[i];
+    }
+    system->max_bonds = max_bonds * SAFER_ZONE;
+    //for (int i = 0; i < system->n; i++)
+    for (int i = 0; i < system->N; i++){
+        if (hb_top[i] >= max_hbonds) max_hbonds = hb_top[i];
+        if (hb_top[i] <= min_hbonds) min_hbonds = hb_top[i];
+        hbond_count += hb_top [i];
+    }
+    system->max_hbonds = max_hbonds * SAFER_ZONE;
+    //fprintf (stderr, " TOTAL DEVICE BOND COUNT: %d \n", bond_count);
+    //fprintf (stderr, " TOTAL DEVICE HBOND COUNT: %d \n", hbond_count);
+    //fprintf (stderr, " TOTAL DEVICE SPARSE COUNT: %d \n", *Htop);
+    fprintf (stderr, "p:%d --> Bonds(%d, %d) HBonds (%d, %d) *******\n", 
+            system->my_rank, min_bonds, max_bonds, min_hbonds, max_hbonds);
+
+    ker_init_system_atoms <<<blocks, ST_BLOCK_SIZE>>>
+        (system->d_my_atoms, system->N, l_hb_top, l_bond_top );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
+
 CUDA_DEVICE real Compute_H( real r, real gamma, real *ctap )
 {
-	real taper, dr3gamij_1, dr3gamij_3;
-
-	taper = ctap[7] * r + ctap[6];
-	taper = taper * r + ctap[5];
-	taper = taper * r + ctap[4];
-	taper = taper * r + ctap[3];
-	taper = taper * r + ctap[2];
-	taper = taper * r + ctap[1];
-	taper = taper * r + ctap[0];    
-
-	dr3gamij_1 = ( r*r*r + gamma );
-	dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-	return taper * EV_to_KCALpMOL / dr3gamij_3;
+    real taper, dr3gamij_1, dr3gamij_3;
+
+    taper = ctap[7] * r + ctap[6];
+    taper = taper * r + ctap[5];
+    taper = taper * r + ctap[4];
+    taper = taper * r + ctap[3];
+    taper = taper * r + ctap[2];
+    taper = taper * r + ctap[1];
+    taper = taper * r + ctap[0];    
+
+    dr3gamij_1 = ( r*r*r + gamma );
+    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+    return taper * EV_to_KCALpMOL / dr3gamij_3;
 }
 
 
 CUDA_DEVICE real Compute_tabH( LR_lookup_table *t_LR, real r_ij, int ti, int tj, int num_atom_types )
 {
-	int r, tmin, tmax;
-	real val, dif, base;
-	LR_lookup_table *t; 
-
-	tmin  = MIN( ti, tj );
-	tmax  = MAX( ti, tj );
-	t = &( t_LR[index_lr (tmin,tmax, num_atom_types)] );    
-
-	/* cubic spline interpolation */
-	r = (int)(r_ij * t->inv_dx);
-	if( r == 0 )  ++r;
-	base = (real)(r+1) * t->dx;
-	dif = r_ij - base;
-	val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-		t->ele[r].a;
-	val *= EV_to_KCALpMOL / C_ele;
-
-	return val;
+    int r, tmin, tmax;
+    real val, dif, base;
+    LR_lookup_table *t; 
+
+    tmin  = MIN( ti, tj );
+    tmax  = MAX( ti, tj );
+    t = &( t_LR[index_lr (tmin,tmax, num_atom_types)] );    
+
+    /* cubic spline interpolation */
+    r = (int)(r_ij * t->inv_dx);
+    if( r == 0 )  ++r;
+    base = (real)(r+1) * t->dx;
+    dif = r_ij - base;
+    val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+        t->ele[r].a;
+    val *= EV_to_KCALpMOL / C_ele;
+
+    return val;
 }
 
+
 CUDA_GLOBAL void ker_estimate_sparse_matrix (reax_atom *my_atoms, control_params *control, 
-		reax_list p_far_nbrs, int n, int N, int renbr, int *indices)
+        reax_list p_far_nbrs, int n, int N, int renbr, int *indices)
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int flag;
+    real cutoff;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    reax_list *far_nbrs = &( p_far_nbrs );
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    atom_i = &(my_atoms[i]);
+    start_i = Dev_Start_Index(i, far_nbrs);
+    end_i   = Dev_End_Index(i, far_nbrs);
+
+    cutoff = control->nonb_cut;
+
+    //++Htop;
+    if ( i < n) 
+        indices [i] ++;
+
+    /* update i-j distance - check if j is within cutoff */
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(my_atoms[j]);
+        if( renbr ) {
+            if(nbr_pj->d <= cutoff)
+                flag = 1;
+            else flag = 0;
+        }
+        else {
+            if (i < j) {
+                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
+                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
+                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
+            } else {
+                nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
+                nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
+                nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
+            }
+            nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
+            //TODO
+            //TODO
+            //TODO
+            //if( nbr_pj->d <= (cutoff) ) {
+            if( nbr_pj->d <= SQR(cutoff) )
+            {
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+            else
+            {
+                flag = 0;
+            }
+        }
+
+        if( flag )
+        {
+            /* H matrix entry */
+            //if( j < n || atom_i->orig_id < atom_j->orig_id )
+            //++Htop;
+            //    indices [i] ++;
+            //else if (j < n || atom_i->orig_id > atom_j->orig_id )
+            //    indices [i] ++;
+
+            //if ((i < n) || (j < n))
+            //    indices [i] ++;
+            //if ((i < n) && (i < j) && ((j < n) || atom_i->orig_id < atom_j->orig_id))
+            //    indices [i] ++;
+            //if ( i >= n && j < n && atom_i->orig_id > atom_j->orig_id)
+            //    indices [i] ++;
+            //else if ((i >=n) && (i > j) && ((j < n) || (atom_i->orig_id > atom_j->orig_id)))
+            //    indices [i] ++;
+            //THIS IS THE HOST CONDITION
+            //if (i < n && i < j && ( j < n || atom_i->orig_id < atom_j->orig_id ))
+            //if (i < n && i < j && atom_i->orig_id < atom_j->orig_id && j >=n)
+            //    indices [i] ++;
+            //THIS IS THE DEVICE CONDITION
+            //if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
+            //    indices [i] ++;
+
+            //this is the working condition
+            if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
+                indices [i]++;
+            else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
+                indices [i] ++;
+            else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
+                indices [i] ++;
+        }
+    }
+}
+
+
+/* Host driver: runs ker_estimate_sparse_matrix to get a per-atom entry count,
+ * then stores (largest count * SAFE_ZONE) in system->max_sparse_entries. */
+int Cuda_Estimate_Sparse_Matrix (reax_system *system, control_params *control,
+        simulation_data *data, reax_list **lists)
+{
+    int *d_counts = (int *) scratch;
+    int *h_counts = (int *) host_scratch;
+    int num_blocks, atom, renbr;
+    int largest = 0;
+    int total_sparse = 0;
+
+    cuda_memset (d_counts, 0, sizeof (int) * system->N, "sp_matrix:indices");
+
+    /* one thread per atom, rounding the block count up */
+    num_blocks = system->N / DEF_BLOCK_SIZE;
+    if (system->N % DEF_BLOCK_SIZE != 0)
+        ++num_blocks;
+
+    /* non-zero on steps where (step - prev_steps) is a multiple of reneighbor */
+    renbr = ((data->step - data->prev_steps) % control->reneighbor) == 0;
+
+    ker_estimate_sparse_matrix  <<< num_blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, (control_params *)control->d_control_params,
+         *(*dev_lists + FAR_NBRS), system->n, system->N, renbr, d_counts);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (h_counts, d_counts, sizeof (int) * system->N,
+            cudaMemcpyDeviceToHost, "sp_matrix:indices");
+
+    for (atom = 0; atom < system->N; atom++) {
+        total_sparse += h_counts[atom];
+        if (largest < h_counts[atom])
+            largest = h_counts[atom];
+    }
+    system->max_sparse_entries = largest * SAFE_ZONE;
+
+    return SUCCESS;
+}
+
+
+CUDA_GLOBAL void ker_init_forces (reax_atom *my_atoms, single_body_parameters *sbp, 
+        two_body_parameters *tbp, storage workspace, 
+        control_params *control, 
+        reax_list far_nbrs, reax_list bonds, reax_list hbonds, 
+        LR_lookup_table *t_LR, 
+        int n, int N, int num_atom_types, 
+        int max_sparse_entries, int renbr, 
+        int max_bonds, int max_hbonds)
+{
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop;
+    int btop_i, ihb, jhb, ihb_top;
+    //int btop_j, jhb, jhb_top;
+    int local, flag, flag2, flag3;
+    real r_ij, cutoff;
+    //reax_list *far_nbrs, *bonds, *hbonds;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    sparse_matrix *H = &(workspace.H);
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    Htop = i * max_sparse_entries;
+    btop_i = 0;
+
+    //Commented for CUDA KERNEL
+    //for( i = 0; i < system->N; ++i ) {
+    atom_i = &(my_atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Dev_Start_Index(i, &far_nbrs);
+    end_i   = Dev_End_Index(i, &far_nbrs);
+    //CHANGE ORIGINAL
+    //btop_i = Dev_Start_Index( i, &bonds );
+    btop_i = i * max_bonds;
+    Dev_Set_Start_Index (i, btop_i, &bonds);
+    //CHANGE ORIGINAL
+
+    sbp_i = &(sbp[type_i]);
+
+    if( i < n ) {
+        local = 1;
+        cutoff = control->nonb_cut;
+
+        //update bond mark here
+        workspace.bond_mark [i] = 0;
+
+    }
+    else {
+        local = 0;
+        cutoff = control->bond_cut;
+
+        //update bond mark here
+        workspace.bond_mark [i] = 1000;
+    }
+
+    ihb = -1;
+    ihb_top = -1;
+    //CHANGE ORIGINAL
+    H->start[i] = Htop;
+
+    if( local ) {
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = sbp_i->eta;
+        ++Htop;
+    }
+    //CHANGE ORIGINAL
+
+    if( control->hbond_cut > 0 ) {
+        ihb = sbp_i->p_hbond;
+        //CHANGE ORIGINAL
+        if( ihb == 1  || ihb == 2) {
+            //CHANGE ORIGINAL
+            //ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds );
+            ihb_top = i * max_hbonds;
+            Dev_Set_Start_Index (atom_i->Hindex, ihb_top, &hbonds );
+        }
+        else ihb_top = -1;
+    }
+
+    /* update i-j distance - check if j is within cutoff */
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(my_atoms[j]);
+        if( renbr ) {
+            if(nbr_pj->d <= cutoff)
+                flag = 1;
+            else flag = 0;
+
+            if(nbr_pj->d <= control->nonb_cut)
+                flag2 = 1;
+            else flag2 = 0;
+
+        }
+        else{
+            if (i < j) {
+                nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
+                nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
+                nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
+                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
+            } else {
+                nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
+                nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
+                nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
+                nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
+            }
+
+            if(nbr_pj->d <= SQR (control->nonb_cut))
+                flag2 = 1;
+            else flag2 = 0;
+
+            //if( nbr_pj->d <= SQR(cutoff) ) {
+            if( nbr_pj->d <= SQR(control->nonb_cut) ) {
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+            else {
+                flag = 0;
+            }
+        }
+        if (flag2) {
+            ihb = sbp_i->p_hbond;
+            type_j = atom_j->type;
+            sbp_j = &(sbp[type_j]);
+            jhb = sbp_j->p_hbond;
+            if( control->hbond_cut > 0 
+                    && nbr_pj->d <= control->hbond_cut
+                    && (ihb == 2)
+                    && (jhb == 1)
+                    && (i >= n)
+                    && (j < n)
+              ) 
+            {
+                hbonds.select.hbond_list[ihb_top].nbr = j;
+                hbonds.select.hbond_list[ihb_top].scl = -1;
+                hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                //CUDA SPECIFIC
+                hbonds.select.hbond_list[ihb_top].sym_index = -1;
+                rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
+
+                ++ihb_top;
+            }
+
+            //if ((i < n) || (j < n))
+            //if (local || ((i >= n) &&(j < n)))
+
+            flag3 = false;
+            if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
+                flag3 = true;
+            else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
+                flag3 = true;
+            else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
+                flag3 = true;
+
+            if (flag3)
+            {
+                twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]);
+                r_ij = nbr_pj->d;
+
+                //if (renbr) {
+                H->entries[Htop].j = j;
+                if( control->tabulate == 0 )
+                    H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
+                else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
+                //}
+                ++Htop;
+            }
+        }
+
+        if( flag ){
+            type_j = atom_j->type;
+            r_ij = nbr_pj->d;
+            sbp_j = &(sbp[type_j]);
+            twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]);
+
+            if( local ) {
+                /* H matrix entry */
+                /*
+                   if( j < n || atom_i->orig_id < atom_j->orig_id ) {//tryQEq||1
+                   H->entries[Htop].j = j;
+                   if( control->tabulate == 0 )
+                   H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
+                   else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
+                   ++Htop;
+                   } 
+                   else if( j < n || atom_i->orig_id > atom_j->orig_id ) {//tryQEq||1
+                   H->entries[Htop].j = j;
+                   if( control->tabulate == 0 )
+                   H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
+                   else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
+                   ++Htop;
+                   } 
+                 */
+
+                //bool condition = !((i >= n) && (j >= n));
+                /* hydrogen bond lists */
+                if( control->hbond_cut > 0 && (ihb==1 || ihb==2) &&
+                        nbr_pj->d <= control->hbond_cut // && i < j
+                  ) {
+                    jhb = sbp_j->p_hbond;
+                    if( ihb == 1 && jhb == 2 ) {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        if (i < j) 
+                            hbonds.select.hbond_list[ihb_top].scl = 1;
+                        else
+                            hbonds.select.hbond_list[ihb_top].scl = -1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                        //CUDA SPECIFIC
+                        hbonds.select.hbond_list[ihb_top].sym_index = -1;
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
+
+
+                        ++ihb_top;
+                    }
+                    //else if( j < n && ihb == 2 && jhb == 1 ) 
+                    else if( ihb == 2 && jhb == 1 && j < n) {
+                        //jhb_top = End_Index( atom_j->Hindex, hbonds );
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = -1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                        //CUDA SPECIFIC
+                        hbonds.select.hbond_list[ihb_top].sym_index = -1;
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
+
+                        ++ihb_top;
+
+                        //Set_End_Index( atom_j->Hindex, jhb_top+1, hbonds );
+                        //++num_hbonds;
+                    }
+                }
+            }
+
+
+
+            /* uncorrected bond orders */
+            if( nbr_pj->d <= control->bond_cut 
+                    && Dev_BOp( bonds, control->bo_cut, 
+                        i , btop_i, nbr_pj, sbp_i, sbp_j, twbp, 
+                        workspace.dDeltap_self, workspace.total_bond_order) 
+              ) {
+                //num_bonds += 2;
+                ++btop_i;
+
+                /* Need to do later... since i and j are parallel
+                   if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
+                   workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
+                   else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) {
+                   workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
+                   }
+                 */
+            }
+        }
+        }
+
+        Dev_Set_End_Index( i, btop_i, &bonds );
+        //    if( local ) {
+        H->end[i] = Htop;
+        //   }
+        //CHANGE ORIGINAL
+        if(( ihb == 1 || ihb == 2 ) && (ihb_top > 0) && (control->hbond_cut > 0))
+            Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds );
+        //} Commented for cuda kernel
+}
+
+
+/* Sets bond_mark[offset] .. bond_mark[offset + n - 1] to 1000, one entry per thread. */
+CUDA_GLOBAL void ker_init_bond_mark (int offset, int n, int *bond_mark)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    /* use the global index i: threadIdx.x alone repeats in every block */
+    bond_mark [offset + i] = 1000;
+}
+
+
+/* Pairs the two directed entries of each bond. One thread per atom i walks
+ * i's bond list and, for every entry, scans the neighbor's list for the entry
+ * that points back at i; when i > nbr it writes dbond_index and sym_index on
+ * both entries. */
+CUDA_GLOBAL void New_fix_sym_dbond_indices (reax_list pbonds, int N)
+{
+    reax_list *bonds = &pbonds;
+    bond_data *mine, *theirs;
+    int i, nbr, pi, pk;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    for (pi = Dev_Start_Index (i, bonds); pi < Dev_End_Index (i, bonds); pi++)
+    {
+        mine = &( bonds->select.bond_list [pi] );
+        nbr = mine->nbr;
+
+        for (pk = Dev_Start_Index (nbr, bonds); pk < Dev_End_Index (nbr, bonds); pk++)
+        {
+            theirs = &( bonds->select.bond_list[ pk ] );
+
+            /* only the endpoint with the larger atom index performs the writes */
+            if (theirs->nbr == i && i > nbr)
+            {
+                mine->dbond_index = pi;
+                theirs->dbond_index = pi;
+
+                mine->sym_index = pk;
+                theirs->sym_index = pi;
+            }
+        }
+    }
+}
+
+
+CUDA_GLOBAL void New_fix_sym_hbond_indices (reax_atom *my_atoms, reax_list hbonds, int N )
+{
+
+    hbond_data *ihbond, *jhbond;
+
+    int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id = thread_id / __THREADS_PER_ATOM__;
+    int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
+    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
+
+    if (warp_id > N) return;
+
+    int i = warp_id;
+    int nbr;
+    int k;
+    int start = Dev_Start_Index (my_atoms[i].Hindex, &hbonds);
+    int end = Dev_End_Index (my_atoms[i].Hindex, &hbonds);
+    int j = start + lane_id;
+    while (j < end)
+    {
+        ihbond = &( hbonds.select.hbond_list [j] );
+        nbr = ihbond->nbr;
+
+        int nbrstart = Dev_Start_Index (my_atoms[nbr].Hindex, &hbonds);
+        int nbrend = Dev_End_Index (my_atoms[nbr].Hindex, &hbonds);
+
+        for (k = nbrstart; k < nbrend; k++)
+        {
+            jhbond = &( hbonds.select.hbond_list [k] );
+
+            if (jhbond->nbr == i){
+                ihbond->sym_index = k;
+                jhbond->sym_index = j;
+                break;
+            }
+        }
+
+        j += __THREADS_PER_ATOM__;
+    }
+}
+
+
+////////////////////////
+// HBOND ISSUE
+CUDA_GLOBAL void ker_update_bonds (reax_atom *my_atoms, 
+        reax_list bonds, 
+        int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    my_atoms [i].num_bonds = 
+        MAX(Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS);
+}
+
+
+CUDA_GLOBAL void ker_update_hbonds (reax_atom *my_atoms, 
+        reax_list hbonds,
+        int n)
 {
-	int i, j, pj;
-	int start_i, end_i;
-	int flag;
-	real cutoff;
-	far_neighbor_data *nbr_pj;
-	reax_atom *atom_i, *atom_j;
-	reax_list *far_nbrs = &( p_far_nbrs );
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	atom_i = &(my_atoms[i]);
-	start_i = Dev_Start_Index(i, far_nbrs);
-	end_i   = Dev_End_Index(i, far_nbrs);
-
-	cutoff = control->nonb_cut;
-
-	//++Htop;
-	if ( i < n) 
-		indices [i] ++;
-
-	/* update i-j distance - check if j is within cutoff */
-	for( pj = start_i; pj < end_i; ++pj ) {
-		nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-		j = nbr_pj->nbr;
-		atom_j = &(my_atoms[j]);
-		if( renbr ) {
-			if(nbr_pj->d <= cutoff)
-				flag = 1;
-			else flag = 0;
-		}
-		else {
-			if (i < j) {
-				nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-				nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-				nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-			} else {
-				nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
-				nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
-				nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
-			}
-			nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-			//TODO
-			//TODO
-			//TODO
-			//if( nbr_pj->d <= (cutoff) ) {
-			if( nbr_pj->d <= SQR(cutoff) ) {
-				nbr_pj->d = sqrt(nbr_pj->d);
-				flag = 1;
-			}
-			else {
-				flag = 0;
-			}
-		}
-
-		if( flag ){
-			/* H matrix entry */
-			//if( j < n || atom_i->orig_id < atom_j->orig_id )
-			//++Htop;
-			//	indices [i] ++;
-			//else if (j < n || atom_i->orig_id > atom_j->orig_id )
-			//	indices [i] ++;
-
-			//if ((i < n) || (j < n))
-			//	indices [i] ++;
-			//if ((i < n) && (i < j) && ((j < n) || atom_i->orig_id < atom_j->orig_id))
-			//	indices [i] ++;
-			//if ( i >= n && j < n && atom_i->orig_id > atom_j->orig_id)
-			//	indices [i] ++;
-			//else if ((i >=n) && (i > j) && ((j < n) || (atom_i->orig_id > atom_j->orig_id)))
-			//	indices [i] ++;
-			//THIS IS THE HOST CONDITION
-			//if (i < n && i < j && ( j < n || atom_i->orig_id < atom_j->orig_id ))
-			//if (i < n && i < j && atom_i->orig_id < atom_j->orig_id && j >=n)
-			//	indices [i] ++;
-			//THIS IS THE DEVICE CONDITION
-			//if ( i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
-			//	indices [i] ++;
-
-			//this is the working condition
-			if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
-				indices [i]++;
-			else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
-				indices [i] ++;
-			else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
-				indices [i] ++;
-		}
-		}
-	}
-
-	int Cuda_Estimate_Sparse_Matrix (reax_system *system, control_params *control, 
-			simulation_data *data, reax_list **lists)
-	{
-		int blocks, max_sp_entries;
-		int *indices = (int *) scratch;
-		int *h_indices = (int *) host_scratch;
-		int total_sparse = 0;
-
-		cuda_memset (indices, 0, sizeof (int) * system->N, "sp_matrix:indices");
-
-		blocks = system->N / DEF_BLOCK_SIZE + 
-			((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-		//TODO
-		//TODO
-		//TODO
-		//TODO
-		ker_estimate_sparse_matrix  <<< blocks, DEF_BLOCK_SIZE >>>
-			(system->d_my_atoms, (control_params *)control->d_control_params, 
-			 *(*dev_lists + FAR_NBRS), system->n, system->N, 
-			 (((data->step-data->prev_steps) % control->reneighbor) == 0), indices);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		copy_host_device (h_indices, indices, sizeof (int) * system->N, 
-				cudaMemcpyDeviceToHost, "sp_matrix:indices");
-		max_sp_entries = 0;	
-		for (int i = 0; i < system->N; i++){
-			total_sparse += h_indices [i];
-			if (max_sp_entries < h_indices[i])
-				max_sp_entries = h_indices[i];
-		}
-
-		//fprintf (stderr, " TOTAL DEVICE SPARSE ENTRIES: %d \n", total_sparse );
-		//fprintf (stderr, "p%d: Max sparse entries -> %d \n", system->my_rank, max_sp_entries );
-		system->max_sparse_entries = max_sp_entries * SAFE_ZONE;
-
-		return SUCCESS;
-	}
-
-
-	CUDA_GLOBAL void ker_init_forces (reax_atom *my_atoms, single_body_parameters *sbp, 
-			two_body_parameters *tbp, storage workspace, 
-			control_params *control, 
-			reax_list far_nbrs, reax_list bonds, reax_list hbonds, 
-			LR_lookup_table *t_LR, 
-			int n, int N, int num_atom_types, 
-			int max_sparse_entries, int renbr, 
-			int max_bonds, int max_hbonds)
-	{
-		int i, j, pj;
-		int start_i, end_i;
-		int type_i, type_j;
-		int Htop;
-		int btop_i, ihb, jhb, ihb_top;
-		//int btop_j, jhb, jhb_top;
-		int local, flag, flag2, flag3;
-		real r_ij, cutoff;
-		//reax_list *far_nbrs, *bonds, *hbonds;
-		single_body_parameters *sbp_i, *sbp_j;
-		two_body_parameters *twbp;
-		far_neighbor_data *nbr_pj;
-		reax_atom *atom_i, *atom_j;
-		sparse_matrix *H = &(workspace.H);
-
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		Htop = i * max_sparse_entries;
-		btop_i = 0;
-
-		//Commented for CUDA KERNEL
-		//for( i = 0; i < system->N; ++i ) {
-		atom_i = &(my_atoms[i]);
-		type_i  = atom_i->type;
-		start_i = Dev_Start_Index(i, &far_nbrs);
-		end_i   = Dev_End_Index(i, &far_nbrs);
-		//CHANGE ORIGINAL
-		//btop_i = Dev_Start_Index( i, &bonds );
-		btop_i = i * max_bonds;
-		Dev_Set_Start_Index (i, btop_i, &bonds);
-		//CHANGE ORIGINAL
-
-		sbp_i = &(sbp[type_i]);
-
-		if( i < n ) {
-			local = 1;
-			cutoff = control->nonb_cut;
-
-			//update bond mark here
-			workspace.bond_mark [i] = 0;
-
-		}
-		else {
-			local = 0;
-			cutoff = control->bond_cut;
-
-			//update bond mark here
-			workspace.bond_mark [i] = 1000;
-		}
-
-		ihb = -1;
-		ihb_top = -1;
-		//CHANGE ORIGINAL
-		H->start[i] = Htop;
-
-		if( local ) {
-			H->entries[Htop].j = i;
-			H->entries[Htop].val = sbp_i->eta;
-			++Htop;
-		}
-		//CHANGE ORIGINAL
-
-		if( control->hbond_cut > 0 ) {
-			ihb = sbp_i->p_hbond;
-			//CHANGE ORIGINAL
-			if( ihb == 1  || ihb == 2) {
-				//CHANGE ORIGINAL
-				//ihb_top = Dev_Start_Index( atom_i->Hindex, &hbonds );
-				ihb_top = i * max_hbonds;
-				Dev_Set_Start_Index (atom_i->Hindex, ihb_top, &hbonds );
-			}
-			else ihb_top = -1;
-		}
-
-		/* update i-j distance - check if j is within cutoff */
-		for( pj = start_i; pj < end_i; ++pj ) {
-			nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
-			atom_j = &(my_atoms[j]);
-			if( renbr ) {
-				if(nbr_pj->d <= cutoff)
-					flag = 1;
-				else flag = 0;
-
-				if(nbr_pj->d <= control->nonb_cut)
-					flag2 = 1;
-				else flag2 = 0;
-
-			}
-			else{
-				if (i < j) {
-					nbr_pj->dvec[0] = atom_j->x[0] - atom_i->x[0];
-					nbr_pj->dvec[1] = atom_j->x[1] - atom_i->x[1];
-					nbr_pj->dvec[2] = atom_j->x[2] - atom_i->x[2];
-					nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-				} else {
-					nbr_pj->dvec[0] = atom_i->x[0] - atom_j->x[0];
-					nbr_pj->dvec[1] = atom_i->x[1] - atom_j->x[1];
-					nbr_pj->dvec[2] = atom_i->x[2] - atom_j->x[2];
-					nbr_pj->d = rvec_Norm_Sqr( nbr_pj->dvec );
-				}
-
-				if(nbr_pj->d <= SQR (control->nonb_cut))
-					flag2 = 1;
-				else flag2 = 0;
-
-				//if( nbr_pj->d <= SQR(cutoff) ) {
-				if( nbr_pj->d <= SQR(control->nonb_cut) ) {
-					nbr_pj->d = sqrt(nbr_pj->d);
-					flag = 1;
-				}
-				else {
-					flag = 0;
-				}
-			}
-			if (flag2) {
-				ihb = sbp_i->p_hbond;
-				type_j = atom_j->type;
-				sbp_j = &(sbp[type_j]);
-				jhb = sbp_j->p_hbond;
-				if( control->hbond_cut > 0 
-						&& nbr_pj->d <= control->hbond_cut
-						&& (ihb == 2)
-						&& (jhb == 1)
-						&& (i >= n)
-						&& (j < n)
-				  ) 
-				{
-					hbonds.select.hbond_list[ihb_top].nbr = j;
-					hbonds.select.hbond_list[ihb_top].scl = -1;
-					hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-					//CUDA SPECIFIC
-					hbonds.select.hbond_list[ihb_top].sym_index = -1;
-					rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
-
-					++ihb_top;
-				}
-
-				//if ((i < n) || (j < n))
-				//if (local || ((i >= n) &&(j < n)))
-
-				flag3 = false;
-				if (i < j && i < n && ( j < n || atom_i->orig_id < atom_j->orig_id))
-					flag3 = true;
-				else if (i > j && i >= n && j < n && atom_j->orig_id < atom_i->orig_id)
-					flag3 = true;
-				else if (i > j && i < n && ( j < n || atom_j->orig_id < atom_i->orig_id ))
-					flag3 = true;
-
-				if (flag3)
-				{
-					twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]);
-					r_ij = nbr_pj->d;
-
-					//if (renbr) {
-					H->entries[Htop].j = j;
-					if( control->tabulate == 0 )
-						H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
-					else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
-					//}
-					++Htop;
-				}
-			}
-
-			if( flag ){
-				type_j = atom_j->type;
-				r_ij = nbr_pj->d;
-				sbp_j = &(sbp[type_j]);
-				twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types)]);
-
-				if( local ) {
-					/* H matrix entry */
-					/*
-					   if( j < n || atom_i->orig_id < atom_j->orig_id ) {//tryQEq||1
-					   H->entries[Htop].j = j;
-					   if( control->tabulate == 0 )
-					   H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
-					   else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
-					   ++Htop;
-					   } 
-					   else if( j < n || atom_i->orig_id > atom_j->orig_id ) {//tryQEq||1
-					   H->entries[Htop].j = j;
-					   if( control->tabulate == 0 )
-					   H->entries[Htop].val = Compute_H(r_ij,twbp->gamma,workspace.Tap);
-					   else H->entries[Htop].val = Compute_tabH(t_LR, r_ij, type_i, type_j,num_atom_types);
-					   ++Htop;
-					   } 
-					 */
-
-					//bool condition = !((i >= n) && (j >= n));
-					/* hydrogen bond lists */
-					if( control->hbond_cut > 0 && (ihb==1 || ihb==2) &&
-							nbr_pj->d <= control->hbond_cut // && i < j
-					  ) {
-						jhb = sbp_j->p_hbond;
-						if( ihb == 1 && jhb == 2 ) {
-							hbonds.select.hbond_list[ihb_top].nbr = j;
-							if (i < j) 
-								hbonds.select.hbond_list[ihb_top].scl = 1;
-							else
-								hbonds.select.hbond_list[ihb_top].scl = -1;
-							hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-							//CUDA SPECIFIC
-							hbonds.select.hbond_list[ihb_top].sym_index = -1;
-							rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
-
-
-							++ihb_top;
-						}
-						//else if( j < n && ihb == 2 && jhb == 1 ) 
-						else if( ihb == 2 && jhb == 1 && j < n) {
-							//jhb_top = End_Index( atom_j->Hindex, hbonds );
-							hbonds.select.hbond_list[ihb_top].nbr = j;
-							hbonds.select.hbond_list[ihb_top].scl = -1;
-							hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-							//CUDA SPECIFIC
-							hbonds.select.hbond_list[ihb_top].sym_index = -1;
-							rvec_MakeZero (hbonds.select.hbond_list[ihb_top].hb_f);
-
-							++ihb_top;
-
-							//Set_End_Index( atom_j->Hindex, jhb_top+1, hbonds );
-							//++num_hbonds;
-						}
-					}
-				}
-
-
-
-				/* uncorrected bond orders */
-				if( nbr_pj->d <= control->bond_cut 
-						&& Dev_BOp( bonds, control->bo_cut, 
-							i , btop_i, nbr_pj, sbp_i, sbp_j, twbp, 
-							workspace.dDeltap_self, workspace.total_bond_order) 
-				  ) {
-					//num_bonds += 2;
-					++btop_i;
-
-					/* Need to do later... since i and j are parallel
-					   if( workspace->bond_mark[j] > workspace->bond_mark[i] + 1 )
-					   workspace->bond_mark[j] = workspace->bond_mark[i] + 1;
-					   else if( workspace->bond_mark[i] > workspace->bond_mark[j] + 1 ) {
-					   workspace->bond_mark[i] = workspace->bond_mark[j] + 1;
-					   }
-					 */
-				}
-			}
-			}
-
-			Dev_Set_End_Index( i, btop_i, &bonds );
-			//    if( local ) {
-			H->end[i] = Htop;
-			//   }
-			//CHANGE ORIGINAL
-			if(( ihb == 1 || ihb == 2 ) && (ihb_top > 0) && (control->hbond_cut > 0))
-				Dev_Set_End_Index( atom_i->Hindex, ihb_top, &hbonds );
-			//} Commented for cuda kernel
-	}
-
-
-
-	CUDA_GLOBAL void ker_init_bond_mark (int offset, int n, int *bond_mark)
-	{
-		int i;
-
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= n) return;
-
-		bond_mark [offset + threadIdx.x] = 1000;
-	}
-
-	CUDA_GLOBAL void New_fix_sym_dbond_indices (reax_list pbonds, int N)
-	{
-		int i, nbr;
-		bond_data *ibond, *jbond;
-		int atom_j;
-
-		reax_list *bonds = &pbonds;
-
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		for (int j = Dev_Start_Index (i, bonds); j < Dev_End_Index (i, bonds); j++)
-		{
-			ibond = &( bonds->select.bond_list [j] );
-			nbr = ibond->nbr;
-
-			for (int k = Dev_Start_Index (nbr, bonds); k < Dev_End_Index (nbr, bonds); k ++)
-			{
-				jbond = &( bonds->select.bond_list[ k ] );
-				atom_j = jbond->nbr;
-
-				if ( (atom_j == i) )
-				{
-					if (i > nbr) {
-						ibond->dbond_index = j;
-						jbond->dbond_index = j;
-
-						ibond->sym_index = k;
-						jbond->sym_index = j;
-					}
-				}
-			}
-		}
-	}
-
-
-	CUDA_GLOBAL void New_fix_sym_hbond_indices (reax_atom *my_atoms, reax_list hbonds, int N )
-	{
-
-		hbond_data *ihbond, *jhbond;
-
-		int __THREADS_PER_ATOM__ = HB_KER_SYM_THREADS_PER_ATOM;
-		int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-		int warp_id = thread_id / __THREADS_PER_ATOM__;
-		int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
-		int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-		if (warp_id > N) return;
-
-		int i = warp_id;
-		int nbr;
-		int k;
-		int start = Dev_Start_Index (my_atoms[i].Hindex, &hbonds);
-		int end = Dev_End_Index (my_atoms[i].Hindex, &hbonds);
-		int j = start + lane_id;
-		while (j < end)
-		{
-			ihbond = &( hbonds.select.hbond_list [j] );
-			nbr = ihbond->nbr;
-
-			int nbrstart = Dev_Start_Index (my_atoms[nbr].Hindex, &hbonds);
-			int nbrend = Dev_End_Index (my_atoms[nbr].Hindex, &hbonds);
-
-			for (k = nbrstart; k < nbrend; k++)
-			{
-				jhbond = &( hbonds.select.hbond_list [k] );
-
-				if (jhbond->nbr == i){
-					ihbond->sym_index = k;
-					jhbond->sym_index = j;
-					break;
-				}
-			}
-
-			j += __THREADS_PER_ATOM__;
-		}
-	}
-
-	////////////////////////
-	// HBOND ISSUE
-	CUDA_GLOBAL void ker_update_bonds (reax_atom *my_atoms, 
-			reax_list bonds, 
-			int n)
-	{
-		int i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= n) return;
-
-		my_atoms [i].num_bonds = 
-			MAX(Dev_Num_Entries(i, &bonds) * 2, MIN_BONDS);
-	}
-
-	CUDA_GLOBAL void ker_update_hbonds (reax_atom *my_atoms, 
-			reax_list hbonds,
-			int n)
-	{
-		int Hindex;
-		int i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= n) return;
-
-		Hindex = my_atoms[i].Hindex;
-		my_atoms [i].num_hbonds = 
-			MAX(Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS);
-	}
-	////////////////////////
-	////////////////////////
-	////////////////////////
-
-	int Cuda_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control, 
-			int step, int n, int N, int numH )
-	{
-		int blocks;
-		int i, comp, Hindex;
-		int *index, *end_index;
-		reax_list *bonds, *hbonds;
-		reax_atom *my_atoms;
-		reallocate_data *realloc;
-		realloc = &( dev_workspace->realloc);
-
-		int max_sp_entries, num_hbonds, num_bonds;
-		int total_sp_entries;
-
-		blocks = system->n / DEF_BLOCK_SIZE + 
-			((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-		ker_update_bonds <<< blocks, DEF_BLOCK_SIZE >>>
-			(system->d_my_atoms, *(*lists + BONDS), 
-			 system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		////////////////////////
-		// HBOND ISSUE
-		//FIX - 4 - Added this check for hydrogen bond issue
-		if ((control->hbond_cut > 0) && (system->numH > 0)){
-			ker_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
-				(system->d_my_atoms, *(*lists + HBONDS), 
-				 system->n);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-		}
-
-		//validate sparse matrix entries.
-		memset (host_scratch, 0, 2 * system->N * sizeof (int));	
-		index = (int *) host_scratch;
-		end_index = index + system->N;
-		copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int), 
-				cudaMemcpyDeviceToHost, "sparse_matrix:start" );
-		copy_host_device (end_index, dev_workspace->H.end, system->N * sizeof (int), 
-				cudaMemcpyDeviceToHost, "sparse_matrix:end" );
-		max_sp_entries = total_sp_entries = 0;
-		for (i = 0; i < N; i++ ){
-			//if (i < N-1)
-			//	comp = index [i+1];
-			//else
-			//	comp = dev_workspace->H.m;
-
-			total_sp_entries += end_index [i] - index[i];
-			if (end_index [i] - index[i] > system->max_sparse_entries) {
-				fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
-						step, i, index[i], end_index[i] );
-				return FAILURE;
-			} else if (end_index[i] >= dev_workspace->H.m) {
-				//SUDHIR_FIX_SPARSE_MATRIX
-				//TODO move this carver
-				//TODO move this carver
-				//TODO move this carver
-				fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", 
-						system->my_rank, step, i, index[i], end_index[i]);	
-				//TODO move this carver
-				//TODO move this carver
-				//TODO move this carver
-				return FAILURE;
-			} else {
-				if (max_sp_entries <= end_index[i] - index [i])
-					max_sp_entries = end_index[i] - index [i];
-			}
-		}
-		//if (max_sp_entries <= end_index[i] - index [i])
-		//	max_sp_entries = end_index[i] - index [i];
-
-		//update the current step max_sp_entries;
-		realloc->Htop = max_sp_entries;
-		fprintf (stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", 
-				system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries);
-
-		if (total_sp_entries >= dev_workspace->H.m) {
-			fprintf (stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", 
-					system->my_rank, step, dev_workspace->H.m, total_sp_entries);
-
-			return FAILURE;
-		}
-
-
-		//validate Bond list
-		if (N > 0) {
-			num_bonds = 0;
-
-			bonds = *lists + BONDS;
-			memset (host_scratch, 0, 2 * bonds->n * sizeof (int));	
-			index = (int *) host_scratch;
-			end_index = index + bonds->n;
-
-			copy_host_device (index, bonds->index, bonds->n * sizeof (int), 
-					cudaMemcpyDeviceToHost, "bonds:index");
-			copy_host_device (end_index, bonds->end_index, bonds->n * sizeof (int), 
-					cudaMemcpyDeviceToHost, "bonds:end_index");
-
-			/*
-			   for (i = 0; i < N; i++) {
-			   if (i < N-1)
-			   comp = index [i+1];
-			   else
-			   comp = bonds->num_intrs;
-
-			   if (end_index [i] > comp) {
-			   fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d str(i+1)=%d\n",
-			   step, i, index[i], end_index[i], comp );
-			   return FAILURE;
-			   }
-
-			   num_bonds += MAX( (end_index[i] - index[i]) * 4, MIN_BONDS);
-			   }
-
-			   if (end_index[N-1] >= bonds->num_intrs) {
-			   fprintf( stderr, "step%d-bondchk failed(end): i=N-1 start(i)=%d end(i)=%d num_intrs=%d\n",
-			   step, index[N-1], end_index[N-1], bonds->num_intrs);
-			   return FAILURE;
-			   }
-			   num_bonds = MAX( num_bonds, MIN_CAP*MIN_BONDS );
-			//check the condition for reallocation
-			realloc->num_bonds = num_bonds;
-			 */
-
-			int max_bonds = 0;
-			for (i = 0; i < N; i++) {
-				if (end_index[i] - index[i] >= system->max_bonds) {
-					fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n",
-							step, i, index[i], end_index[i], system->max_bonds);
-					return FAILURE;
-				}
-				if (end_index[i] - index[i] >= max_bonds)
-					max_bonds = index[i] - index[i];
-			}
-			realloc->num_bonds = max_bonds;
-
-		}
-
-		//validate Hbonds list
-		num_hbonds = 0;
-		// FIX - 4 - added additional check here
-		if ((numH > 0) && (control->hbond_cut > 0)) {
-			hbonds = *lists + HBONDS;
-			memset (host_scratch, 0, 2 * hbonds->n * sizeof (int) + sizeof (reax_atom) * system->N);	
-			index = (int *) host_scratch;
-			end_index = index + hbonds->n;
-			my_atoms = (reax_atom *)(end_index + hbonds->n);
-
-			copy_host_device (index, hbonds->index, hbonds->n * sizeof (int), 
-					cudaMemcpyDeviceToHost, "hbonds:index");
-			copy_host_device (end_index, hbonds->end_index, hbonds->n * sizeof (int), 
-					cudaMemcpyDeviceToHost, "hbonds:end_index");
-			copy_host_device (my_atoms, system->d_my_atoms, system->N * sizeof (reax_atom), 
-					cudaMemcpyDeviceToHost, "system:d_my_atoms");
-
-			//fprintf (stderr, " Total local atoms: %d \n", n);
-
-			/*
-			   for (i = 0; i < N-1; i++) {
-			   Hindex = my_atoms [i].Hindex;
-			   if (Hindex > -1) 
-			   comp = index [Hindex + 1];
-			   else
-			   comp = hbonds->num_intrs;
-
-			   if (end_index [Hindex] > comp) {
-			   fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d str(H+1)=%d\n",
-			   step, i, Hindex, index[Hindex], end_index[Hindex], comp );
-			   return FAILURE;
-			   }
-
-			   num_hbonds += MAX( (end_index [Hindex] - index [Hindex]) * 2, MIN_HBONDS * 2);
-			   }
-			   if (end_index [my_atoms[i].Hindex] > hbonds->num_intrs) {
-			   fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d num_intrs=%d\n",
-			   step, i, Hindex, index[Hindex], end_index[Hindex], hbonds->num_intrs);
-			   return FAILURE;
-			   }
-
-			   num_hbonds += MIN( (end_index [my_atoms[i].Hindex] - index [my_atoms[i].Hindex]) * 2, 
-			   2 * MIN_HBONDS);
-			   num_hbonds = MAX( num_hbonds, MIN_CAP*MIN_HBONDS );
-			   realloc->num_hbonds = num_hbonds;
-			 */
-
-			int max_hbonds = 0;
-			for (i = 0; i < N; i++) {
-				if (end_index[i] - index[i] >= system->max_hbonds) {
-					fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d end(i)=%d max_hbonds=%d\n",
-							step, i, index[i], end_index[i], system->max_hbonds);
-					return FAILURE;
-				}
-				if (end_index[i] - index[i] >= max_hbonds)
-					max_hbonds = end_index[i] - index[i];
-			}
-			realloc->num_hbonds = max_hbonds;
-		}
-
-		return SUCCESS;
-	}
-
-	CUDA_GLOBAL void ker_init_bond_orders (reax_atom *my_atoms, 
-			reax_list far_nbrs, 
-			reax_list bonds, 
-			real *total_bond_order, 
-			int N)
-	{
-		int i, j, pj; 
-		int start_i, end_i;
-		int type_i, type_j;
-		far_neighbor_data *nbr_pj;
-		reax_atom *atom_i, *atom_j;
-
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		atom_i = &(my_atoms[i]);
-		start_i = Dev_Start_Index(i, &far_nbrs);
-		end_i   = Dev_End_Index(i, &far_nbrs);
-
-		for( pj = start_i; pj < end_i; ++pj ) { 
-			// nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-			// j = nbr_pj->nbr;
-			// atom_j = &(my_atoms[j]);
-
-			//total_bond_order [i] ++;
-			//atom_i->Hindex ++;
-		}
-	}
-
-	CUDA_GLOBAL void ker_bond_mark (reax_list p_bonds, storage p_workspace, int N)
-	{
-		reax_list *bonds = &( p_bonds );
-		storage *workspace = &( p_workspace );
-		int j;
-
-		//int i = blockIdx.x * blockDim.x + threadIdx.x;
-		//if (i >= N) return;
-
-		for (int i = 0; i < N; i++) 
-			for (int k = Dev_Start_Index (i, bonds); k < Dev_End_Index (i, bonds); k++)
-			{
-				bond_data *bdata = &( bonds->select.bond_list [k] );
-				j = bdata->nbr;
-
-				if (i < j ) {
-					if ( workspace->bond_mark [j] > (workspace->bond_mark [i] + 1) )
-						workspace->bond_mark [j] = workspace->bond_mark [i] + 1;	
-					else if ( workspace->bond_mark [i] > (workspace->bond_mark [j] + 1) )
-						workspace->bond_mark [i] = workspace->bond_mark [j] + 1;
-				}
-			}
-	}
-
-
-	int Cuda_Init_Forces( reax_system *system, control_params *control,
-			simulation_data *data, storage *workspace,
-			reax_list **lists, output_controls *out_control ) 
-	{
-		int init_blocks;
-		int hblocks;
-
-		//init the workspace (bond_mark)
-		/*
-		   int blocks;
-		   cuda_memset (dev_workspace->bond_mark, 0, sizeof (int) * system->n, "bond_mark");
-
-		   blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
-		   (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-		   ker_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
-		   (system->n, (system->N - system->n), dev_workspace->bond_mark);
-		   cudaThreadSynchronize ();
-		   cudaCheckError ();
-		 */
-		//validate total_bond_orders
-
-		//main kernel
-		init_blocks = (system->N) / DEF_BLOCK_SIZE + 
-			(((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
-		//fprintf (stderr, " Total atoms: %d, blocks: %d \n", system->N, init_blocks );
-
-		//	ker_init_bond_orders <<<init_blocks, DEF_BLOCK_SIZE >>>
-		//			( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), 
-		//				dev_workspace->total_bond_order, system->N);
-		//	cudaThreadSynchronize ();
-		//	cudaCheckError ();
-		//	fprintf (stderr, " DONE WITH VALIDATION \n");
-
-		ker_init_forces <<<init_blocks, DEF_BLOCK_SIZE >>>
-			(system->d_my_atoms, system->reax_param.d_sbp, 
-			 system->reax_param.d_tbp, *dev_workspace, 
-			 (control_params *)control->d_control_params, 
-			 *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), *(*dev_lists + HBONDS), 
-			 d_LR, system->n, system->N, system->reax_param.num_atom_types, 
-			 //system->max_sparse_entries, ((data->step-data->prev_steps) % control->reneighbor));
-			system->max_sparse_entries, (((data->step-data->prev_steps) % control->reneighbor) == 0), 
-			system->max_bonds, system->max_hbonds);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-
-		//fix - sym_index and dbond_index
-		New_fix_sym_dbond_indices <<<init_blocks, BLOCK_SIZE>>> 
-			(*(*dev_lists + BONDS), system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		///////////////////////
-		///////////////////////
-		// FIX - 4 - HBOND ISSUE
-		if ((control->hbond_cut > 0 ) && (system->numH > 0))
-		{
-			//make hbond_list symmetric
-			hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM) / HB_SYM_BLOCK_SIZE + 
-				((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1);
-			//New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> 
-			New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> 
-				(system->d_my_atoms, *(*dev_lists + HBONDS), system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-		}
-
-		//update bond_mark
-		//ker_bond_mark <<< init_blocks, DEF_BLOCK_SIZE>>>
-		/*
-		   ker_bond_mark <<< 1, 1>>>
-		   ( *(*dev_lists + BONDS), *dev_workspace, system->N);
-		   cudaThreadSynchronize ();
-		   cudaCheckError ();
-		 */
-
-		//TODO
-		//1. update the sparse matrix count for reallocation
-		//2. update the bonds count for reallocation
-		//3. update the hydrogen bonds count for reallocation
-
-		//Validate lists here.
-		return Cuda_Validate_Lists (system, workspace, dev_lists, control, 
-				data->step, system->n, system->N, system->numH );
-	}
-
-	int Cuda_Init_Forces_noQEq( reax_system *system, control_params *control,
-			simulation_data *data, storage *workspace,
-			reax_list **lists, output_controls *out_control ) 
-	{
-		//TODO Implement later
-		// when you figure out the bond_mark usage.
-
-		return FAILURE;
-	}
-
-	int Cuda_Compute_Bonded_Forces (reax_system *system, control_params *control, 
-			simulation_data *data, storage *workspace, 
-			reax_list **lists, output_controls *out_control )
-	{
-		real t_start, t_elapsed;
-		real *spad = (real *) scratch;
-		rvec *rvec_spad;
-
-		//1. Bond Order Interactions. - bond_orders.c
-		t_start = Get_Time( );
-		//fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS_N, BLOCK_SIZE);
-		Cuda_Calculate_BO_init  <<< BLOCKS_N, BLOCK_SIZE >>>
-			( system->d_my_atoms, system->reax_param.d_sbp, 
-			  *dev_workspace, 
-			  system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>>
-			( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
-			  system->reax_param.d_tbp, *dev_workspace, 
-			  *(*dev_lists + BONDS),
-			  system->reax_param.num_atom_types, system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-
-		Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>>
-			(*dev_workspace, *(*dev_lists + BONDS), system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>>
-			(system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
-			 *dev_workspace, system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		t_elapsed = Get_Timing_Info( t_start );
-		//fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-		//fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
-
-		//2. Bond Energy Interactions. - bonds.c
-		t_start = Get_Time( );
-		cuda_memset (spad, 0, system->N * ( 2 * sizeof (real)) , "scratch");
-
-		Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof (real)* BLOCK_SIZE >>>
-			( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
-			  *dev_workspace, *(*dev_lists + BONDS), 
-			  system->n, system->reax_param.num_atom_types, spad );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_BE
-		k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-			(spad, spad + system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> 
-			(spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		t_elapsed = Get_Timing_Info( t_start );
-		//fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-		//fprintf (stderr, "Cuda_Bond_Energy Done... \n");
-
-
-		//3. Atom Energy Interactions. 
-		t_start = Get_Time( );
-		cuda_memset (spad, 0, ( 6 * sizeof (real) * system->n ), "scratch");
-
-		Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, 
-				system->reax_param.d_sbp, system->reax_param.d_tbp, 
-				*dev_workspace, 
-				*(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, 
-				spad, spad + 2 * system->n, spad + 4*system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//CHANGE ORIGINAL
-		//Cuda_Atom_Energy_PostProcess     <<<BLOCKS, BLOCK_SIZE >>>
-		//					( *(*dev_lists + BONDS), *dev_workspace, system->n );
-		Cuda_Atom_Energy_PostProcess     <<<BLOCKS_N, BLOCK_SIZE >>>
-			( *(*dev_lists + BONDS), *dev_workspace, system->N );
-		//CHANGE ORIGINAL
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Lp
-		k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-			(spad, spad + system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
-			(spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, BLOCKS);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Ov
-		k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-			(spad + 2*system->n, spad + 3*system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
-			(spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_ov, BLOCKS);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Un
-		k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-			(spad + 4*system->n, spad + 5*system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
-			(spad + 5*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_un, BLOCKS);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		t_elapsed = Get_Timing_Info( t_start );
-		//fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-		//fprintf (stderr, "test_LonePair_postprocess Done... \n");
-
-
-		//4. Valence Angles Interactions. 
-		t_start = Get_Time( );
-
-		//THREE BODY CHANGES HERE
-		cuda_memset(spad, 0, (*dev_lists + BONDS)->num_intrs * sizeof (int), "scratch");
-		Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>>
-			(system->d_my_atoms, 
-			 (control_params *)control->d_control_params, 
-			 *(*dev_lists + BONDS),
-			 system->n, system->N, (int *)spad);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-
-		int *thbody = (int *) host_scratch;
-		memset (thbody, 0, sizeof (int) * (*dev_lists + BONDS)->num_intrs);
-		copy_host_device (thbody, spad, (*dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, "thb:offsets");
-
-		int total_3body = thbody [0] * SAFE_ZONE;
-		for (int x = 1; x < (*dev_lists + BONDS)->num_intrs; x++) {
-			total_3body += thbody [x]*SAFE_ZONE;
-			thbody [x] += thbody [x-1];
-		}
-
-		system->num_thbodies = thbody [(*dev_lists+BONDS)->num_intrs-1];
-		if (!system->init_thblist) 
-		{
-			system->init_thblist = true;
-			if(!Dev_Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*dev_lists + THREE_BODIES))) {
-				fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-				MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-			}    
-			if(!Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*lists + THREE_BODIES))) {
-				fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" );
-				MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-			}    
+    int Hindex;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    Hindex = my_atoms[i].Hindex;
+    my_atoms [i].num_hbonds = 
+        MAX(Dev_Num_Entries(Hindex, &hbonds) * SAFER_ZONE, MIN_HBONDS);
+}
+// Runs ker_update_bonds / ker_update_hbonds, then checks on the host that the
+// sparse matrix H, the bond list and the hbond list fit their capacities.
+// Stores the largest per-row/per-atom counts in dev_workspace->realloc.
+// `lists` must hold device lists (the caller passes dev_lists); `workspace`
+// and `n` are not read. Returns SUCCESS, or FAILURE on the first violation.
+int Cuda_Validate_Lists (reax_system *system, storage *workspace, reax_list **lists, control_params *control, 
+        int step, int n, int N, int numH )
+{
+    int blocks;
+    int i, comp, Hindex;
+    int *index, *end_index;
+    reax_list *bonds, *hbonds;
+    reax_atom *my_atoms;
+    reallocate_data *realloc;
+    realloc = &( dev_workspace->realloc);
+
+    int max_sp_entries, num_hbonds, num_bonds;
+    int total_sp_entries;
+    // enough DEF_BLOCK_SIZE-thread blocks to give at least system->n threads
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    ker_update_bonds <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, *(*lists + BONDS), 
+         system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    ////////////////////////
+    // HBOND ISSUE
+    //FIX - 4 - Added this check for hydrogen bond issue
+    if ((control->hbond_cut > 0) && (system->numH > 0)){
+        ker_update_hbonds <<< blocks, DEF_BLOCK_SIZE >>>
+            (system->d_my_atoms, *(*lists + HBONDS), 
+             system->n);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+
+    //validate sparse matrix entries.
+    memset (host_scratch, 0, 2 * system->N * sizeof (int));    
+    index = (int *) host_scratch;
+    end_index = index + system->N;
+    copy_host_device (index, dev_workspace->H.start, system->N * sizeof (int), 
+            cudaMemcpyDeviceToHost, "sparse_matrix:start" );
+    copy_host_device (end_index, dev_workspace->H.end, system->N * sizeof (int), 
+            cudaMemcpyDeviceToHost, "sparse_matrix:end" );
+    max_sp_entries = total_sp_entries = 0;
+    for (i = 0; i < N; i++ ){
+        //if (i < N-1)
+        //    comp = index [i+1];
+        //else
+        //    comp = dev_workspace->H.m;
+
+        total_sp_entries += end_index [i] - index[i];
+        if (end_index [i] - index[i] > system->max_sparse_entries) {
+            fprintf( stderr, "step%d-sparsemat-chk failed: i=%d start(i)=%d end(i)=%d \n",
+                    step, i, index[i], end_index[i] );
+            return FAILURE;
+        } else if (end_index[i] >= dev_workspace->H.m) {
+            //SUDHIR_FIX_SPARSE_MATRIX
+            //TODO move this carver
+            //TODO move this carver
+            //TODO move this carver
+            fprintf (stderr, "p:%d - step%d-sparsemat-chk failed (exceed limits): i=%d start(i)=%d end(i)=%d \n", 
+                    system->my_rank, step, i, index[i], end_index[i]);    
+            //TODO move this carver
+            //TODO move this carver
+            //TODO move this carver
+            return FAILURE;
+        } else {
+            if (max_sp_entries <= end_index[i] - index [i])
+                max_sp_entries = end_index[i] - index [i];
+        }
+    }
+    //if (max_sp_entries <= end_index[i] - index [i])
+    //    max_sp_entries = end_index[i] - index [i];
+
+    //update the current step max_sp_entries;
+    realloc->Htop = max_sp_entries;
+    fprintf (stderr, "p:%d - Cuda_Reallocate: Total H matrix entries: %d, cap: %d, used: %d \n", 
+            system->my_rank, dev_workspace->H.n, dev_workspace->H.m, total_sp_entries);
+
+    if (total_sp_entries >= dev_workspace->H.m) {
+        fprintf (stderr, "p:%d - **ran out of space for sparse matrix: step: %d, allocated: %d, used: %d \n", 
+                system->my_rank, step, dev_workspace->H.m, total_sp_entries);
+
+        return FAILURE;
+    }
+
+
+    //validate Bond list
+    if (N > 0) {
+        num_bonds = 0;
+
+        bonds = *lists + BONDS;
+        memset (host_scratch, 0, 2 * bonds->n * sizeof (int));    
+        index = (int *) host_scratch;
+        end_index = index + bonds->n;
+
+        copy_host_device (index, bonds->index, bonds->n * sizeof (int), 
+                cudaMemcpyDeviceToHost, "bonds:index");
+        copy_host_device (end_index, bonds->end_index, bonds->n * sizeof (int), 
+                cudaMemcpyDeviceToHost, "bonds:end_index");
+        // NOTE(review): the active loop below reads N entries; buffers hold bonds->n -- confirm N <= bonds->n.
+        /*
+           for (i = 0; i < N; i++) {
+           if (i < N-1)
+           comp = index [i+1];
+           else
+           comp = bonds->num_intrs;
+
+           if (end_index [i] > comp) {
+           fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d str(i+1)=%d\n",
+           step, i, index[i], end_index[i], comp );
+           return FAILURE;
+           }
+
+           num_bonds += MAX( (end_index[i] - index[i]) * 4, MIN_BONDS);
+           }
+
+           if (end_index[N-1] >= bonds->num_intrs) {
+           fprintf( stderr, "step%d-bondchk failed(end): i=N-1 start(i)=%d end(i)=%d num_intrs=%d\n",
+           step, index[N-1], end_index[N-1], bonds->num_intrs);
+           return FAILURE;
+           }
+           num_bonds = MAX( num_bonds, MIN_CAP*MIN_BONDS );
+        //check the condition for reallocation
+        realloc->num_bonds = num_bonds;
+         */
+        // fail if any atom reaches system->max_bonds; otherwise keep the largest (end - start) count
+        int max_bonds = 0;
+        for (i = 0; i < N; i++) {
+            if (end_index[i] - index[i] >= system->max_bonds) {
+                fprintf( stderr, "step%d-bondchk failed: i=%d start(i)=%d end(i)=%d max_bonds=%d\n",
+                        step, i, index[i], end_index[i], system->max_bonds);
+                return FAILURE;
+            }
+            if (end_index[i] - index[i] >= max_bonds)
+                max_bonds = end_index[i] - index[i];
+        }
+        realloc->num_bonds = max_bonds;
+
+    }
+
+    //validate Hbonds list
+    num_hbonds = 0;
+    // FIX - 4 - added additional check here
+    if ((numH > 0) && (control->hbond_cut > 0)) {
+        hbonds = *lists + HBONDS;
+        memset (host_scratch, 0, 2 * hbonds->n * sizeof (int) + sizeof (reax_atom) * system->N);    
+        index = (int *) host_scratch;
+        end_index = index + hbonds->n;
+        my_atoms = (reax_atom *)(end_index + hbonds->n);
+
+        copy_host_device (index, hbonds->index, hbonds->n * sizeof (int), 
+                cudaMemcpyDeviceToHost, "hbonds:index");
+        copy_host_device (end_index, hbonds->end_index, hbonds->n * sizeof (int), 
+                cudaMemcpyDeviceToHost, "hbonds:end_index");
+        copy_host_device (my_atoms, system->d_my_atoms, system->N * sizeof (reax_atom), 
+                cudaMemcpyDeviceToHost, "system:d_my_atoms");
+        // NOTE(review): my_atoms is only read by the disabled block below; the copy above may be removable.
+        //fprintf (stderr, " Total local atoms: %d \n", n);
+
+        /*
+           for (i = 0; i < N-1; i++) {
+           Hindex = my_atoms [i].Hindex;
+           if (Hindex > -1) 
+           comp = index [Hindex + 1];
+           else
+           comp = hbonds->num_intrs;
+
+           if (end_index [Hindex] > comp) {
+           fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d str(H+1)=%d\n",
+           step, i, Hindex, index[Hindex], end_index[Hindex], comp );
+           return FAILURE;
+           }
+
+           num_hbonds += MAX( (end_index [Hindex] - index [Hindex]) * 2, MIN_HBONDS * 2);
+           }
+           if (end_index [my_atoms[i].Hindex] > hbonds->num_intrs) {
+           fprintf(stderr,"step%d-atom:%d hbondchk failed: H=%d start(H)=%d end(H)=%d num_intrs=%d\n",
+           step, i, Hindex, index[Hindex], end_index[Hindex], hbonds->num_intrs);
+           return FAILURE;
+           }
+
+           num_hbonds += MIN( (end_index [my_atoms[i].Hindex] - index [my_atoms[i].Hindex]) * 2, 
+           2 * MIN_HBONDS);
+           num_hbonds = MAX( num_hbonds, MIN_CAP*MIN_HBONDS );
+           realloc->num_hbonds = num_hbonds;
+         */
+        // NOTE(review): loop bound is N while the buffers hold hbonds->n entries -- confirm N <= hbonds->n.
+        int max_hbonds = 0;
+        for (i = 0; i < N; i++) {
+            if (end_index[i] - index[i] >= system->max_hbonds) {
+                fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d end(i)=%d max_hbonds=%d\n",
+                        step, i, index[i], end_index[i], system->max_hbonds);
+                return FAILURE;
+            }
+            if (end_index[i] - index[i] >= max_hbonds)
+                max_hbonds = end_index[i] - index[i];
+        }
+        realloc->num_hbonds = max_hbonds;
+    }
+
+    return SUCCESS;
+}
+// Kernel skeleton: each thread looks up atom i's far-neighbor index range, but
+// every statement in the loop body is commented out, so nothing is written.
+CUDA_GLOBAL void ker_init_bond_orders (reax_atom *my_atoms, 
+        reax_list far_nbrs, 
+        reax_list bonds, 
+        real *total_bond_order, 
+        int N)
+{
+    int i, j, pj; 
+    int start_i, end_i;
+    int type_i, type_j;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    // i is the flat global thread index; threads with i >= N exit immediately
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    atom_i = &(my_atoms[i]);
+    start_i = Dev_Start_Index(i, &far_nbrs);
+    end_i   = Dev_End_Index(i, &far_nbrs);
+    // empty traversal of [start_i, end_i); the disabled statements are kept for reference
+    for( pj = start_i; pj < end_i; ++pj ) { 
+        // nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        // j = nbr_pj->nbr;
+        // atom_j = &(my_atoms[j]);
+
+        //total_bond_order [i] ++;
+        //atom_i->Hindex ++;
+    }
+}
+// Relaxes workspace->bond_mark over every bond (i, j) with i < j: if one
+// endpoint's mark exceeds the other's by more than 1 it is lowered to other + 1.
+CUDA_GLOBAL void ker_bond_mark (reax_list p_bonds, storage p_workspace, int N)
+{
+    reax_list *bonds = &( p_bonds );
+    storage *workspace = &( p_workspace );
+    int j;
+    // Per-thread indexing (next two lines) is disabled:
+    //int i = blockIdx.x * blockDim.x + threadIdx.x;
+    //if (i >= N) return;
+    // so each launched thread performs the full serial sweep over all N atoms.
+    for (int i = 0; i < N; i++) 
+        for (int k = Dev_Start_Index (i, bonds); k < Dev_End_Index (i, bonds); k++)
+        {
+            bond_data *bdata = &( bonds->select.bond_list [k] );
+            j = bdata->nbr;
+            // act only when the neighbor index j is larger than i
+            if (i < j ) {
+                if ( workspace->bond_mark [j] > (workspace->bond_mark [i] + 1) )
+                    workspace->bond_mark [j] = workspace->bond_mark [i] + 1;    
+                else if ( workspace->bond_mark [i] > (workspace->bond_mark [j] + 1) )
+                    workspace->bond_mark [i] = workspace->bond_mark [j] + 1;
+            }
+        }
+}
+// Launches ker_init_forces and the bond / hbond symmetry fix-up kernels on the
+// global dev_lists, then returns the SUCCESS/FAILURE result of Cuda_Validate_Lists.
+int Cuda_Init_Forces( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control ) 
+{
+    int init_blocks;
+    int hblocks;
+    // `lists` and `out_control` are not referenced below; device lists come from dev_lists.
+    //init the workspace (bond_mark)
+    /*
+       int blocks;
+       cuda_memset (dev_workspace->bond_mark, 0, sizeof (int) * system->n, "bond_mark");
+
+       blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
+       (((system->N - system->n) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+       ker_init_bond_mark <<< blocks, DEF_BLOCK_SIZE >>>
+       (system->n, (system->N - system->n), dev_workspace->bond_mark);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
+     */
+    //validate total_bond_orders
+
+    //main kernel
+    init_blocks = (system->N) / DEF_BLOCK_SIZE + 
+        (((system->N % DEF_BLOCK_SIZE) == 0) ? 0 : 1);
+    //fprintf (stderr, " Total atoms: %d, blocks: %d \n", system->N, init_blocks );
+
+    //    ker_init_bond_orders <<<init_blocks, DEF_BLOCK_SIZE >>>
+    //            ( system->d_my_atoms, *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), 
+    //                dev_workspace->total_bond_order, system->N);
+    //    cudaThreadSynchronize ();
+    //    cudaCheckError ();
+    //    fprintf (stderr, " DONE WITH VALIDATION \n");
+    // the flag after max_sparse_entries is 1 only when (step - prev_steps) is a multiple of reneighbor
+    ker_init_forces <<<init_blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, 
+         system->reax_param.d_tbp, *dev_workspace, 
+         (control_params *)control->d_control_params, 
+         *(*dev_lists + FAR_NBRS), *(*dev_lists + BONDS), *(*dev_lists + HBONDS), 
+         d_LR, system->n, system->N, system->reax_param.num_atom_types, 
+         //system->max_sparse_entries, ((data->step-data->prev_steps) % control->reneighbor));
+        system->max_sparse_entries, (((data->step-data->prev_steps) % control->reneighbor) == 0), 
+        system->max_bonds, system->max_hbonds);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    // NOTE(review): init_blocks was sized with DEF_BLOCK_SIZE but the launch below uses
+    // BLOCK_SIZE threads per block -- confirm the two agree (or that BLOCK_SIZE is not smaller).
+    //fix - sym_index and dbond_index
+    New_fix_sym_dbond_indices <<<init_blocks, BLOCK_SIZE>>> 
+        (*(*dev_lists + BONDS), system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    // NOTE(review): hblocks is computed with HB_SYM_BLOCK_SIZE, yet the launch below
+    // uses HB_BLOCK_SIZE threads per block -- confirm this pairing is intended.
+    // FIX - 4 - HBOND ISSUE
+    if ((control->hbond_cut > 0 ) && (system->numH > 0))
+    {
+        //make hbond_list symmetric
+        hblocks = (system->N * HB_KER_SYM_THREADS_PER_ATOM) / HB_SYM_BLOCK_SIZE + 
+            ((((system->N * HB_KER_SYM_THREADS_PER_ATOM) % HB_SYM_BLOCK_SIZE) == 0) ? 0 : 1);
+        //New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> 
+        New_fix_sym_hbond_indices <<<hblocks, HB_BLOCK_SIZE >>> 
+            (system->d_my_atoms, *(*dev_lists + HBONDS), system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+
+    //update bond_mark
+    //ker_bond_mark <<< init_blocks, DEF_BLOCK_SIZE>>>
+    /*
+       ker_bond_mark <<< 1, 1>>>
+       ( *(*dev_lists + BONDS), *dev_workspace, system->N);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
+     */
+
+    //TODO
+    //1. update the sparse matrix count for reallocation
+    //2. update the bonds count for reallocation
+    //3. update the hydrogen bonds count for reallocation
+
+    //Validate lists here.
+    return Cuda_Validate_Lists (system, workspace, dev_lists, control, 
+            data->step, system->n, system->N, system->numH );
+}
+// Unimplemented placeholder: reads none of its arguments and
+// unconditionally returns FAILURE.
+int Cuda_Init_Forces_noQEq( reax_system *system, control_params *control,
+        simulation_data *data, storage *workspace,
+        reax_list **lists, output_controls *out_control ) 
+{
+    //TODO Implement later,
+    // once the bond_mark usage is understood.
+
+    return FAILURE;
+}
+
+
+int Cuda_Compute_Bonded_Forces (reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, 
+        reax_list **lists, output_controls *out_control )
+{
+    real t_start, t_elapsed;
+    real *spad = (real *) scratch;
+    rvec *rvec_spad;
+
+    //1. Bond Order Interactions. - bond_orders.c
+    t_start = Get_Time( );
+    //fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS_N, BLOCK_SIZE);
+    Cuda_Calculate_BO_init  <<< BLOCKS_N, BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, 
+          *dev_workspace, 
+          system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    Cuda_Calculate_BO <<< BLOCKS_N, BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
+          system->reax_param.d_tbp, *dev_workspace, 
+          *(*dev_lists + BONDS),
+          system->reax_param.num_atom_types, system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+
+    Cuda_Update_Uncorrected_BO <<<BLOCKS_N, BLOCK_SIZE>>>
+        (*dev_workspace, *(*dev_lists + BONDS), system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    Cuda_Update_Workspace_After_BO <<<BLOCKS_N, BLOCK_SIZE>>>
+        (system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, 
+         *dev_workspace, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    t_elapsed = Get_Timing_Info( t_start );
+    //fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    //fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
+
+    //2. Bond Energy Interactions. - bonds.c
+    t_start = Get_Time( );
+    cuda_memset (spad, 0, system->N * ( 2 * sizeof (real)) , "scratch");
+
+    Cuda_Bonds <<< BLOCKS, BLOCK_SIZE, sizeof (real)* BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_gp, system->reax_param.d_sbp, system->reax_param.d_tbp,
+          *dev_workspace, *(*dev_lists + BONDS), 
+          system->n, system->reax_param.num_atom_types, spad );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_BE
+    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
+        (spad, spad + system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>> 
+        (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_bond, BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    t_elapsed = Get_Timing_Info( t_start );
+    //fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    //fprintf (stderr, "Cuda_Bond_Energy Done... \n");
+
+
+    //3. Atom Energy Interactions. 
+    t_start = Get_Time( );
+    cuda_memset (spad, 0, ( 6 * sizeof (real) * system->n ), "scratch");
+
+    Cuda_Atom_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_my_atoms, system->reax_param.d_gp, 
+            system->reax_param.d_sbp, system->reax_param.d_tbp, 
+            *dev_workspace, 
+            *(*dev_lists + BONDS), system->n, system->reax_param.num_atom_types, 
+            spad, spad + 2 * system->n, spad + 4*system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //CHANGE ORIGINAL
+    //Cuda_Atom_Energy_PostProcess     <<<BLOCKS, BLOCK_SIZE >>>
+    //                    ( *(*dev_lists + BONDS), *dev_workspace, system->n );
+    Cuda_Atom_Energy_PostProcess     <<<BLOCKS_N, BLOCK_SIZE >>>
+        ( *(*dev_lists + BONDS), *dev_workspace, system->N );
+    //CHANGE ORIGINAL
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Lp
+    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
+        (spad, spad + system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
+        (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_lp, BLOCKS);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Ov
+    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
+        (spad + 2*system->n, spad + 3*system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
+        (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_ov, BLOCKS);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Un
+    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
+        (spad + 4*system->n, spad + 5*system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>  
+        (spad + 5*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_un, BLOCKS);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    t_elapsed = Get_Timing_Info( t_start );
+    //fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+    //fprintf (stderr, "test_LonePair_postprocess Done... \n");
+
+
+    //4. Valence Angles Interactions. 
+    t_start = Get_Time( );
+
+    //THREE BODY CHANGES HERE
+    cuda_memset(spad, 0, (*dev_lists + BONDS)->num_intrs * sizeof (int), "scratch");
+    Estimate_Cuda_Valence_Angles <<<BLOCKS_N, BLOCK_SIZE>>>
+        (system->d_my_atoms, 
+         (control_params *)control->d_control_params, 
+         *(*dev_lists + BONDS),
+         system->n, system->N, (int *)spad);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+
+    int *thbody = (int *) host_scratch;
+    memset (thbody, 0, sizeof (int) * (*dev_lists + BONDS)->num_intrs);
+    copy_host_device (thbody, spad, (*dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, "thb:offsets");
+
+    int total_3body = thbody [0] * SAFE_ZONE;
+    for (int x = 1; x < (*dev_lists + BONDS)->num_intrs; x++) {
+        total_3body += thbody [x]*SAFE_ZONE;
+        thbody [x] += thbody [x-1];
+    }
+
+    system->num_thbodies = thbody [(*dev_lists+BONDS)->num_intrs-1];
+    if (!system->init_thblist) 
+    {
+        system->init_thblist = true;
+        if(!Dev_Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*dev_lists + THREE_BODIES))) {
+            fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
+            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+        }    
+        if(!Make_List((*dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, (*lists + THREE_BODIES))) {
+            fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" );
+            MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+        }    
 #ifdef __CUDA_MEM__
-			fprintf (stderr, "Device memory allocated: three body list = %d MB\n", 
-					sizeof (three_body_interaction_data) * total_3body / (1024*1024));
+        fprintf (stderr, "Device memory allocated: three body list = %d MB\n", 
+                sizeof (three_body_interaction_data) * total_3body / (1024*1024));
 #endif
-		} else {
-			//if (((dev_workspace->realloc.num_bonds * DANGER_ZONE) >= (*dev_lists+BONDS)->num_intrs) || 
-			//		(system->num_thbodies > (*dev_lists+THREE_BODIES)->num_intrs )) { 
-			//int size = dev_workspace->realloc.num_bonds;
-			if ((system->num_thbodies >= (*dev_lists+THREE_BODIES)->num_intrs ) || 
-					((*dev_lists+THREE_BODIES)->n < (*dev_lists+BONDS)->num_intrs) ) {
-
-				int size = (*dev_lists + BONDS)->num_intrs;
-
-				/*Delete Three-body list*/
-				Dev_Delete_List( *dev_lists + THREE_BODIES );
-				Delete_List ( *lists + THREE_BODIES );
-
-				fprintf (stderr, "p%d ***** Reallocating the Three-body list threebody.n: %d, bonds.num_intrs: %d, num_thb: %d, thb_entries: %d \n", 
-						system->my_rank, (*dev_lists+THREE_BODIES)->n, (*dev_lists+BONDS)->num_intrs, 
-						system->num_thbodies, (*dev_lists+THREE_BODIES)->num_intrs);
+    } else {
+        //if (((dev_workspace->realloc.num_bonds * DANGER_ZONE) >= (*dev_lists+BONDS)->num_intrs) || 
+        //        (system->num_thbodies > (*dev_lists+THREE_BODIES)->num_intrs )) { 
+        //int size = dev_workspace->realloc.num_bonds;
+        if ((system->num_thbodies >= (*dev_lists+THREE_BODIES)->num_intrs ) || 
+                ((*dev_lists+THREE_BODIES)->n < (*dev_lists+BONDS)->num_intrs) ) {
+
+            int size = (*dev_lists + BONDS)->num_intrs;
+
+            /*Delete Three-body list*/
+            Dev_Delete_List( *dev_lists + THREE_BODIES );
+            Delete_List ( *lists + THREE_BODIES );
+
+            fprintf (stderr, "p%d ***** Reallocating the Three-body list threebody.n: %d, bonds.num_intrs: %d, num_thb: %d, thb_entries: %d \n", 
+                    system->my_rank, (*dev_lists+THREE_BODIES)->n, (*dev_lists+BONDS)->num_intrs, 
+                    system->num_thbodies, (*dev_lists+THREE_BODIES)->num_intrs);
 #ifdef __CUDA_MEM__
-				fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
-						data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
+            fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
+                    data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
 #endif
-				/*Recreate Three-body list */
-				if(!Dev_Make_List(size, total_3body, TYP_THREE_BODY, *dev_lists + THREE_BODIES )) {
-					fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-					MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-				}
-				if(!Make_List(size, total_3body, TYP_THREE_BODY, *lists + THREE_BODIES )) {
-					fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" );
-					MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
-				}
-			}
-		}
-
-		//copy the indexes into the thb list;
-		copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1),
-				cudaMemcpyHostToDevice, "thb:index");
-		copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1),
-				cudaMemcpyHostToDevice, "thb:end_index");
-		//THREE_BODY CHANGES HERE
-
-
-		cuda_memset (spad, 0, ( 6 * sizeof (real) * system->N + sizeof (rvec) * system->N * 2), "scratch");
-		Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
-			( system->d_my_atoms,
-			  system->reax_param.d_gp, 
-			  system->reax_param.d_sbp, system->reax_param.d_thbp, 
-			  (control_params *)control->d_control_params,
-			  *dev_workspace, 
-			  *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
-			  system->n, system->N, system->reax_param.num_atom_types, 
-			  spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Ang
-		k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
-			(spad, spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
-			(spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, BLOCKS_N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Pen
-		k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-			(spad + 2*system->N, spad + 3*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
-			(spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_pen, BLOCKS_N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Coa
-		k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-			(spad + 4*system->N, spad + 5*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
-			(spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_coa, BLOCKS_N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for ext_pres
-		rvec_spad = (rvec *) (spad + 6*system->N);
-		k_reduction_rvec <<<BLOCKS_N, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
-			(rvec_spad, rvec_spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction_rvec <<<1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N >>>
-			(rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
-			(  system->d_my_atoms,
-			   (control_params *)control->d_control_params,
-			   *dev_workspace,
-			   *(*dev_lists + BONDS),
-			   system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		t_elapsed = Get_Timing_Info( t_start );
-		//fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
-		//fprintf (stderr, "Three_Body_Interactions Done... \n");
-
-
-		//5. Torsion Angles Interactions. 
-		t_start = Get_Time( );
-		cuda_memset (spad, 0, ( 4 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2), "scratch");
-		Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>>
-			( system->d_my_atoms,
-			  system->reax_param.d_gp,
-			  system->reax_param.d_fbp,
-			  (control_params *)control->d_control_params,
-			  *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
-			  *dev_workspace,
-			  system->n, system->reax_param.num_atom_types, 
-			  spad, spad + 2*system->n, (rvec *) (spad + 4*system->n));
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Tor
-		k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-			(spad, spad + system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-			(spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, BLOCKS);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Con
-		k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-			(spad + 2*system->n, spad + 3*system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-			(spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_con, BLOCKS);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for ext_pres
-		rvec_spad = (rvec *) (spad + 4*system->n);
-		k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
-			(rvec_spad, rvec_spad + system->n,  system->n);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>>
-			(rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Post process here
-		Cuda_Torsion_Angles_PostProcess   <<< BLOCKS_N, BLOCK_SIZE >>>
-			(  system->d_my_atoms,
-			   *dev_workspace,
-			   *(*dev_lists + BONDS),
-			   system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		t_elapsed = Get_Timing_Info( t_start );
-		//fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
-		//fprintf (stderr, " Four_Body_ Done... \n");
-
-
-		//6. Hydrogen Bonds Interactions.
-		// FIX - 4 - Added additional check here
-		if ((control->hbond_cut > 0) && (system->numH > 0)) {
-
-			t_start = Get_Time( );
-			cuda_memset (spad, 0, ( 2 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2 ), "scratch");
-
-
-			int hbs = ((system->n * HB_KER_THREADS_PER_ATOM)/ HB_BLOCK_SIZE) + 
-				(((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
-			Cuda_Hydrogen_Bonds_MT <<<hbs, HB_BLOCK_SIZE, 
-					       HB_BLOCK_SIZE * (2 * sizeof (real) + 2 * sizeof (rvec)) >>>
-						       //Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE>>>
-						       (  system->d_my_atoms,
-							  system->reax_param.d_sbp,
-							  system->reax_param.d_hbp,
-							  system->reax_param.d_gp,
-							  (control_params *)control->d_control_params,
-							  *dev_workspace,
-							  *(*dev_lists + BONDS), *(*dev_lists + HBONDS),
-							  system->n, system->reax_param.num_atom_types,
-							  spad, (rvec *) (spad + 2*system->n));
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			//Reduction for E_HB
-			k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-				(spad, spad + system->n,  system->n);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-				(spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_hb, BLOCKS);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-
-			//Reduction for ext_pres
-			rvec_spad = (rvec *) (spad + 2*system->n);
-			k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
-				(rvec_spad, rvec_spad + system->n,  system->n);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>>
-				(rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			////post process step1:
-			Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof (rvec) >>>
-				(  system->d_my_atoms,
-				   *dev_workspace,
-				   *(*dev_lists + BONDS),
-				   system->N );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			////post process step2:
-			/*
-			   Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof (rvec)>>>
-			   (  system->d_my_atoms,
-			 *dev_workspace,
-			 *(*dev_lists + HBONDS));
-			 */
-			int hnbrs_bl = ((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM)/ HB_POST_PROC_BLOCK_SIZE) + 
-				(((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
-			Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_bl, HB_POST_PROC_BLOCK_SIZE, 
-						     HB_POST_PROC_BLOCK_SIZE * sizeof (rvec)>>>
-							     (  system->d_my_atoms,
-								*dev_workspace,
-								*(*dev_lists + HBONDS), system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			t_elapsed = Get_Timing_Info( t_start );
-			//fprintf (stderr, "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
-			//fprintf (stderr, "Hydrogen_Bond Done... \n");	
-		}
-
-		return SUCCESS;
-		}
-
-		void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-				simulation_data *data, storage *workspace, 
-				reax_list **lists, output_controls *out_control,
-				mpi_datatypes *mpi_data )
-		{
-			/* van der Waals and Coulomb interactions */
-			Cuda_NonBonded_Energy( system, control, workspace, data,
-					lists, out_control, (control->tabulate == 0) ? false: true);
-		}
+            /*Recreate Three-body list */
+            if(!Dev_Make_List(size, total_3body, TYP_THREE_BODY, *dev_lists + THREE_BODIES )) {
+                fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
+                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+            }
+            if(!Make_List(size, total_3body, TYP_THREE_BODY, *lists + THREE_BODIES )) {
+                fprintf( stderr, "Problem in initializing three-body list on host. Terminating!\n" );
+                MPI_Abort( MPI_COMM_WORLD, INSUFFICIENT_MEMORY );
+            }
+        }
+    }
+
+    //copy the indexes into the thb list;
+    copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1),
+            cudaMemcpyHostToDevice, "thb:index");
+    copy_host_device (thbody, ((*dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((*dev_lists+BONDS)->num_intrs - 1),
+            cudaMemcpyHostToDevice, "thb:end_index");
+    //THREE_BODY CHANGES HERE
+
+
+    cuda_memset (spad, 0, ( 6 * sizeof (real) * system->N + sizeof (rvec) * system->N * 2), "scratch");
+    Cuda_Valence_Angles <<< BLOCKS_N, BLOCK_SIZE >>>
+        ( system->d_my_atoms,
+          system->reax_param.d_gp, 
+          system->reax_param.d_sbp, system->reax_param.d_thbp, 
+          (control_params *)control->d_control_params,
+          *dev_workspace, 
+          *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
+          system->n, system->N, system->reax_param.num_atom_types, 
+          spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Ang
+    k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>  
+        (spad, spad + system->N,  system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
+        (spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ang, BLOCKS_N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Pen
+    k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+        (spad + 2*system->N, spad + 3*system->N,  system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
+        (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_pen, BLOCKS_N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Coa
+    k_reduction <<<BLOCKS_N, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+        (spad + 4*system->N, spad + 5*system->N,  system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N >>>
+        (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_coa, BLOCKS_N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for ext_pres
+    rvec_spad = (rvec *) (spad + 6*system->N);
+    k_reduction_rvec <<<BLOCKS_N, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
+        (rvec_spad, rvec_spad + system->N,  system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction_rvec <<<1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N >>>
+        (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS_N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    Cuda_Valence_Angles_PostProcess <<< BLOCKS_N, BLOCK_SIZE >>>
+        (  system->d_my_atoms,
+           (control_params *)control->d_control_params,
+           *dev_workspace,
+           *(*dev_lists + BONDS),
+           system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    t_elapsed = Get_Timing_Info( t_start );
+    //fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
+    //fprintf (stderr, "Three_Body_Interactions Done... \n");
+
+
+    //5. Torsion Angles Interactions. 
+    t_start = Get_Time( );
+    cuda_memset (spad, 0, ( 4 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2), "scratch");
+    Cuda_Torsion_Angles <<< BLOCKS, BLOCK_SIZE >>>
+        ( system->d_my_atoms,
+          system->reax_param.d_gp,
+          system->reax_param.d_fbp,
+          (control_params *)control->d_control_params,
+          *(*dev_lists + BONDS), *(*dev_lists + THREE_BODIES),
+          *dev_workspace,
+          system->n, system->reax_param.num_atom_types, 
+          spad, spad + 2*system->n, (rvec *) (spad + 4*system->n));
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Tor
+    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+        (spad, spad + system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
+        (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_tor, BLOCKS);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for E_Con
+    k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+        (spad + 2*system->n, spad + 3*system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
+        (spad + 3*system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_con, BLOCKS);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for ext_pres
+    rvec_spad = (rvec *) (spad + 4*system->n);
+    k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
+        (rvec_spad, rvec_spad + system->n,  system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>>
+        (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Post process here
+    Cuda_Torsion_Angles_PostProcess   <<< BLOCKS_N, BLOCK_SIZE >>>
+        (  system->d_my_atoms,
+           *dev_workspace,
+           *(*dev_lists + BONDS),
+           system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    t_elapsed = Get_Timing_Info( t_start );
+    //fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
+    //fprintf (stderr, " Four_Body_ Done... \n");
+
+
+    //6. Hydrogen Bonds Interactions.
+    // FIX - 4 - Added additional check here
+    if ((control->hbond_cut > 0) && (system->numH > 0)) {
+
+        t_start = Get_Time( );
+        cuda_memset (spad, 0, ( 2 * sizeof (real) * system->n + sizeof (rvec) * system->n * 2 ), "scratch");
+
+
+        int hbs = ((system->n * HB_KER_THREADS_PER_ATOM)/ HB_BLOCK_SIZE) + 
+            (((system->n * HB_KER_THREADS_PER_ATOM) % HB_BLOCK_SIZE) == 0 ? 0 : 1);
+        Cuda_Hydrogen_Bonds_MT <<<hbs, HB_BLOCK_SIZE, 
+                               HB_BLOCK_SIZE * (2 * sizeof (real) + 2 * sizeof (rvec)) >>>
+                                   //Cuda_Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE>>>
+                                   (  system->d_my_atoms,
+                                      system->reax_param.d_sbp,
+                                      system->reax_param.d_hbp,
+                                      system->reax_param.d_gp,
+                                      (control_params *)control->d_control_params,
+                                      *dev_workspace,
+                                      *(*dev_lists + BONDS), *(*dev_lists + HBONDS),
+                                      system->n, system->reax_param.num_atom_types,
+                                      spad, (rvec *) (spad + 2*system->n));
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_HB
+        k_reduction <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+            (spad, spad + system->n,  system->n);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
+            (spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_hb, BLOCKS);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+
+        //Reduction for ext_pres
+        rvec_spad = (rvec *) (spad + 2*system->n);
+        k_reduction_rvec <<<BLOCKS, BLOCK_SIZE, sizeof (rvec) * BLOCK_SIZE >>>
+            (rvec_spad, rvec_spad + system->n,  system->n);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        k_reduction_rvec <<<1, BLOCKS_POW_2, sizeof (rvec) * BLOCKS_POW_2 >>>
+            (rvec_spad + system->n, &((simulation_data *)data->d_simulation_data)->my_ext_press, BLOCKS);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        ////post process step1:
+        Cuda_Hydrogen_Bonds_PostProcess <<< BLOCKS_N, BLOCK_SIZE, BLOCK_SIZE * sizeof (rvec) >>>
+            (  system->d_my_atoms,
+               *dev_workspace,
+               *(*dev_lists + BONDS),
+               system->N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        ////post process step2:
+        /*
+           Cuda_Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * sizeof (rvec)>>>
+           (  system->d_my_atoms,
+         *dev_workspace,
+         *(*dev_lists + HBONDS));
+         */
+        int hnbrs_bl = ((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM)/ HB_POST_PROC_BLOCK_SIZE) + 
+            (((system->N * HB_POST_PROC_KER_THREADS_PER_ATOM) % HB_POST_PROC_BLOCK_SIZE) == 0 ? 0 : 1);
+        Cuda_Hydrogen_Bonds_HNbrs_BL <<< hnbrs_bl, HB_POST_PROC_BLOCK_SIZE, 
+                                     HB_POST_PROC_BLOCK_SIZE * sizeof (rvec)>>>
+                                         (  system->d_my_atoms,
+                                            *dev_workspace,
+                                            *(*dev_lists + HBONDS), system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        t_elapsed = Get_Timing_Info( t_start );
+        //fprintf (stderr, "Hydrogen bonds return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
+        //fprintf (stderr, "Hydrogen_Bond Done... \n");    
+    }
+
+    return SUCCESS;
+}
+
 
+void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
+        simulation_data *data, storage *workspace, 
+        reax_list **lists, output_controls *out_control,
+        mpi_datatypes *mpi_data )
+{
+    /* van der Waals and Coulomb interactions */
+    Cuda_NonBonded_Energy( system, control, workspace, data,
+            lists, out_control, (control->tabulate == 0) ? false: true);
+}
diff --git a/PG-PuReMD/src/cuda_hydrogen_bonds.cu b/PG-PuReMD/src/cuda_hydrogen_bonds.cu
index db34b1b8..358c5073 100644
--- a/PG-PuReMD/src/cuda_hydrogen_bonds.cu
+++ b/PG-PuReMD/src/cuda_hydrogen_bonds.cu
@@ -32,731 +32,731 @@
 
 
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds( reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		hbond_parameters *d_hbp,
-		global_parameters gp, 
-		control_params *control, 
-		storage p_workspace, 
-		reax_list p_bonds, 
-		reax_list p_hbonds, 
-		int n, 
-		int num_atom_types, 
-		real *data_e_hb, 
-		rvec *data_ext_press)
+        single_body_parameters *sbp, 
+        hbond_parameters *d_hbp,
+        global_parameters gp, 
+        control_params *control, 
+        storage p_workspace, 
+        reax_list p_bonds, 
+        reax_list p_hbonds, 
+        int n, 
+        int num_atom_types, 
+        real *data_e_hb, 
+        rvec *data_ext_press)
 {
-	int  i, j, k, pi, pk;
-	int  type_i, type_j, type_k;
-	int  start_j, end_j, hb_start_j, hb_end_j;
-	int  hblist[MAX_BONDS];
-	int  itr, top;
-	int  num_hb_intrs = 0;
-	ivec rel_jk;
-	real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-	real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-	rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-	rvec dvec_jk, force, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-	hbond_parameters *hbp;
-	bond_order_data *bo_ij;
-	bond_data *pbond_ij;
-	far_neighbor_data *nbr_jk;
-	reax_list *bonds, *hbonds;
-	bond_data *bond_list;
-	hbond_data *hbond_list, *hbond_jk;
-	storage *workspace = &( p_workspace );
-
-	bonds = &( p_bonds );
-	bond_list = bonds->select.bond_list;
-	hbonds = & ( p_hbonds );
-	hbond_list = hbonds->select.hbond_list;
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= n) return;
-
-	/* loops below discover the Hydrogen bonds between i-j-k triplets.
-	   here j is H atom and there has to be some bond between i and j.
-	   Hydrogen bond is between j and k.
-	   so in this function i->X, j->H, k->Z when we map 
-	   variables onto the ones in the handout.*/
-	//for( j = 0; j < system->n; ++j )
-	/* j has to be of type H */
-	if( sbp[ my_atoms[j].type ].p_hbond == 1 ) {
-		/*set j's variables */
-		type_j     = my_atoms[j].type;
-		start_j    = Dev_Start_Index(j, bonds);
-		end_j      = Dev_End_Index(j, bonds);
-		hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds );
-		hb_end_j   = Dev_End_Index( my_atoms[j].Hindex, hbonds );
-
-		top = 0;
-		for( pi = start_j; pi < end_j; ++pi )  {
-			pbond_ij = &( bond_list[pi] );
-			i = pbond_ij->nbr;
-			bo_ij = &(pbond_ij->bo_data);
-			type_i = my_atoms[i].type;
-
-			if( sbp[type_i].p_hbond == 2 && 
-					bo_ij->BO >= HB_THRESHOLD )
-				hblist[top++] = pi;
-		}
-
-		// fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-		//          j, top, hb_start_j, hb_end_j );
-
-		for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
-			/* set k's varibles */
-			k = hbond_list[pk].nbr;
-			type_k = my_atoms[k].type;
-			nbr_jk = hbond_list[pk].ptr;
-			r_jk = nbr_jk->d;
-			rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-
-			hbond_jk = &( hbond_list [pk] );
-			rvec_MakeZero (hbond_jk->hb_f);
-
-			for( itr = 0; itr < top; ++itr ) {
-				pi = hblist[itr];
-				pbond_ij = &( bonds->select.bond_list[pi] );
-				i = pbond_ij->nbr;
-
-				if( my_atoms[i].orig_id != my_atoms[k].orig_id ) {
-					bo_ij = &(pbond_ij->bo_data);
-					type_i = my_atoms[i].type;
-					r_ij = pbond_ij->d;	     
-					hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
-					++num_hb_intrs;
-
-					Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-							&theta, &cos_theta );
-					/* the derivative of cos(theta) */
-					Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-							&dcos_theta_di, &dcos_theta_dj, 
-							&dcos_theta_dk );
-
-					/* hyrogen bond energy*/
-					sin_theta2 = SIN( theta/2.0 );
-					sin_xhz4 = SQR(sin_theta2);
-					sin_xhz4 *= sin_xhz4;
-					cos_xhz1 = ( 1.0 - cos_theta );
-					exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-					exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-								r_jk / hbp->r0_hb - 2.0 ) );
-
-					//data_e_hb [j] += 
-					e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-					data_e_hb [j] += e_hb;
-
-					CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
-					CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-					CEhb3 = -hbp->p_hb3 * 
-						(-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
-
-					/*fprintf( stdout, 
-					  "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
-					  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-					  system->my_atoms[k].orig_id, 
-					  r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, 
-					  exp_hb3, sin_xhz4, e_hb ); */
-
-					/* hydrogen bond forces */
-					bo_ij->Cdbo += CEhb1; // dbo term
-
-					if( control->virial == 0 ) {
-						// dcos terms
-						//rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
-						//atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di );
-						rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
-
-						rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
-
-						//rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
-						//atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
-						rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
-
-						// dr terms
-						rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
-
-						//rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
-						//atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
-						rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk );
-					}
-					else {
-						/* for pressure coupling, terms that are not related to bond order
-						   derivatives are added directly into pressure vector/tensor */
-						rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-						//rvec_Add( workspace->f[i], force );
-						rvec_Add( pbond_ij->hb_f, force );
-						rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-						rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
-
-						rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
-
-						ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-						rvec_Scale( force, +CEhb2, dcos_theta_dk );
-						//rvec_Add( workspace->f[k], force );
-						rvec_Add( hbond_jk->hb_f, force );
-						rvec_iMultiply( ext_press, rel_jk, force );
-						rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
-						// dr terms
-						rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
-
-						rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-						//rvec_Add( workspace->f[k], force );
-						rvec_Add( hbond_jk->hb_f, force );
-						rvec_iMultiply( ext_press, rel_jk, force );
-						rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
-					}
+    int  i, j, k, pi, pk;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j, hb_start_j, hb_end_j;
+    int  hblist[MAX_BONDS];
+    int  itr, top;
+    int  num_hb_intrs = 0;
+    ivec rel_jk;
+    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk, force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    reax_list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list, *hbond_jk;
+    storage *workspace = &( p_workspace );
+
+    bonds = &( p_bonds );
+    bond_list = bonds->select.bond_list;
+    hbonds = & ( p_hbonds );
+    hbond_list = hbonds->select.hbond_list;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= n) return;
+
+    /* The loops below discover the hydrogen bonds among i-j-k triplets.
+       Here j is the H atom and there has to be some bond between i and j;
+       the hydrogen bond itself is between j and k.
+       So in this function i->X, j->H, k->Z when we map
+       variables onto the ones in the handout. */
+    //for( j = 0; j < system->n; ++j )
+    /* j has to be of type H */
+    if( sbp[ my_atoms[j].type ].p_hbond == 1 ) {
+        /*set j's variables */
+        type_j     = my_atoms[j].type;
+        start_j    = Dev_Start_Index(j, bonds);
+        end_j      = Dev_End_Index(j, bonds);
+        hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds );
+        hb_end_j   = Dev_End_Index( my_atoms[j].Hindex, hbonds );
+
+        top = 0;
+        for( pi = start_j; pi < end_j; ++pi )  {
+            pbond_ij = &( bond_list[pi] );
+            i = pbond_ij->nbr;
+            bo_ij = &(pbond_ij->bo_data);
+            type_i = my_atoms[i].type;
+
+            if( sbp[type_i].p_hbond == 2 && 
+                    bo_ij->BO >= HB_THRESHOLD )
+                hblist[top++] = pi;
+        }
+
+        // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+        //          j, top, hb_start_j, hb_end_j );
+
+        for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+            /* set k's variables */
+            k = hbond_list[pk].nbr;
+            type_k = my_atoms[k].type;
+            nbr_jk = hbond_list[pk].ptr;
+            r_jk = nbr_jk->d;
+            rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+
+            hbond_jk = &( hbond_list [pk] );
+            rvec_MakeZero (hbond_jk->hb_f);
+
+            for( itr = 0; itr < top; ++itr ) {
+                pi = hblist[itr];
+                pbond_ij = &( bonds->select.bond_list[pi] );
+                i = pbond_ij->nbr;
+
+                if( my_atoms[i].orig_id != my_atoms[k].orig_id ) {
+                    bo_ij = &(pbond_ij->bo_data);
+                    type_i = my_atoms[i].type;
+                    r_ij = pbond_ij->d;         
+                    hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
+                    ++num_hb_intrs;
+
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &theta, &cos_theta );
+                    /* the derivative of cos(theta) */
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &dcos_theta_di, &dcos_theta_dj, 
+                            &dcos_theta_dk );
+
+                    /* hydrogen bond energy */
+                    sin_theta2 = SIN( theta/2.0 );
+                    sin_xhz4 = SQR(sin_theta2);
+                    sin_xhz4 *= sin_xhz4;
+                    cos_xhz1 = ( 1.0 - cos_theta );
+                    exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                    exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                r_jk / hbp->r0_hb - 2.0 ) );
+
+                    //data_e_hb [j] += 
+                    e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                    data_e_hb [j] += e_hb;
+
+                    CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
+                    CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                    CEhb3 = -hbp->p_hb3 * 
+                        (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
+
+                    /*fprintf( stdout, 
+                      "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
+                      system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                      system->my_atoms[k].orig_id, 
+                      r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, 
+                      exp_hb3, sin_xhz4, e_hb ); */
+
+                    /* hydrogen bond forces */
+                    bo_ij->Cdbo += CEhb1; // dbo term
+
+                    if( control->virial == 0 ) {
+                        // dcos terms
+                        //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
+                        //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di );
+                        rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
+
+                        rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
+
+                        //rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
+                        //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
+                        rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
+
+                        // dr terms
+                        rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
+
+                        //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
+                        //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk );
+                    }
+                    else {
+                        /* for pressure coupling, terms that are not related to bond order
+                           derivatives are added directly into pressure vector/tensor */
+                        rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                        //rvec_Add( workspace->f[i], force );
+                        rvec_Add( pbond_ij->hb_f, force );
+                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                        rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
+
+                        rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
+
+                        ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                        rvec_Scale( force, +CEhb2, dcos_theta_dk );
+                        //rvec_Add( workspace->f[k], force );
+                        rvec_Add( hbond_jk->hb_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        // dr terms
+                        rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
+
+                        rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                        //rvec_Add( workspace->f[k], force );
+                        rvec_Add( hbond_jk->hb_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                    }
 
 #ifdef TEST_ENERGY
-					/* fprintf( out_control->ehb, 
-					   "%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n",
-					   dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
-					   dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
-					   dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
-					   fprintf( out_control->ehb, "%24.15e%24.15e%24.15e\n",
-					   CEhb1, CEhb2, CEhb3 ); */
-					fprintf( out_control->ehb, 
-							//"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-							"%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-							system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-							system->my_atoms[k].orig_id, 
-							r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb );       
+                    /* fprintf( out_control->ehb, 
+                       "%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n%24.15e%24.15e%24.15e\n",
+                       dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
+                       dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
+                       dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
+                       fprintf( out_control->ehb, "%24.15e%24.15e%24.15e\n",
+                       CEhb1, CEhb2, CEhb3 ); */
+                    fprintf( out_control->ehb, 
+                            //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                            "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                            system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                            system->my_atoms[k].orig_id, 
+                            r_jk, theta, bo_ij->BO, e_hb, data->my_en.e_hb );       
 #endif
 #ifdef TEST_FORCES
-					Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term
-					// dcos terms
-					rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di );
-					rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
-					rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
-					// dr terms
-					rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); 
-					rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
+                    Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb ); //dbo term
+                    // dcos terms
+                    rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di );
+                    rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
+                    rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
+                    // dr terms
+                    rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk ); 
+                    rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
 #endif
-				}
-			}
-		}
-	}
+                }
+            }
+        }
+    }
 }
 
 
 
 //CUDA_GLOBAL void __launch_bounds__ (256, 4) Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms, 
 CUDA_GLOBAL void Cuda_Hydrogen_Bonds_MT ( reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		hbond_parameters *d_hbp,
-		global_parameters gp, 
-		control_params *control, 
-		storage p_workspace, 
-		reax_list p_bonds, 
-		reax_list p_hbonds, 
-		int n, 
-		int num_atom_types, 
-		real *data_e_hb, 
-		rvec *data_ext_press)
+        single_body_parameters *sbp, 
+        hbond_parameters *d_hbp,
+        global_parameters gp, 
+        control_params *control, 
+        storage p_workspace, 
+        reax_list p_bonds, 
+        reax_list p_hbonds, 
+        int n, 
+        int num_atom_types, 
+        real *data_e_hb, 
+        rvec *data_ext_press)
 {
 
 #if defined( __SM_35__)
-	real sh_hb;
-	real sh_cdbo;
-	rvec sh_atomf;
-	rvec sh_hf;
+    real sh_hb;
+    real sh_cdbo;
+    rvec sh_atomf;
+    rvec sh_hf;
 #else
 
-	extern __shared__ real t_hb[];
-	extern __shared__ rvec t__f[];
-	extern __shared__ rvec t_cdbo[];
-	extern __shared__ rvec t_hf [];
+    extern __shared__ real t_hb[];
+    extern __shared__ rvec t__f[];
+    extern __shared__ rvec t_cdbo[];
+    extern __shared__ rvec t_hf [];
 
-	real *sh_hb = t_hb;
-	real *sh_cdbo = t_hb + blockDim.x;
-	rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
-	rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
+    real *sh_hb = t_hb;
+    real *sh_cdbo = t_hb + blockDim.x;
+    rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
+    rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
 #endif
 
-	int __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM;
-
-	int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-	int warp_id = thread_id / __THREADS_PER_ATOM__;
-	int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-
-	if (warp_id >= n ) return;
-
-	int  i, j, k, pi, pk;
-	int  type_i, type_j, type_k;
-	int  start_j, end_j, hb_start_j, hb_end_j;
-	int  hblist[MAX_BONDS];
-	int  itr, top;
-	int  num_hb_intrs = 0;
-	ivec rel_jk;
-	real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-	real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-	rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-	rvec dvec_jk, force, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-	hbond_parameters *hbp;
-	bond_order_data *bo_ij;
-	bond_data *pbond_ij;
-	far_neighbor_data *nbr_jk;
-	reax_list *bonds, *hbonds;
-	bond_data *bond_list;
-	hbond_data *hbond_list, *hbond_jk;
-	storage *workspace = &( p_workspace );
-
-	bonds = &( p_bonds );
-	bond_list = bonds->select.bond_list;
-	hbonds = & ( p_hbonds );
-	hbond_list = hbonds->select.hbond_list;
-
-	/*
-	   j = blockIdx.x * blockDim.x + threadIdx.x;
-	   if (j >= n) return;
-	 */
-	j = warp_id;
-
-	/* loops below discover the Hydrogen bonds between i-j-k triplets.
-	   here j is H atom and there has to be some bond between i and j.
-	   Hydrogen bond is between j and k.
-	   so in this function i->X, j->H, k->Z when we map 
-	   variables onto the ones in the handout.*/
-	//for( j = 0; j < system->n; ++j )
+    int __THREADS_PER_ATOM__ = HB_KER_THREADS_PER_ATOM;
+
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id = thread_id / __THREADS_PER_ATOM__;
+    int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
+
+    if (warp_id >= n ) return;
+
+    int  i, j, k, pi, pk;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j, hb_start_j, hb_end_j;
+    int  hblist[MAX_BONDS];
+    int  itr, top;
+    int  num_hb_intrs = 0;
+    ivec rel_jk;
+    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk, force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    reax_list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list, *hbond_jk;
+    storage *workspace = &( p_workspace );
+
+    bonds = &( p_bonds );
+    bond_list = bonds->select.bond_list;
+    hbonds = & ( p_hbonds );
+    hbond_list = hbonds->select.hbond_list;
+
+    /*
+       j = blockIdx.x * blockDim.x + threadIdx.x;
+       if (j >= n) return;
+     */
+    j = warp_id;
+
+    /* loops below discover the Hydrogen bonds between i-j-k triplets.
+       here j is H atom and there has to be some bond between i and j.
+       Hydrogen bond is between j and k.
+       so in this function i->X, j->H, k->Z when we map 
+       variables onto the ones in the handout.*/
+    //for( j = 0; j < system->n; ++j )
 
 #if defined( __SM_35__)
-	sh_hb  = 0;
-	rvec_MakeZero ( sh_atomf );
+    sh_hb  = 0;
+    rvec_MakeZero ( sh_atomf );
 #else
-	sh_hb [threadIdx.x] = 0;
-	rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+    sh_hb [threadIdx.x] = 0;
+    rvec_MakeZero ( sh_atomf[ threadIdx.x] );
 #endif
 
-	/* j has to be of type H */
-	if( sbp[ my_atoms[j].type ].p_hbond == 1 ) {
-		/*set j's variables */
-		type_j     = my_atoms[j].type;
-		start_j    = Dev_Start_Index(j, bonds);
-		end_j      = Dev_End_Index(j, bonds);
-		hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds );
-		hb_end_j   = Dev_End_Index( my_atoms[j].Hindex, hbonds );
-
-		top = 0;
-		for( pi = start_j; pi < end_j; ++pi )  {
-			pbond_ij = &( bond_list[pi] );
-			i = pbond_ij->nbr;
-			bo_ij = &(pbond_ij->bo_data);
-			type_i = my_atoms[i].type;
-
-			if( sbp[type_i].p_hbond == 2 && 
-					bo_ij->BO >= HB_THRESHOLD )
-				hblist[top++] = pi;
-		}
-
-		// fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-		//          j, top, hb_start_j, hb_end_j );
-
-		for( itr = 0; itr < top; ++itr ) {
-			pi = hblist[itr];
-			pbond_ij = &( bonds->select.bond_list[pi] );
-			i = pbond_ij->nbr;
+    /* j has to be of type H */
+    if( sbp[ my_atoms[j].type ].p_hbond == 1 ) {
+        /*set j's variables */
+        type_j     = my_atoms[j].type;
+        start_j    = Dev_Start_Index(j, bonds);
+        end_j      = Dev_End_Index(j, bonds);
+        hb_start_j = Dev_Start_Index( my_atoms[j].Hindex, hbonds );
+        hb_end_j   = Dev_End_Index( my_atoms[j].Hindex, hbonds );
+
+        top = 0;
+        for( pi = start_j; pi < end_j; ++pi )  {
+            pbond_ij = &( bond_list[pi] );
+            i = pbond_ij->nbr;
+            bo_ij = &(pbond_ij->bo_data);
+            type_i = my_atoms[i].type;
+
+            if( sbp[type_i].p_hbond == 2 && 
+                    bo_ij->BO >= HB_THRESHOLD )
+                hblist[top++] = pi;
+        }
+
+        // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+        //          j, top, hb_start_j, hb_end_j );
+
+        for( itr = 0; itr < top; ++itr ) {
+            pi = hblist[itr];
+            pbond_ij = &( bonds->select.bond_list[pi] );
+            i = pbond_ij->nbr;
 
 #if defined( __SM_35__)
-			rvec_MakeZero (sh_hf );
-			sh_cdbo  = 0;
+            rvec_MakeZero (sh_hf );
+            sh_cdbo  = 0;
 #else
-			rvec_MakeZero (sh_hf [threadIdx.x]);
-			sh_cdbo [threadIdx.x] = 0;
+            rvec_MakeZero (sh_hf [threadIdx.x]);
+            sh_cdbo [threadIdx.x] = 0;
 #endif
 
 
-			//for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
-			int loopcount = (hb_end_j - hb_start_j) / HB_KER_THREADS_PER_ATOM + 
-				(((hb_end_j - hb_start_j) % HB_KER_THREADS_PER_ATOM == 0) ? 0 : 1);
-
-			int count = 0;
-			pk = hb_start_j + lane_id;
-			while (count < loopcount)
-			{
-
-				if (pk < hb_end_j)
-				{
-					hbond_jk = &( hbond_list [pk] );
-
-					/* set k's varibles */
-					k = hbond_list[pk].nbr;
-					type_k = my_atoms[k].type;
-					nbr_jk = hbond_list[pk].ptr;
-					r_jk = nbr_jk->d;
-					rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-				}
-				else k = -1;
-
-
-				if( (my_atoms[i].orig_id != my_atoms[k].orig_id)
-						&& (k != -1) ) {
-
-					bo_ij = &(pbond_ij->bo_data);
-					type_i = my_atoms[i].type;
-					r_ij = pbond_ij->d;	     
-					hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
-					++num_hb_intrs;
-
-					Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-							&theta, &cos_theta );
-					/* the derivative of cos(theta) */
-					Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-							&dcos_theta_di, &dcos_theta_dj, 
-							&dcos_theta_dk );
-
-					/* hyrogen bond energy*/
-					sin_theta2 = SIN( theta/2.0 );
-					sin_xhz4 = SQR(sin_theta2);
-					sin_xhz4 *= sin_xhz4;
-					cos_xhz1 = ( 1.0 - cos_theta );
-					exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-					exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-								r_jk / hbp->r0_hb - 2.0 ) );
-
-					//data_e_hb [j] += 
-					e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-					//data_e_hb [j] += e_hb;
+            //for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+            int loopcount = (hb_end_j - hb_start_j) / HB_KER_THREADS_PER_ATOM + 
+                (((hb_end_j - hb_start_j) % HB_KER_THREADS_PER_ATOM == 0) ? 0 : 1);
+
+            int count = 0;
+            pk = hb_start_j + lane_id;
+            while (count < loopcount)
+            {
+
+                if (pk < hb_end_j)
+                {
+                    hbond_jk = &( hbond_list [pk] );
+
+                    /* set k's variables */
+                    k = hbond_list[pk].nbr;
+                    type_k = my_atoms[k].type;
+                    nbr_jk = hbond_list[pk].ptr;
+                    r_jk = nbr_jk->d;
+                    rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+                }
+                else k = -1;
+
+
+                if( (my_atoms[i].orig_id != my_atoms[k].orig_id)
+                        && (k != -1) ) {
+
+                    bo_ij = &(pbond_ij->bo_data);
+                    type_i = my_atoms[i].type;
+                    r_ij = pbond_ij->d;         
+                    hbp = &(d_hbp[ index_hbp (type_i,type_j,type_k,num_atom_types) ]);
+                    ++num_hb_intrs;
+
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &theta, &cos_theta );
+                    /* the derivative of cos(theta) */
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &dcos_theta_di, &dcos_theta_dj, 
+                            &dcos_theta_dk );
+
+                    /* hydrogen bond energy */
+                    sin_theta2 = SIN( theta/2.0 );
+                    sin_xhz4 = SQR(sin_theta2);
+                    sin_xhz4 *= sin_xhz4;
+                    cos_xhz1 = ( 1.0 - cos_theta );
+                    exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                    exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                r_jk / hbp->r0_hb - 2.0 ) );
+
+                    //data_e_hb [j] += 
+                    e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                    //data_e_hb [j] += e_hb;
 
 #if defined( __SM_35__)
-					sh_hb += e_hb;
+                    sh_hb += e_hb;
 #else
-					sh_hb [threadIdx.x] += e_hb;
+                    sh_hb [threadIdx.x] += e_hb;
 #endif
 
-					CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
-					CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-					CEhb3 = -hbp->p_hb3 * 
-						(-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
+                    CEhb1 = hbp->p_hb1 * hbp->p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
+                    CEhb2 = -hbp->p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                    CEhb3 = -hbp->p_hb3 * 
+                        (-hbp->r0_hb / SQR(r_jk) + 1.0 / hbp->r0_hb) * e_hb;
 
-					/*fprintf( stdout, 
-					  "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
-					  system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-					  system->my_atoms[k].orig_id, 
-					  r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, 
-					  exp_hb3, sin_xhz4, e_hb ); */
+                    /*fprintf( stdout, 
+                      "%6d%6d%6d%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
+                      system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                      system->my_atoms[k].orig_id, 
+                      r_jk, theta, hbp->p_hb1, exp_hb2, hbp->p_hb3, hbp->r0_hb, 
+                      exp_hb3, sin_xhz4, e_hb ); */
 
-					/* hydrogen bond forces */
-					//	    bo_ij->Cdbo += CEhb1; // dbo term
+                    /* hydrogen bond forces */
+                    //        bo_ij->Cdbo += CEhb1; // dbo term
 #if defined( __SM_35__)
-					sh_cdbo += CEhb1;
+                    sh_cdbo += CEhb1;
 #else
-					sh_cdbo[threadIdx.x] += CEhb1;
+                    sh_cdbo[threadIdx.x] += CEhb1;
 #endif
 
-					if( control->virial == 0 ) {
-						// dcos terms
-						//rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
-						//atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di );
-						//rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
+                    if( control->virial == 0 ) {
+                        // dcos terms
+                        //rvec_ScaledAdd( workspace->f[i], +CEhb2, dcos_theta_di ); 
+                        //atomic_rvecScaledAdd (workspace->f[i], +CEhb2, dcos_theta_di );
+                        //rvec_ScaledAdd( pbond_ij->hb_f, +CEhb2, dcos_theta_di ); 
 #if defined( __SM_35__)
-						rvec_ScaledAdd( sh_hf , +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( sh_hf , +CEhb2, dcos_theta_di ); 
 #else
-						rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); 
 #endif
 
-						//rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
+                        //rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
 #if defined( __SM_35__)
-						rvec_ScaledAdd( sh_atomf , +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( sh_atomf , +CEhb2, dcos_theta_dj );
 #else
-						rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
 #endif
 
-						//rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
-						//atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
-						rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
+                        //rvec_ScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
+                        //atomic_rvecScaledAdd( workspace->f[k], +CEhb2, dcos_theta_dk );
+                        rvec_ScaledAdd( hbond_jk->hb_f, +CEhb2, dcos_theta_dk );
 
-						// dr terms
-						//rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
+                        // dr terms
+                        //rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
 #if defined( __SM_35__)
-						rvec_ScaledAdd( sh_atomf , -CEhb3/r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( sh_atomf , -CEhb3/r_jk, dvec_jk ); 
 #else
-						rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); 
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk ); 
 #endif
 
-						//rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
-						//atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
-						rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk );
-					}
-					else {
-						/* for pressure coupling, terms that are not related to bond order
-						   derivatives are added directly into pressure vector/tensor */
-						rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-						//rvec_Add( workspace->f[i], force );
-						rvec_Add( pbond_ij->hb_f, force );
-						rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-						rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
-
-						rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
-
-						ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-						rvec_Scale( force, +CEhb2, dcos_theta_dk );
-						//rvec_Add( workspace->f[k], force );
-						rvec_Add( hbond_jk->hb_f, force );
-						rvec_iMultiply( ext_press, rel_jk, force );
-						rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
-						// dr terms
-						rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
-
-						rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-						//rvec_Add( workspace->f[k], force );
-						rvec_Add( hbond_jk->hb_f, force );
-						rvec_iMultiply( ext_press, rel_jk, force );
-						rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
-					}
-
-				} //orid id end
-
-				pk += __THREADS_PER_ATOM__;
-				count ++;
-
-			} //for itr loop end
-
-			//Reduction here
+                        //rvec_ScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
+                        //atomic_rvecScaledAdd( workspace->f[k], +CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( hbond_jk->hb_f, +CEhb3/r_jk, dvec_jk );
+                    }
+                    else {
+                        /* for pressure coupling, terms that are not related to bond order
+                           derivatives are added directly into pressure vector/tensor */
+                        rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                        //rvec_Add( workspace->f[i], force );
+                        rvec_Add( pbond_ij->hb_f, force );
+                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                        rvec_ScaledAdd( data_ext_press [j], 1.0, ext_press );
+
+                        rvec_ScaledAdd( workspace->f[j], +CEhb2, dcos_theta_dj );
+
+                        ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                        rvec_Scale( force, +CEhb2, dcos_theta_dk );
+                        //rvec_Add( workspace->f[k], force );
+                        rvec_Add( hbond_jk->hb_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                        // dr terms
+                        rvec_ScaledAdd( workspace->f[j], -CEhb3/r_jk, dvec_jk ); 
+
+                        rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                        //rvec_Add( workspace->f[k], force );
+                        rvec_Add( hbond_jk->hb_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        rvec_ScaledAdd( data_ext_press[j], 1.0, ext_press );
+                    }
+
+                } //orig_id check end
+
+                pk += __THREADS_PER_ATOM__;
+                count ++;
+
+            } //while (count < loopcount) loop end
+
+            //Reduction here
 #if defined( __SM_35__)
-			for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
-				sh_cdbo += shfl( sh_cdbo, s);
-				sh_hf[0] += shfl( sh_hf[0], s);
-				sh_hf[1] += shfl( sh_hf[1], s);
-				sh_hf[2] += shfl( sh_hf[2], s);
-			}
-			//end of the shuffle
-			if (lane_id == 0) {
-				bo_ij->Cdbo += sh_cdbo ;
-				rvec_Add (pbond_ij->hb_f, sh_hf );
-			}
+            for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
+                sh_cdbo += shfl( sh_cdbo, s);
+                sh_hf[0] += shfl( sh_hf[0], s);
+                sh_hf[1] += shfl( sh_hf[1], s);
+                sh_hf[2] += shfl( sh_hf[2], s);
+            }
+            //end of the shuffle
+            if (lane_id == 0) {
+                bo_ij->Cdbo += sh_cdbo ;
+                rvec_Add (pbond_ij->hb_f, sh_hf );
+            }
 #else
-			if (lane_id < 16) {
-				sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
-				rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
-			}
-			if (lane_id < 8) {
-				sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
-				rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
-			}
-			if (lane_id < 4) {
-				sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
-				rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
-			}
-			if (lane_id < 2) {
-				sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
-				rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
-			}
-			if (lane_id < 1) {
-				sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-				rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
-
-				bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-				rvec_Add (pbond_ij->hb_f, sh_hf [threadIdx.x]);
-			}
+            if (lane_id < 16) {
+                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
+                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
+            }
+            if (lane_id < 8) {
+                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
+                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
+            }
+            if (lane_id < 4) {
+                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
+                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
+            }
+            if (lane_id < 2) {
+                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
+                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
+            }
+            if (lane_id < 1) {
+                sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
+                rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
+
+                bo_ij->Cdbo += sh_cdbo [threadIdx.x];
+                rvec_Add (pbond_ij->hb_f, sh_hf [threadIdx.x]);
+            }
 #endif
 
 
-		} // for loop hbonds end
-		} //if Hbond check end
+        } // for loop hbonds end
+        } //if Hbond check end
 
 #if defined( __SM_35__)
-		for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
-			sh_hb += shfl( sh_hb, s);
-			sh_atomf[0] += shfl( sh_atomf[0], s);
-			sh_atomf[1] += shfl( sh_atomf[1], s);
-			sh_atomf[2] += shfl( sh_atomf[2], s);
-		}
-		if (lane_id == 0){
-			data_e_hb[j] += sh_hb;
-			rvec_Add (workspace->f[j], sh_atomf);
-		}
+        for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
+            sh_hb += shfl( sh_hb, s);
+            sh_atomf[0] += shfl( sh_atomf[0], s);
+            sh_atomf[1] += shfl( sh_atomf[1], s);
+            sh_atomf[2] += shfl( sh_atomf[2], s);
+        }
+        if (lane_id == 0){
+            data_e_hb[j] += sh_hb;
+            rvec_Add (workspace->f[j], sh_atomf);
+        }
 
 
 #else
-		if (lane_id < 16){
-			sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-			rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-		}
-		if (lane_id < 8){
-			sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-			rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-		}
-		if (lane_id < 4){
-			sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-			rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-		}
-		if (lane_id < 2){
-			sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-			rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-		}
-		if (lane_id < 1){
-			sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-			rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-
-			data_e_hb[j] += sh_hb [threadIdx.x];
-			rvec_Add (workspace->f[j], sh_atomf [threadIdx.x]);
-		}
+        if (lane_id < 16){
+            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
+            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
+        }
+        if (lane_id < 8){
+            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
+            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
+        }
+        if (lane_id < 4){
+            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
+            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
+        }
+        if (lane_id < 2){
+            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
+            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
+        }
+        if (lane_id < 1){
+            sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
+            rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
+
+            data_e_hb[j] += sh_hb [threadIdx.x];
+            rvec_Add (workspace->f[j], sh_atomf [threadIdx.x]);
+        }
 #endif
 
-	}
+    }
 
 
 
 
 
-	CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess (  reax_atom *atoms,
-			storage p_workspace,
-			reax_list p_bonds, int N)
-	{
-		int i, pj;
+    CUDA_GLOBAL void Cuda_Hydrogen_Bonds_PostProcess (  reax_atom *atoms,
+            storage p_workspace,
+            reax_list p_bonds, int N)
+    {
+        int i, pj;
 
-		storage *workspace = &( p_workspace );
-		bond_data *pbond;
-		bond_data *sym_index_bond;
-		reax_list *bonds = &p_bonds;
+        storage *workspace = &( p_workspace );
+        bond_data *pbond;
+        bond_data *sym_index_bond;
+        reax_list *bonds = &p_bonds;
 
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if ( i >= N) return;
+        i = blockIdx.x * blockDim.x + threadIdx.x;
+        if ( i >= N) return;
 
-		for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+        for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
 
-			pbond = &(bonds->select.bond_list[pj]);
-			sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+            pbond = &(bonds->select.bond_list[pj]);
+            sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
 
-			//rvec_Add (atoms[i].f, sym_index_bond->hb_f );
-			rvec_Add (workspace->f[i], sym_index_bond->hb_f );
-		}
-	}
+            //rvec_Add (atoms[i].f, sym_index_bond->hb_f );
+            rvec_Add (workspace->f[i], sym_index_bond->hb_f );
+        }
+    }
 
-	CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *atoms,
-			storage p_workspace,
-			reax_list p_hbonds )
-	{
+    CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs (  reax_atom *atoms,
+            storage p_workspace,
+            reax_list p_hbonds )
+    {
 
 #if defined(__SM_35__)
-		rvec __f;
+        rvec __f;
 #else
-		extern __shared__ rvec __f[];
+        extern __shared__ rvec __f[];
 #endif
-		int i, pj,j;
-		int start, end;
+        int i, pj,j;
+        int start, end;
 
-		storage *workspace = &( p_workspace );
-		hbond_data *nbr_pj, *sym_index_nbr;
-		reax_list *hbonds = &p_hbonds;
+        storage *workspace = &( p_workspace );
+        hbond_data *nbr_pj, *sym_index_nbr;
+        reax_list *hbonds = &p_hbonds;
 
-		i = blockIdx.x;
+        i = blockIdx.x;
 
-		start = Dev_Start_Index (i, hbonds);
-		end = Dev_End_Index (i, hbonds);
-		pj = start + threadIdx.x;
+        start = Dev_Start_Index (i, hbonds);
+        end = Dev_End_Index (i, hbonds);
+        pj = start + threadIdx.x;
 #if defined(__SM_35__)
-		rvec_MakeZero (__f);
+        rvec_MakeZero (__f);
 #else
-		rvec_MakeZero (__f[threadIdx.x]);
+        rvec_MakeZero (__f[threadIdx.x]);
 #endif
 
-		while (pj < end)
-		{
-			nbr_pj = &( hbonds->select.hbond_list[pj] );
-			j = nbr_pj->nbr;
+        while (pj < end)
+        {
+            nbr_pj = &( hbonds->select.hbond_list[pj] );
+            j = nbr_pj->nbr;
 
-			sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
+            sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
 
 #if defined(__SM_35__)
-			rvec_Add (__f, sym_index_nbr->hb_f );
+            rvec_Add (__f, sym_index_nbr->hb_f );
 #else
-			rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f );
+            rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f );
 #endif
 
-			pj += blockDim.x;
-		}
+            pj += blockDim.x;
+        }
 
 #if defined(__SM_35__)
-		for (int s = 16; s >= 1; s/=2){
-			__f[0] += shfl( __f[0], s);
-			__f[1] += shfl( __f[1], s);
-			__f[2] += shfl( __f[2], s);
-		}
-
-		if (threadIdx.x == 0)
-			rvec_Add (workspace->f[i], __f);
+        for (int s = 16; s >= 1; s/=2){
+            __f[0] += shfl( __f[0], s);
+            __f[1] += shfl( __f[1], s);
+            __f[2] += shfl( __f[2], s);
+        }
+
+        if (threadIdx.x == 0)
+            rvec_Add (workspace->f[i], __f);
 #else
-		if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-		if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-		if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-		if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-		if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-
-		if (threadIdx.x == 0)
-			//rvec_Add (atoms[i].f, __f[0]);
-			rvec_Add (workspace->f[i], __f[0]);
+        if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
+        if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
+        if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
+        if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
+        if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
+
+        if (threadIdx.x == 0)
+            //rvec_Add (atoms[i].f, __f[0]);
+            rvec_Add (workspace->f[i], __f[0]);
 #endif
-	}
+    }
 
-	CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL (  reax_atom *atoms,
-			storage p_workspace,
-			reax_list p_hbonds, int N )
-	{
+    CUDA_GLOBAL void Cuda_Hydrogen_Bonds_HNbrs_BL (  reax_atom *atoms,
+            storage p_workspace,
+            reax_list p_hbonds, int N )
+    {
 #if defined(__SM_35__)
-		rvec __f;
+        rvec __f;
 #else
-		extern __shared__ rvec __f[];
+        extern __shared__ rvec __f[];
 #endif
-		int i, pj,j;
-		int start, end;
+        int i, pj,j;
+        int start, end;
 
-		storage *workspace = &( p_workspace );
-		hbond_data *nbr_pj, *sym_index_nbr;
-		reax_list *hbonds = &p_hbonds;
+        storage *workspace = &( p_workspace );
+        hbond_data *nbr_pj, *sym_index_nbr;
+        reax_list *hbonds = &p_hbonds;
 
-		int __THREADS_PER_ATOM__ = HB_POST_PROC_KER_THREADS_PER_ATOM;
+        int __THREADS_PER_ATOM__ = HB_POST_PROC_KER_THREADS_PER_ATOM;
 
-		int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-		int warp_id = thread_id / __THREADS_PER_ATOM__;
-		int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-		if (warp_id >= N ) return;
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int warp_id = thread_id / __THREADS_PER_ATOM__;
+        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
+        if (warp_id >= N ) return;
 
-		i = warp_id;
+        i = warp_id;
 
-		start = Dev_Start_Index (i, hbonds);
-		end = Dev_End_Index (i, hbonds);
-		pj = start + lane_id;
+        start = Dev_Start_Index (i, hbonds);
+        end = Dev_End_Index (i, hbonds);
+        pj = start + lane_id;
 
 #if defined(__SM_35__)
-		rvec_MakeZero (__f);
+        rvec_MakeZero (__f);
 #else
-		rvec_MakeZero (__f[threadIdx.x]);
+        rvec_MakeZero (__f[threadIdx.x]);
 #endif
 
-		while (pj < end)
-		{
-			nbr_pj = &( hbonds->select.hbond_list[pj] );
-			j = nbr_pj->nbr;
+        while (pj < end)
+        {
+            nbr_pj = &( hbonds->select.hbond_list[pj] );
+            j = nbr_pj->nbr;
 
-			sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
+            sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
 #if defined(__SM_35__)
-			rvec_Add (__f, sym_index_nbr->hb_f );
+            rvec_Add (__f, sym_index_nbr->hb_f );
 #else
-			rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f );
+            rvec_Add (__f[threadIdx.x], sym_index_nbr->hb_f );
 #endif
 
-			pj += __THREADS_PER_ATOM__;
-		}
+            pj += __THREADS_PER_ATOM__;
+        }
 
 #if defined(__SM_35__)
-		for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
-			__f[0] += shfl( __f[0], s);
-			__f[1] += shfl( __f[1], s);
-			__f[2] += shfl( __f[2], s);
-		}
-
-		if (lane_id == 0)
-			rvec_Add (workspace->f[i], __f);
+        for (int s = __THREADS_PER_ATOM__ >> 1; s >= 1; s/=2){
+            __f[0] += shfl( __f[0], s);
+            __f[1] += shfl( __f[1], s);
+            __f[2] += shfl( __f[2], s);
+        }
+
+        if (lane_id == 0)
+            rvec_Add (workspace->f[i], __f);
 #else
-		if (lane_id < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-		if (lane_id < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-		if (lane_id < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-		if (lane_id < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-		if (lane_id < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-
-		if (lane_id == 0)
-			rvec_Add (workspace->f[i], __f[threadIdx.x]);
+        if (lane_id < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
+        if (lane_id < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
+        if (lane_id < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
+        if (lane_id < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
+        if (lane_id < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
+
+        if (lane_id == 0)
+            rvec_Add (workspace->f[i], __f[threadIdx.x]);
 #endif
-	}
+    }
diff --git a/PG-PuReMD/src/cuda_init_md.cu b/PG-PuReMD/src/cuda_init_md.cu
index 0bce2d22..827a63a3 100644
--- a/PG-PuReMD/src/cuda_init_md.cu
+++ b/PG-PuReMD/src/cuda_init_md.cu
@@ -6,7 +6,7 @@
 
 void Cuda_Init_ScratchArea ()
 {
-	cuda_malloc ((void **)& scratch, SCRATCH_SIZE, 1, "Device:Scratch");
+    cuda_malloc ((void **)& scratch, SCRATCH_SIZE, 1, "Device:Scratch");
 
-	host_scratch = (void *)malloc (HOST_SCRATCH_SIZE );
+    host_scratch = (void *)malloc (HOST_SCRATCH_SIZE );
 }
diff --git a/PG-PuReMD/src/cuda_integrate.cu b/PG-PuReMD/src/cuda_integrate.cu
index 4d2d3d93..7f042ce9 100644
--- a/PG-PuReMD/src/cuda_integrate.cu
+++ b/PG-PuReMD/src/cuda_integrate.cu
@@ -6,92 +6,92 @@
 #include "cuda_utils.h"
 
 CUDA_GLOBAL void ker_update_velocity_1 (reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		real dt,
-		int n)
+        single_body_parameters *sbp, 
+        real dt,
+        int n)
 {
-	real inv_m;
-	rvec dx;
-	reax_atom *atom;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
+    real inv_m;
+    rvec dx;
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
 
-	/* velocity verlet, 1st part */
-	//for( i = 0; i < system->n; i++ ) { 
-	atom = &(my_atoms[i]);
-	inv_m = 1.0 / sbp[atom->type].mass;
-	/* Compute x(t + dt) */
-	rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-	rvec_Add( atom->x, dx );
-	/* Compute v(t + dt/2) */
-	rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-	//}
+    /* velocity verlet, 1st part */
+    //for( i = 0; i < system->n; i++ ) { 
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute x(t + dt) */
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    rvec_Add( atom->x, dx );
+    /* Compute v(t + dt/2) */
+    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+    //}
 }
 
 void bNVT_update_velocity_part1 (reax_system *system, real dt)
 {
-	int blocks;
+    int blocks;
 
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	ker_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
-		(system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    ker_update_velocity_1 <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 CUDA_GLOBAL void ker_update_velocity_2 (reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		real dt,
-		int n)
+        single_body_parameters *sbp, 
+        real dt,
+        int n)
 {
-	reax_atom *atom;
-	real inv_m;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
+    reax_atom *atom;
+    real inv_m;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
 
-	/* velocity verlet, 2nd part */
-	//for( i = 0; i < system->n; i++ ) { 
-	atom = &(my_atoms[i]);
-	inv_m = 1.0 / sbp[atom->type].mass;
-	/* Compute v(t + dt) */
-	rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-	//}
+    /* velocity verlet, 2nd part */
+    //for( i = 0; i < system->n; i++ ) { 
+    atom = &(my_atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute v(t + dt) */
+    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+    //}
 }
 
 void bNVT_update_velocity_part2 (reax_system *system, real dt)
 {
-	int blocks;
+    int blocks;
 
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	ker_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
-		(system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    ker_update_velocity_2 <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, system->reax_param.d_sbp, dt, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 CUDA_GLOBAL void ker_scale_velocities (reax_atom *my_atoms, real lambda, int n)
 {
-	reax_atom *atom;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
 
-	/* Scale velocities and positions at t+dt */
-	//for( i = 0; i < system->n; ++i ) {
-	atom = &(my_atoms[i]);
-	rvec_Scale( atom->v, lambda, atom->v );
-	//}
+    /* Scale velocities and positions at t+dt */
+    //for( i = 0; i < system->n; ++i ) {
+    atom = &(my_atoms[i]);
+    rvec_Scale( atom->v, lambda, atom->v );
+    //}
 }
 
 void bNVT_scale_velocities (reax_system *system, real lambda)
 {
-	int blocks;
+    int blocks;
 
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	ker_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
-		(system->d_my_atoms, lambda, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    ker_scale_velocities <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, lambda, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
diff --git a/PG-PuReMD/src/cuda_linear_solvers.cu b/PG-PuReMD/src/cuda_linear_solvers.cu
index 7ad92cc1..1b1f510c 100644
--- a/PG-PuReMD/src/cuda_linear_solvers.cu
+++ b/PG-PuReMD/src/cuda_linear_solvers.cu
@@ -31,263 +31,263 @@
 
 void get_from_device (real *host, real *device, unsigned int bytes, char *msg)
 {
-	copy_host_device (host, device, bytes, cudaMemcpyDeviceToHost, msg);
+    copy_host_device (host, device, bytes, cudaMemcpyDeviceToHost, msg);
 }
 
 void put_on_device (real *host, real *device, unsigned int bytes, char *msg)
 {
-	copy_host_device (host, device, bytes, cudaMemcpyHostToDevice, msg);
+    copy_host_device (host, device, bytes, cudaMemcpyHostToDevice, msg);
 }
 
 void Cuda_Vector_Sum (real *res, real a, real *x, real b, real *y, int count)
 {
-	//res = ax + by
-	//use the cublas here
-	int blocks;
-	blocks = (count / DEF_BLOCK_SIZE) + 
-		((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
-		( res, a, x, b, y, count );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    //res = ax + by
+    //use the cublas here
+    int blocks;
+    blocks = (count / DEF_BLOCK_SIZE) + 
+        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_vector_sum <<< blocks, DEF_BLOCK_SIZE >>>
+        ( res, a, x, b, y, count );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Cuda_CG_Preconditioner (real *res, real *a, real *b, int count)
 {
-	//res = a*b - vector multiplication
-	//use the cublas here.
-	int blocks;
-	blocks = (count / DEF_BLOCK_SIZE) + 
-		((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
-		( res, a, b, count );
-	cudaThreadSynchronize ();
+    //res = a*b - vector multiplication
+    //use the cublas here.
+    int blocks;
+    blocks = (count / DEF_BLOCK_SIZE) + 
+        ((count % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_vector_mul <<< blocks, DEF_BLOCK_SIZE >>>
+        ( res, a, b, count );
+    cudaThreadSynchronize ();
 }
 
 CUDA_GLOBAL void k_diagnol_preconditioner (storage p_workspace, rvec2 *b, int n)
 {
-	storage *workspace = &( p_workspace );
-	int j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= n) return;
-
-	//for( j = 0; j < system->n; ++j ) {
-	// residual 
-	workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
-	workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
-	// apply diagonal pre-conditioner
-	workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
-	workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
-	//}
+    storage *workspace = &( p_workspace );
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= n) return;
+
+    //for( j = 0; j < system->n; ++j ) {
+    // residual 
+    workspace->r2[j][0] = b[j][0] - workspace->q2[j][0];
+    workspace->r2[j][1] = b[j][1] - workspace->q2[j][1];
+    // apply diagonal pre-conditioner
+    workspace->d2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j]; 
+    workspace->d2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j]; 
+    //}
 }
 
 void Cuda_CG_Diagnol_Preconditioner (storage *workspace, rvec2 *b, int n)
 {
-	int blocks;
-
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-		(*workspace, b, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_diagnol_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        (*workspace, b, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 CUDA_GLOBAL void k_dual_cg_preconditioner (storage p_workspace, rvec2 *x, 
-		real alpha_0, real alpha_1, int n, rvec2 *my_dot)
+        real alpha_0, real alpha_1, int n, rvec2 *my_dot)
 {
-	storage *workspace = &( p_workspace );
-	rvec2 alpha;
-	alpha[0] = alpha_0;
-	alpha[1] = alpha_1;
-
-	int j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= n) return;
-	my_dot[j][0] = my_dot[j][1] = 0.0;
-
-	//for( j = 0; j < system->n; ++j ) {
-	// update x 
-	x[j][0] += alpha[0] * workspace->d2[j][0];
-	x[j][1] += alpha[1] * workspace->d2[j][1];      
-	// update residual 
-	workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
-	workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
-	// apply diagonal pre-conditioner 
-	workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
-	workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
-	// dot product: r.p 
-	my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
-	my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
-	//}
+    storage *workspace = &( p_workspace );
+    rvec2 alpha;
+    alpha[0] = alpha_0;
+    alpha[1] = alpha_1;
+
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= n) return;
+    my_dot[j][0] = my_dot[j][1] = 0.0;
+
+    //for( j = 0; j < system->n; ++j ) {
+    // update x 
+    x[j][0] += alpha[0] * workspace->d2[j][0];
+    x[j][1] += alpha[1] * workspace->d2[j][1];      
+    // update residual 
+    workspace->r2[j][0] -= alpha[0] * workspace->q2[j][0]; 
+    workspace->r2[j][1] -= alpha[1] * workspace->q2[j][1]; 
+    // apply diagonal pre-conditioner 
+    workspace->p2[j][0] = workspace->r2[j][0] * workspace->Hdia_inv[j];
+    workspace->p2[j][1] = workspace->r2[j][1] * workspace->Hdia_inv[j];
+    // dot product: r.p 
+    my_dot[j][0] = workspace->r2[j][0] * workspace->p2[j][0];
+    my_dot[j][1] = workspace->r2[j][1] * workspace->p2[j][1];
+    //}
 }
 
 void Cuda_DualCG_Preconditioer (storage *workspace, rvec2 *x, rvec2 alpha, int n, rvec2 result)
 {
-	int blocks;
-	rvec2 *tmp = (rvec2 *) scratch;
-	cuda_memset (tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner");
-
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
-		(*workspace, x, alpha[0], alpha[1], n, tmp);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//Reduction to calculate my_dot
-	k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-		( tmp, tmp + n, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
-		( tmp + n, tmp + 2*n, blocks);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device (result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot");
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+    cuda_memset (tmp, 0, sizeof (rvec2) * ( 2 * n + 1), "cuda_dualcg_preconditioner");
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_dual_cg_preconditioner <<< blocks, DEF_BLOCK_SIZE >>>
+        (*workspace, x, alpha[0], alpha[1], n, tmp);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction to calculate my_dot
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
+        ( tmp, tmp + n, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
+        ( tmp + n, tmp + 2*n, blocks);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (result, (tmp + 2*n), sizeof (rvec2), cudaMemcpyDeviceToHost, "my_dot");
 }
 
 void Cuda_Norm (rvec2 *arr, int n, rvec2 result)
 {
-	int blocks;
-	rvec2 *tmp = (rvec2 *) scratch;
-
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-		(arr, tmp, n, INITIAL);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
-		(tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
-			cudaMemcpyDeviceToHost, "cuda_norm_rvec2");
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
+        (arr, tmp, n, INITIAL);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
+        (tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_norm_rvec2");
 }
 
 void Cuda_Dot (rvec2 *a, rvec2 *b, rvec2 result, int n)
 {
-	int blocks;
-	rvec2 *tmp = (rvec2 *) scratch;
-
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-		( a, b, tmp, n );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> 
-		//k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> 
-		( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
-			cudaMemcpyDeviceToHost, "cuda_dot");
+    int blocks;
+    rvec2 *tmp = (rvec2 *) scratch;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_dot_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
+        ( a, b, tmp, n );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_norm_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>> 
+        //k_norm_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * BLOCKS_POW_2 >>> 
+        ( tmp, tmp + BLOCKS_POW_2, blocks, FINAL );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (result, tmp + BLOCKS_POW_2, sizeof (rvec2), 
+            cudaMemcpyDeviceToHost, "cuda_dot");
 }
 
 void Cuda_Vector_Sum_Rvec2 (rvec2 *x, rvec2 *a, rvec2 b, rvec2 *c, int n)
 {
-	int blocks;
-
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
-		( x, a, b[0], b[1], c, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks;
+
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_rvec2_pbetad <<< blocks, DEF_BLOCK_SIZE >>> 
+        ( x, a, b[0], b[1], c, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 CUDA_GLOBAL void k_rvec2_to_real_copy ( real *dst, rvec2 *src, int index, int n)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
 
-	dst[i] = src[i][index];
+    dst[i] = src[i][index];
 }
 
 void Cuda_RvecCopy_From (real *dst, rvec2 *src, int index, int n)
 {
-	int blocks;
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
-		( dst, src, index, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks;
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_rvec2_to_real_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 CUDA_GLOBAL void k_real_to_rvec2_copy ( rvec2 *dst, real *src, int index, int n)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
 
-	dst[i][index] = src[i];
+    dst[i][index] = src[i];
 }
 
 void Cuda_RvecCopy_To (rvec2 *dst, real *src, int index, int n)
 {
-	int blocks;
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
-		( dst, src, index, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks;
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    k_real_to_rvec2_copy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( dst, src, index, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Cuda_Dual_Matvec (sparse_matrix *H, rvec2 *a, rvec2 *b, int n, int size)
 {
-	int blocks;
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+    int blocks;
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-	cuda_memset (b, 0, sizeof (rvec2) * size, "dual_matvec:result");
+    cuda_memset (b, 0, sizeof (rvec2) * size, "dual_matvec:result");
 
-	//One thread per row implementation
-	//k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-	//		(*H, a, b, n);
-	//cudaThreadSynchronize ();
-	//cudaCheckError ();
+    //One thread per row implementation
+    //k_dual_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+    //        (*H, a, b, n);
+    //cudaThreadSynchronize ();
+    //cudaCheckError ();
 
-	//One warp per row implementation
+    //One warp per row implementation
 #if defined(__SM_35__)
-	k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
+    k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
 #else
-		k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, 
-				  sizeof (rvec2) * MATVEC_BLOCK_SIZE >>>
+        k_dual_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, 
+                  sizeof (rvec2) * MATVEC_BLOCK_SIZE >>>
 #endif
-					  (*H, a, b, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+                      (*H, a, b, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Cuda_Matvec (sparse_matrix *H, real *a, real *b, int n, int size)
 {
-	int blocks;
-	blocks = (n / DEF_BLOCK_SIZE) + 
-		(( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+    int blocks;
+    blocks = (n / DEF_BLOCK_SIZE) + 
+        (( n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
 
-	cuda_memset (b, 0, sizeof (real) * size, "dual_matvec:result");
+    cuda_memset (b, 0, sizeof (real) * size, "dual_matvec:result");
 
-	//one thread per row implementation
-	//k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-	//		(*H, a, b, n);
-	//cudaThreadSynchronize ();
-	//cudaCheckError ();
+    //one thread per row implementation
+    //k_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+    //        (*H, a, b, n);
+    //cudaThreadSynchronize ();
+    //cudaCheckError ();
 
 #if defined(__SM_35__)
-	k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
+    k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE >>>
 #else
-		k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, 
-			     sizeof (real) * MATVEC_BLOCK_SIZE>>>
+        k_matvec_csr <<< MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, 
+                 sizeof (real) * MATVEC_BLOCK_SIZE>>>
 #endif
-				     (*H, a, b, n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+                     (*H, a, b, n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
diff --git a/PG-PuReMD/src/cuda_lookup.cu b/PG-PuReMD/src/cuda_lookup.cu
index 277a5b5d..bad6af13 100644
--- a/PG-PuReMD/src/cuda_lookup.cu
+++ b/PG-PuReMD/src/cuda_lookup.cu
@@ -7,71 +7,71 @@
 
 void copy_LR_table_to_device (reax_system *system, control_params *control, int *aggregated)
 {
-	int i, j, r;
-	int num_atom_types;
-	LR_data *d_y;
-	cubic_spline_coef *temp;
-
-	num_atom_types = system->reax_param.num_atom_types;
-
-	fprintf (stderr, "Copying the LR Lookyp Table to the device ... \n");
-
-	cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), 0, "LR_lookup:table");
-
-	/*
-	   for( i = 0; i < MAX_ATOM_TYPES; ++i )
-	   existing_types[i] = 0;
-
-	   for( i = 0; i < system->N; ++i )
-	   existing_types[ system->atoms[i].type ] = 1;
-	 */
-
-	copy_host_device ( LR, d_LR, sizeof (LR_lookup_table) * (num_atom_types * num_atom_types), 
-			cudaMemcpyHostToDevice, "LR_lookup:table");
-
-	for( i = 0; i < num_atom_types; ++i )
-		if( aggregated [i] )
-			for( j = i; j < num_atom_types; ++j )
-
-				if( aggregated [j] ) { 
-
-					cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), 0, "LR_lookup:d_y");
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, 
-							sizeof (LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y");
-					copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, 
-							sizeof (LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y");
-
-					cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:h");
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, 
-							sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h");
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, 
-							sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h");
-
-					cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:vdW");
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, 
-							sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW");
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,
-							sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW");
-
-					cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:CEvd");
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, 
-							sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd");
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, 
-							sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd");
-
-					cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ele");
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp,
-							sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele");
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele,
-							sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele");
-
-					cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ceclmb");
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp,
-							sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb,
-							sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
-				}
-
-	fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n");
+    int i, j, r;
+    int num_atom_types;
+    LR_data *d_y;
+    cubic_spline_coef *temp;
+
+    num_atom_types = system->reax_param.num_atom_types;
+
+    fprintf (stderr, "Copying the LR Lookyp Table to the device ... \n");
+
+    cuda_malloc ((void **) &d_LR, sizeof (LR_lookup_table) * ( num_atom_types * num_atom_types ), 0, "LR_lookup:table");
+
+    /*
+       for( i = 0; i < MAX_ATOM_TYPES; ++i )
+       existing_types[i] = 0;
+
+       for( i = 0; i < system->N; ++i )
+       existing_types[ system->atoms[i].type ] = 1;
+     */
+
+    copy_host_device ( LR, d_LR, sizeof (LR_lookup_table) * (num_atom_types * num_atom_types), 
+            cudaMemcpyHostToDevice, "LR_lookup:table");
+
+    for( i = 0; i < num_atom_types; ++i )
+        if( aggregated [i] )
+            for( j = i; j < num_atom_types; ++j )
+
+                if( aggregated [j] ) { 
+
+                    cuda_malloc ((void **) &d_y, sizeof (LR_data) * (control->tabulate + 1), 0, "LR_lookup:d_y");
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, 
+                            sizeof (LR_data) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:y");
+                    copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, 
+                            sizeof (LR_data *), cudaMemcpyHostToDevice, "LR_lookup:y");
+
+                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:h");
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, 
+                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:h");
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, 
+                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:h");
+
+                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:vdW");
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, 
+                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:vdW");
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,
+                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:vdW");
+
+                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:CEvd");
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, 
+                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:CEvd");
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, 
+                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:CDvd");
+
+                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ele");
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp,
+                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ele");
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele,
+                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ele");
+
+                    cuda_malloc ((void **) &temp, sizeof (cubic_spline_coef) * (control->tabulate + 1), 0, "LR_lookup:ceclmb");
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp,
+                            sizeof (cubic_spline_coef) * (control->tabulate + 1), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb,
+                            sizeof (cubic_spline_coef *), cudaMemcpyHostToDevice, "LR_lookup:ceclmb");
+                }
+
+    fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n");
 }
 
diff --git a/PG-PuReMD/src/cuda_multi_body.cu b/PG-PuReMD/src/cuda_multi_body.cu
index e3d7c60a..24a51005 100644
--- a/PG-PuReMD/src/cuda_multi_body.cu
+++ b/PG-PuReMD/src/cuda_multi_body.cu
@@ -27,322 +27,322 @@
 
 
 CUDA_GLOBAL void Cuda_Atom_Energy( reax_atom *my_atoms, 
-		global_parameters gp, 
-		single_body_parameters *sbp, 
-		two_body_parameters *tbp, 
-		storage p_workspace, 
-		reax_list p_bonds, 
-		int n, 
-		int num_atom_types,
-		real *data_elp,
-		real *data_eov, 
-		real *data_eun
-		)
+        global_parameters gp, 
+        single_body_parameters *sbp, 
+        two_body_parameters *tbp, 
+        storage p_workspace, 
+        reax_list p_bonds, 
+        int n, 
+        int num_atom_types,
+        real *data_elp,
+        real *data_eov, 
+        real *data_eun
+        )
 {
-	int i, j, pj, type_i, type_j;
-	real Delta_lpcorr, dfvl;
-	real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-	real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-	real e_ov, CEover1, CEover2, CEover3, CEover4;
-	real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
-	real exp_ovun2n, exp_ovun6, exp_ovun8;
-	real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
-	real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-	real p_lp1, p_lp2, p_lp3;
-	real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_data *pbond;
-	bond_order_data *bo_ij; 
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
-
-	reax_list *bonds = &( p_bonds );
-	storage *workspace = &( p_workspace );
-
-	/* Initialize parameters */
-	p_lp1 = gp.l[15];
-	p_lp3 = gp.l[5];
-	p_ovun3 = gp.l[32];
-	p_ovun4 = gp.l[31];
-	p_ovun6 = gp.l[6];
-	p_ovun7 = gp.l[8];
-	p_ovun8 = gp.l[9];
-
-	//for( i = 0; i < system->n; ++i ) {
-	/* set the parameter pointer */
-	type_i = my_atoms[i].type;
-	sbp_i = &(sbp[ type_i ]);
-
-	/* lone-pair Energy */
-	p_lp2 = sbp_i->p_lp2;      
-	expvd2 = EXP( -75 * workspace->Delta_lp[i] );
-	inv_expvd2 = 1. / (1. + expvd2 );
-
-	/* calculate the energy */
-	data_elp [i] += e_lp = 
-		p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-
-	dElp = p_lp2 * inv_expvd2 + 
-		75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
-	CElp = dElp * workspace->dDelta_lp[i];
-
-	workspace->CdDelta[i] += CElp;  // lp - 1st term  
+    int i, j, pj, type_i, type_j;
+    real Delta_lpcorr, dfvl;
+    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+    real exp_ovun2n, exp_ovun6, exp_ovun8;
+    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+    real p_lp1, p_lp2, p_lp3;
+    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
+
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond;
+    bond_order_data *bo_ij; 
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    reax_list *bonds = &( p_bonds );
+    storage *workspace = &( p_workspace );
+
+    /* Initialize parameters */
+    p_lp1 = gp.l[15];
+    p_lp3 = gp.l[5];
+    p_ovun3 = gp.l[32];
+    p_ovun4 = gp.l[31];
+    p_ovun6 = gp.l[6];
+    p_ovun7 = gp.l[8];
+    p_ovun8 = gp.l[9];
+
+    //for( i = 0; i < system->n; ++i ) {
+    /* set the parameter pointer */
+    type_i = my_atoms[i].type;
+    sbp_i = &(sbp[ type_i ]);
+
+    /* lone-pair Energy */
+    p_lp2 = sbp_i->p_lp2;      
+    expvd2 = EXP( -75 * workspace->Delta_lp[i] );
+    inv_expvd2 = 1. / (1. + expvd2 );
+
+    /* calculate the energy */
+    data_elp [i] += e_lp = 
+        p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+
+    dElp = p_lp2 * inv_expvd2 + 
+        75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+    CElp = dElp * workspace->dDelta_lp[i];
+
+    workspace->CdDelta[i] += CElp;  // lp - 1st term  
 
 #ifdef TEST_ENERGY
-	//  fprintf( out_control->elp, "%24.15e%24.15e%24.15e%24.15e\n",
-	//	     p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
-	//  fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n",
-	fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n",
-			system->my_atoms[i].orig_id, workspace->nlp[i], 
-			e_lp, data->my_en.e_lp );
+    //  fprintf( out_control->elp, "%24.15e%24.15e%24.15e%24.15e\n",
+    //         p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
+    //  fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n",
+    fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n",
+            system->my_atoms[i].orig_id, workspace->nlp[i], 
+            e_lp, data->my_en.e_lp );
 #endif
 #ifdef TEST_FORCES
-	Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
+    Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
 #endif
 
-	/* correction for C2 */
-	if( gp.l[5] > 0.001 &&
-			!cuda_strcmp( sbp[type_i].name, "C", 1 ) )
-		for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
-			if( my_atoms[i].orig_id < 
-					my_atoms[bonds->select.bond_list[pj].nbr].orig_id ) {
-				j = bonds->select.bond_list[pj].nbr;
-				type_j = my_atoms[j].type;
-
-				if( !cuda_strcmp( sbp[type_j].name, "C", 1 ) ) {
-					twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]);
-					bo_ij = &( bonds->select.bond_list[pj].bo_data );
-					Di = workspace->Delta[i];
-					vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
-
-					if( vov3 > 3. ) {
-						data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0);
-
-						deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-						deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
-
-						bo_ij->Cdbo += deahu2dbo;
-						workspace->CdDelta[i] += deahu2dsbo;
+    /* correction for C2 */
+    if( gp.l[5] > 0.001 &&
+            !cuda_strcmp( sbp[type_i].name, "C", 1 ) )
+        for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj )
+            if( my_atoms[i].orig_id < 
+                    my_atoms[bonds->select.bond_list[pj].nbr].orig_id ) {
+                j = bonds->select.bond_list[pj].nbr;
+                type_j = my_atoms[j].type;
+
+                if( !cuda_strcmp( sbp[type_j].name, "C", 1 ) ) {
+                    twbp = &( tbp[index_tbp (type_i,type_j, num_atom_types) ]);
+                    bo_ij = &( bonds->select.bond_list[pj].bo_data );
+                    Di = workspace->Delta[i];
+                    vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+
+                    if( vov3 > 3. ) {
+                        data_elp [i] += e_lph = p_lp3 * SQR(vov3-3.0);
+
+                        deahu2dbo = 2.*p_lp3*(vov3 - 3.);
+                        deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+
+                        bo_ij->Cdbo += deahu2dbo;
+                        workspace->CdDelta[i] += deahu2dsbo;
 #ifdef TEST_ENERGY
-						fprintf(out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n",
-								system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-								e_lph, deahu2dbo, deahu2dsbo );
+                        fprintf(out_control->elp,"C2cor%6d%6d%12.6f%12.6f%12.6f\n",
+                                system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                                e_lph, deahu2dbo, deahu2dsbo );
 #endif
 #ifdef TEST_FORCES
-						Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
-						Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
+                        Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
+                        Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
 #endif
-					}
-				}    
-			}
-	//}
+                    }
+                }    
+            }
+    //}
 
 
-	//for( i = 0; i < system->n; ++i ) {
-	type_i = my_atoms[i].type;
-	sbp_i = &(sbp[ type_i ]);
+    //for( i = 0; i < system->n; ++i ) {
+    type_i = my_atoms[i].type;
+    sbp_i = &(sbp[ type_i ]);
 
-	/* over-coordination energy */
-	if( sbp_i->mass > 21.0 ) 
-		dfvl = 0.0;
-	else dfvl = 1.0; // only for 1st-row elements
+    /* over-coordination energy */
+    if( sbp_i->mass > 21.0 ) 
+        dfvl = 0.0;
+    else dfvl = 1.0; // only for 1st-row elements
 
-	p_ovun2 = sbp_i->p_ovun2;
-	sum_ovun1 = sum_ovun2 = 0;
-	for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) {
-		j = bonds->select.bond_list[pj].nbr;
-		type_j = my_atoms[j].type;
-		bo_ij = &(bonds->select.bond_list[pj].bo_data);
-		sbp_j = &(sbp[ type_j ]);
-		twbp = &(tbp[ index_tbp (type_i, type_j, num_atom_types )]);
+    p_ovun2 = sbp_i->p_ovun2;
+    sum_ovun1 = sum_ovun2 = 0;
+    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) {
+        j = bonds->select.bond_list[pj].nbr;
+        type_j = my_atoms[j].type;
+        bo_ij = &(bonds->select.bond_list[pj].bo_data);
+        sbp_j = &(sbp[ type_j ]);
+        twbp = &(tbp[ index_tbp (type_i, type_j, num_atom_types )]);
 
-		sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-		sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-			( bo_ij->BO_pi + bo_ij->BO_pi2 );
+        sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
+        sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
+            ( bo_ij->BO_pi + bo_ij->BO_pi2 );
 
-		/*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n",
-		  i+1, j+1,      
-		  dfvl * workspace->Delta_lp_temp[j], 
-		  sbp_j->nlp_opt,
-		  workspace->nlp_temp[j] );*/
-	}
+        /*fprintf( stdout, "%4d%4d%12.6f%12.6f%12.6f\n",
+          i+1, j+1,      
+          dfvl * workspace->Delta_lp_temp[j], 
+          sbp_j->nlp_opt,
+          workspace->nlp_temp[j] );*/
+    }
 
-	exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
-	inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-	Delta_lpcorr  = workspace->Delta[i] - 
-		(dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+    exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
+    inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
+    Delta_lpcorr  = workspace->Delta[i] - 
+        (dfvl * workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
 
-	exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
-	inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
+    exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
+    inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
 
-	DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8);
-	CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
+    DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8);
+    CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-	data_eov [i] += e_ov = sum_ovun1 * CEover1;
+    data_eov [i] += e_ov = sum_ovun1 * CEover1;
 
-	CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-		(1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
+    CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
+        (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
 
-	CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 );
+    CEover3 = CEover2 * (1.0 - dfvl * workspace->dDelta_lp[i] * inv_exp_ovun1 );
 
-	CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) * 
-		p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
+    CEover4 = CEover2 * (dfvl * workspace->Delta_lp_temp[i]) * 
+        p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
 
-	/* under-coordination potential */
-	p_ovun2 = sbp_i->p_ovun2;
-	p_ovun5 = sbp_i->p_ovun5;
+    /* under-coordination potential */
+    p_ovun2 = sbp_i->p_ovun2;
+    p_ovun5 = sbp_i->p_ovun5;
 
-	exp_ovun2n = 1.0 / exp_ovun2;
-	exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
-	exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
-	inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
-	inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
+    exp_ovun2n = 1.0 / exp_ovun2;
+    exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
+    exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
+    inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
+    inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-	data_eun [i] += e_un =
-		-p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    data_eun [i] += e_un =
+        -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
-	CEunder1 = inv_exp_ovun2n * 
-		( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
-		  p_ovun2 * e_un * exp_ovun2n );
-	CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-	CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-	CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
-		p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
+    CEunder1 = inv_exp_ovun2n * 
+        ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 +
+          p_ovun2 * e_un * exp_ovun2n );
+    CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
+    CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
+    CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
 
-	/* forces */
-	workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
-	workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
+    /* forces */
+    workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
+    workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
 
 #ifdef TEST_FORCES
-	Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor 2nd
-	Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor 1st
+    Add_dDelta( system, lists, i, CEover3, workspace->f_ov ); // OvCoor 2nd
+    Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor 1st
 #endif
 
-	for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) {
-		pbond = &(bonds->select.bond_list[pj]);
-		j = pbond->nbr;
-		bo_ij = &(pbond->bo_data);
-		twbp  = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[pbond->nbr].type, 
-					num_atom_types) ]);
+    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ) {
+        pbond = &(bonds->select.bond_list[pj]);
+        j = pbond->nbr;
+        bo_ij = &(pbond->bo_data);
+        twbp  = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[pbond->nbr].type, 
+                    num_atom_types) ]);
 
 
-		bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st 
-		//workspace->CdDelta[j] += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-		pbond->ae_CdDelta += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-			(bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a
-		bo_ij->Cdbopi += CEover4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b
-		bo_ij->Cdbopi2 += CEover4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // OvCoor-3b
+        bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s;// OvCoor-1st 
+        //workspace->CdDelta[j] += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+        pbond->ae_CdDelta += CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+            (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor-3a
+        bo_ij->Cdbopi += CEover4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]); // OvCoor-3b
+        bo_ij->Cdbopi2 += CEover4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // OvCoor-3b
 
 
-		//workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
-		pbond->ae_CdDelta += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
-			(bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
-		bo_ij->Cdbopi += CEunder4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
-		bo_ij->Cdbopi2 += CEunder4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
+        //workspace->CdDelta[j] += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
+        pbond->ae_CdDelta += CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) *
+            (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
+        bo_ij->Cdbopi += CEunder4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
+        bo_ij->Cdbopi2 += CEunder4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);  // UnCoor-2b
 
 
 #ifdef TEST_ENERGY
-		/*	  fprintf( out_control->eov, "%6d%12.6f\n", 
-			  workspace->reverse_map[j], 
-		// CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, 
-		CEover4 * (1.0 - workspace->dDelta_lp[j]) * 
-		(bo_ij->BO_pi + bo_ij->BO_pi2)
-		 *///		   /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/);
-		//	  fprintf( out_control->eov, "%6d%12.6f\n", 
-		//	  fprintf( out_control->eov, "%6d%24.15e\n", 
-		//		   system->my_atoms[j].orig_id, 
-		// CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, 
-		//		   CEover4 * (1.0 - workspace->dDelta_lp[j]) * 
-		//		   (bo_ij->BO_pi + bo_ij->BO_pi2)
-		//		   /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/);
-
-		// CEunder4 * (1.0 - workspace->dDelta_lp[j]) * 
-		// (bo_ij->BO_pi + bo_ij->BO_pi2),
-		// CEunder4 * (workspace->Delta[j] - workspace->Delta_lp[j]) );
+        /*      fprintf( out_control->eov, "%6d%12.6f\n", 
+              workspace->reverse_map[j], 
+        // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, 
+        CEover4 * (1.0 - workspace->dDelta_lp[j]) * 
+        (bo_ij->BO_pi + bo_ij->BO_pi2)
+         *///           /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/);
+        //      fprintf( out_control->eov, "%6d%12.6f\n", 
+        //      fprintf( out_control->eov, "%6d%24.15e\n", 
+        //           system->my_atoms[j].orig_id, 
+        // CEover1 * twbp->p_ovun1 * twbp->De_s, CEover3, 
+        //           CEover4 * (1.0 - workspace->dDelta_lp[j]) * 
+        //           (bo_ij->BO_pi + bo_ij->BO_pi2)
+        //           /*CEover4 * (workspace->Delta[j]-workspace->Delta_lp[j])*/);
+
+        // CEunder4 * (1.0 - workspace->dDelta_lp[j]) * 
+        // (bo_ij->BO_pi + bo_ij->BO_pi2),
+        // CEunder4 * (workspace->Delta[j] - workspace->Delta_lp[j]) );
 #endif
 
 #ifdef TEST_FORCES
-		Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s,
-				workspace->f_ov ); // OvCoor - 1st term
-
-		Add_dDelta( system, lists, j,
-				CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-				(bo_ij->BO_pi + bo_ij->BO_pi2),
-				workspace->f_ov );   // OvCoor - 3a
-
-		Add_dBOpinpi2( system, lists, i, pj, 
-				CEover4 * (workspace->Delta[j] - 
-					dfvl * workspace->Delta_lp_temp[j]),
-				CEover4 * (workspace->Delta[j] - 
-					dfvl * workspace->Delta_lp_temp[j]),
-				workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
-
-		Add_dDelta( system, lists, j,
-				CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-				(bo_ij->BO_pi + bo_ij->BO_pi2),
-				workspace->f_un ); // UnCoor - 2a
-
-		Add_dBOpinpi2( system, lists, i, pj, 
-				CEunder4 * (workspace->Delta[j] - 
-					dfvl * workspace->Delta_lp_temp[j]),
-				CEunder4 * (workspace->Delta[j] - 
-					dfvl * workspace->Delta_lp_temp[j]),
-				workspace->f_un, workspace->f_un ); // UnCoor - 2b
+        Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s,
+                workspace->f_ov ); // OvCoor - 1st term
+
+        Add_dDelta( system, lists, j,
+                CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+                (bo_ij->BO_pi + bo_ij->BO_pi2),
+                workspace->f_ov );   // OvCoor - 3a
+
+        Add_dBOpinpi2( system, lists, i, pj, 
+                CEover4 * (workspace->Delta[j] - 
+                    dfvl * workspace->Delta_lp_temp[j]),
+                CEover4 * (workspace->Delta[j] - 
+                    dfvl * workspace->Delta_lp_temp[j]),
+                workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
+
+        Add_dDelta( system, lists, j,
+                CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+                (bo_ij->BO_pi + bo_ij->BO_pi2),
+                workspace->f_un ); // UnCoor - 2a
+
+        Add_dBOpinpi2( system, lists, i, pj, 
+                CEunder4 * (workspace->Delta[j] - 
+                    dfvl * workspace->Delta_lp_temp[j]),
+                CEunder4 * (workspace->Delta[j] - 
+                    dfvl * workspace->Delta_lp_temp[j]),
+                workspace->f_un, workspace->f_un ); // UnCoor - 2b
 #endif
-	}
+    }
 
 #ifdef TEST_ENERGY
-	//fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n",
-	//fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n",
-	//     system->my_atoms[i].orig_id, workspace->nlp[i], 
-	//     e_lp, data->my_en.e_lp );
-
-	//fprintf( out_control->eov, "%6d%24.15e%24.15e\n", 
-	fprintf( out_control->eov, "%6d%12.4f%12.4f\n", 
-			system->my_atoms[i].orig_id, 
-			e_ov, data->my_en.e_ov + data->my_en.e_un );
-
-	//fprintf( out_control->eun, "%6d%24.15e%24.15e\n", 
-	fprintf( out_control->eun, "%6d%12.4f%12.4f\n", 
-			system->my_atoms[i].orig_id, 
-			e_un, data->my_en.e_ov + data->my_en.e_un );
+    //fprintf( out_control->elp, "%6d%24.15e%24.15e%24.15e\n",
+    //fprintf( out_control->elp, "%6d%12.4f%12.4f%12.4f\n",
+    //     system->my_atoms[i].orig_id, workspace->nlp[i], 
+    //     e_lp, data->my_en.e_lp );
+
+    //fprintf( out_control->eov, "%6d%24.15e%24.15e\n", 
+    fprintf( out_control->eov, "%6d%12.4f%12.4f\n", 
+            system->my_atoms[i].orig_id, 
+            e_ov, data->my_en.e_ov + data->my_en.e_un );
+
+    //fprintf( out_control->eun, "%6d%24.15e%24.15e\n", 
+    fprintf( out_control->eun, "%6d%12.4f%12.4f\n", 
+            system->my_atoms[i].orig_id, 
+            e_un, data->my_en.e_ov + data->my_en.e_un );
 #endif
-	//}
+    //}
 }
 
 CUDA_GLOBAL void Cuda_Atom_Energy_PostProcess ( reax_list p_bonds, 
-		storage p_workspace, int n )
+        storage p_workspace, int n )
 {
-	int i,pj;
-	bond_data *pbond, *sbond;
-	bond_data *sym_index_bond;
+    int i,pj;
+    bond_data *pbond, *sbond;
+    bond_data *sym_index_bond;
 
-	reax_list *bonds = &p_bonds;
-	storage *workspace = &p_workspace;
+    reax_list *bonds = &p_bonds;
+    storage *workspace = &p_workspace;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n) return;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n) return;
 
-	for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
 
-		/*
-		   pbond = &(bonds->select.bond_list[pj]);
-		   dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
-		   workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta;
-		 */
+        /*
+           pbond = &(bonds->select.bond_list[pj]);
+           dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
+           workspace->CdDelta [i] += dbond_index_bond->ae_CdDelta;
+         */
 
-		sbond = &(bonds->select.bond_list [pj]);
-		sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); 
-		workspace->CdDelta [i] += sym_index_bond->ae_CdDelta;
+        sbond = &(bonds->select.bond_list [pj]);
+        sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]); 
+        workspace->CdDelta [i] += sym_index_bond->ae_CdDelta;
 
-	}
+    }
 }
diff --git a/PG-PuReMD/src/cuda_neighbors.cu b/PG-PuReMD/src/cuda_neighbors.cu
index 9072de22..e552ab6b 100644
--- a/PG-PuReMD/src/cuda_neighbors.cu
+++ b/PG-PuReMD/src/cuda_neighbors.cu
@@ -33,681 +33,681 @@
 
 CUDA_DEVICE real Dev_DistSqr_to_Special_Point( rvec cp, rvec x ) 
 {
-	int  i;  
-	real d_sqr = 0;
+    int  i;  
+    real d_sqr = 0;
 
-	for( i = 0; i < 3; ++i )
-		if( cp[i] > NEG_INF )
-			d_sqr += SQR( cp[i] - x[i] );
+    for( i = 0; i < 3; ++i )
+        if( cp[i] > NEG_INF )
+            d_sqr += SQR( cp[i] - x[i] );
 
-	return d_sqr;
+    return d_sqr;
 }
 
 
-CUDA_GLOBAL void ker_generate_neighbor_lists (	reax_atom *my_atoms, 
-		simulation_box my_ext_box,
-		grid g,
-		reax_list far_nbrs, 
-		int n, int N )
+CUDA_GLOBAL void ker_generate_neighbor_lists (    reax_atom *my_atoms, 
+        simulation_box my_ext_box,
+        grid g,
+        reax_list far_nbrs, 
+        int n, int N )
 {
-	int  i, j, k, l, m, itr, num_far;
-	real d, cutoff;
-	ivec c, nbrs_x;
-	rvec dvec;
-	grid_cell *gci, *gcj;
-	far_neighbor_data *nbr_data;//, *my_start;
-	reax_atom *atom1, *atom2;
-
-	l = blockIdx.x * blockDim.x  + threadIdx.x;
-	if (l >= N) return;
-
-	atom1 = &(my_atoms[l]);
-	num_far = Dev_Start_Index (l, &far_nbrs);
-
-	//get the coordinates of the atom and 
-	//compute the grid cell
-	/*
-	   i = (int) (my_atoms[ l ].x[0] * g.inv_len[0]);
-	   j = (int) (my_atoms[ l ].x[1] * g.inv_len[1]);
-	   k = (int) (my_atoms[ l ].x[2] * g.inv_len[2]);
-	 */
-	if (l < n) {
-		for (i = 0; i < 3; i++)
-		{
-			c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
-			if( c[i] >= g.native_end[i] )
-				c[i] = g.native_end[i] - 1;
-			else if( c[i] < g.native_str[i] )
-				c[i] = g.native_str[i];
-		}
-	} else {
-		for (i = 0; i < 3; i++)
-		{
-			c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
-			if( c[i] < 0 ) c[i] = 0;
-			else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
-		}
-	}
-
-	i = c[0];
-	j = c[1];
-	k = c[2];
-
-	//gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
-	cutoff = SQR(g.cutoff[index_grid_3d (i, j, k, &g)]);
-
-	itr = 0;
-	while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
-
-		ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-		//gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-
-		if( g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-				(Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) )
-			/* pick up another atom from the neighbor cell */
-			for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-					m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) {
-				if(( l < m )) { // prevent recounting same pairs within a gcell 
-					atom2 = &(my_atoms[m]);
-					dvec[0] = atom2->x[0] - atom1->x[0];
-					dvec[1] = atom2->x[1] - atom1->x[1];
-					dvec[2] = atom2->x[2] - atom1->x[2];
-					d = rvec_Norm_Sqr( dvec );
-					if( d <= cutoff ) { 
-						nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
-						nbr_data->nbr = m;
-						nbr_data->d = SQRT(d);
-						rvec_Copy( nbr_data->dvec, dvec );
-						//ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
-						ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-								-1, g.rel_box[index_grid_3d (i, j, k, &g)] );
-						++num_far;
-					}
-				}
-				/*
-				   if(( l > m )) { // prevent recounting same pairs within a gcell 
-				   atom2 = &(my_atoms[m]);
-				   dvec[0] = atom1->x[0] - atom2->x[0];
-				   dvec[1] = atom1->x[1] - atom2->x[1];
-				   dvec[2] = atom1->x[2] - atom2->x[2];
-				   d = rvec_Norm_Sqr( dvec );
-				   if( d <= cutoff ) { 
-				   nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
-				   nbr_data->nbr = m;
-				   nbr_data->d = SQRT(d);
-				   rvec_Copy( nbr_data->dvec, dvec );
-				   ivec_ScaledSum( nbr_data->rel_box, 
-				   -1, gcj->rel_box, 1, gci->rel_box );
-				   ++num_far;
-				   }
-				   }   
-				 */
-			}
-		++itr;
-	}   
-
-	itr = 0;
-	while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
-		ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-		//gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-		cutoff = SQR(g.cutoff[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]);
-
-		if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-				(Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) )
-			for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-					m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) {
-				if(( l > m )) {
-					atom2 = &(my_atoms[m]);
-					dvec[0] = atom1->x[0] - atom2->x[0];
-					dvec[1] = atom1->x[1] - atom2->x[1];
-					dvec[2] = atom1->x[2] - atom2->x[2];
-					d = rvec_Norm_Sqr( dvec );
-					if( d <= cutoff ) { 
-						nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
-						nbr_data->nbr = m;
-						nbr_data->d = SQRT(d);
-						rvec_Copy( nbr_data->dvec, dvec );
-						//ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box );
-						ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-								-1, g.rel_box[index_grid_3d (i, j, k, &g)] );
-						++num_far;
-					}
-				}   
-			}
-		++itr;
-	}   
-
-	Dev_Set_End_Index( l, num_far, &far_nbrs );
+    int  i, j, k, l, m, itr, num_far;
+    real d, cutoff;
+    ivec c, nbrs_x;
+    rvec dvec;
+    grid_cell *gci, *gcj;
+    far_neighbor_data *nbr_data;//, *my_start;
+    reax_atom *atom1, *atom2;
+
+    l = blockIdx.x * blockDim.x  + threadIdx.x;
+    if (l >= N) return;
+
+    atom1 = &(my_atoms[l]);
+    num_far = Dev_Start_Index (l, &far_nbrs);
+
+    //get the coordinates of the atom and 
+    //compute the grid cell
+    /*
+       i = (int) (my_atoms[ l ].x[0] * g.inv_len[0]);
+       j = (int) (my_atoms[ l ].x[1] * g.inv_len[1]);
+       k = (int) (my_atoms[ l ].x[2] * g.inv_len[2]);
+     */
+    if (l < n) {
+        for (i = 0; i < 3; i++)
+        {
+            c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
+            if( c[i] >= g.native_end[i] )
+                c[i] = g.native_end[i] - 1;
+            else if( c[i] < g.native_str[i] )
+                c[i] = g.native_str[i];
+        }
+    } else {
+        for (i = 0; i < 3; i++)
+        {
+            c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+            if( c[i] < 0 ) c[i] = 0;
+            else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
+        }
+    }
+
+    i = c[0];
+    j = c[1];
+    k = c[2];
+
+    //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
+    cutoff = SQR(g.cutoff[index_grid_3d (i, j, k, &g)]);
+
+    itr = 0;
+    while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
+
+        ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
+        //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
+
+        if( g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
+                (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) )
+            /* pick up another atom from the neighbor cell */
+            for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                    m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) {
+                if(( l < m )) { // prevent recounting same pairs within a gcell 
+                    atom2 = &(my_atoms[m]);
+                    dvec[0] = atom2->x[0] - atom1->x[0];
+                    dvec[1] = atom2->x[1] - atom1->x[1];
+                    dvec[2] = atom2->x[2] - atom1->x[2];
+                    d = rvec_Norm_Sqr( dvec );
+                    if( d <= cutoff ) { 
+                        nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
+                        nbr_data->nbr = m;
+                        nbr_data->d = SQRT(d);
+                        rvec_Copy( nbr_data->dvec, dvec );
+                        //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
+                        ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                -1, g.rel_box[index_grid_3d (i, j, k, &g)] );
+                        ++num_far;
+                    }
+                }
+                /*
+                   if(( l > m )) { // prevent recounting same pairs within a gcell 
+                   atom2 = &(my_atoms[m]);
+                   dvec[0] = atom1->x[0] - atom2->x[0];
+                   dvec[1] = atom1->x[1] - atom2->x[1];
+                   dvec[2] = atom1->x[2] - atom2->x[2];
+                   d = rvec_Norm_Sqr( dvec );
+                   if( d <= cutoff ) { 
+                   nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
+                   nbr_data->nbr = m;
+                   nbr_data->d = SQRT(d);
+                   rvec_Copy( nbr_data->dvec, dvec );
+                   ivec_ScaledSum( nbr_data->rel_box, 
+                   -1, gcj->rel_box, 1, gci->rel_box );
+                   ++num_far;
+                   }
+                   }   
+                 */
+            }
+        ++itr;
+    }   
+
+    itr = 0;
+    while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
+        ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
+        //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
+        cutoff = SQR(g.cutoff[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]);
+
+        if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
+                (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) )
+            for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                    m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m ) {
+                if(( l > m )) {
+                    atom2 = &(my_atoms[m]);
+                    dvec[0] = atom1->x[0] - atom2->x[0];
+                    dvec[1] = atom1->x[1] - atom2->x[1];
+                    dvec[2] = atom1->x[2] - atom2->x[2];
+                    d = rvec_Norm_Sqr( dvec );
+                    if( d <= cutoff ) { 
+                        nbr_data = &(far_nbrs.select.far_nbr_list[num_far]);
+                        nbr_data->nbr = m;
+                        nbr_data->d = SQRT(d);
+                        rvec_Copy( nbr_data->dvec, dvec );
+                        //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box );
+                        ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                -1, g.rel_box[index_grid_3d (i, j, k, &g)] );
+                        ++num_far;
+                    }
+                }   
+            }
+        ++itr;
+    }   
+
+    Dev_Set_End_Index( l, num_far, &far_nbrs );
 }
 
 
-CUDA_GLOBAL void ker_mt_generate_neighbor_lists (	reax_atom *my_atoms, 
-		//CUDA_GLOBAL void __launch_bounds__ (1024) ker_mt_generate_neighbor_lists (	reax_atom *my_atoms, 
-		simulation_box my_ext_box,
-		grid g,
-		reax_list far_nbrs, 
-		int n, int N )
-		{
-
-		extern __shared__ int __nbr[];
-		extern __shared__ int __sofar [];
-		bool  nbrgen;
-
-		int __THREADS_PER_ATOM__ = NB_KER_THREADS_PER_ATOM;
-
-		int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-		int warp_id = thread_id / __THREADS_PER_ATOM__;
-		int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-		int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-		if (warp_id >= N ) return;
-
-		int *tnbr = __nbr;
-		int *nbrssofar = __nbr + blockDim.x;
-		int max, leader;
-
-		int  i, j, k, l, m, itr, num_far, ll;
-		real d, cutoff, cutoff_ji;
-		ivec c, nbrs_x;
-		rvec dvec;
-		grid_cell *gci, *gcj;
-		far_neighbor_data *nbr_data, *my_start;
-		reax_atom *atom1, *atom2;
-
-		//l = blockIdx.x * blockDim.x  + threadIdx.x;
-		//if (l >= N) return;
-
-		l = warp_id;
-
-		atom1 = &(my_atoms[l]);
-		num_far = Dev_Start_Index (l, &far_nbrs);
-
-		my_start = &( far_nbrs.select.far_nbr_list [num_far] );
-
-		//get the coordinates of the atom and 
-		//compute the grid cell
-		if (l < n) {
-			for (i = 0; i < 3; i++)
-			{
-				c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
-				if( c[i] >= g.native_end[i] )
-					c[i] = g.native_end[i] - 1;
-				else if( c[i] < g.native_str[i] )
-					c[i] = g.native_str[i];
-			}
-		} else {
-			for (i = 0; i < 3; i++)
-			{
-				c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
-				if( c[i] < 0 ) c[i] = 0;
-				else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
-			}
-		}
-
-		i = c[0];
-		j = c[1];
-		k = c[2];
-
-		//gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
-
-
-		tnbr[threadIdx.x] = 0;
-		if (lane_id == 0) {
-			nbrssofar [my_bucket] = 0;
-		}
-		__syncthreads ();
-
-		itr = 0;
-		//while( (gci->nbrs_x[itr][0]) >= 0 ) { 
-		while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
-
-			tnbr[threadIdx.x] = 0;
-			nbrgen = false;
-
-			//ivec_Copy (nbrs_x, gci->nbrs_x[itr] );
-			ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-			//gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-
-			//cutoff = SQR(gci->cutoff);
-			cutoff = SQR (g.cutoff [index_grid_3d (i, j, k, &g)]);
-			//cutoff_ji = SQR(gcj->cutoff);
-			cutoff_ji = SQR(g.cutoff[ index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-			//if( ((gci->str <= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff)) 
-			//	 || ((gci->str >= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff_ji)))
-			if( ((g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) 
-						&& (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff)) 
-					|| ((g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) 
-						&& (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff_ji)))
-			{
-				//max = gcj->end - gcj->str;
-				max = g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] - g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)];
-				tnbr[threadIdx.x] = 0;
-				nbrgen = false;
-				//m = lane_id  + gcj->str; //0-31
-				m = lane_id  + g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; //0-31
-				int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1);
-				int iterations = 0;
-
-				// pick up another atom from the neighbor cell
-				//for( m = gcj->str; m < gcj->end; ++m ) 
-				while (iterations < loopcount) {
-					tnbr [threadIdx.x] = 0;
-					nbrgen = false;
-
-					//if(( l < m ) && (m < gcj->end)) { // prevent recounting same pairs within a gcell 
-					if(( l < m ) && (m < g.end [index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { // prevent recounting same pairs within a gcell 
-						atom2 = &(my_atoms[m]);
-						dvec[0] = atom2->x[0] - atom1->x[0];
-						dvec[1] = atom2->x[1] - atom1->x[1];
-						dvec[2] = atom2->x[2] - atom1->x[2];
-						d = rvec_Norm_Sqr( dvec );
-						if( d <= cutoff ) { 
-							tnbr [threadIdx.x] = 1;
-							nbrgen = true;
-						}
-					}
-
-					//if(( l > m ) && (m < gcj->end)) {
-					if(( l > m ) && (m < g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) {
-						atom2 = &(my_atoms[m]);
-						dvec[0] = atom1->x[0] - atom2->x[0];
-						dvec[1] = atom1->x[1] - atom2->x[1];
-						dvec[2] = atom1->x[2] - atom2->x[2];
-						d = rvec_Norm_Sqr( dvec );
-						if( d <= cutoff_ji ) { 
-							tnbr [threadIdx.x] = 1;
-							nbrgen = true;
-						}
-					} 
-
-					//is neighbor generated
-					if (nbrgen)
-					{
-						//do leader selection here
-						leader = -1;
-						for (ll = my_bucket *__THREADS_PER_ATOM__; ll < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; ll++)
-							if (tnbr[ll]){
-								leader = ll;
-								break;
-							}
-
-						//do the reduction;
-						if (threadIdx.x == leader)
-							for (ll = 1; ll < __THREADS_PER_ATOM__; ll++)
-								tnbr [my_bucket * __THREADS_PER_ATOM__ + ll] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (ll-1)];
-					}
-
-					if (nbrgen)
-					{
-						//got the indices
-						nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1;
-						nbr_data->nbr = m;
-						if (l < m) {
-							dvec[0] = atom2->x[0] - atom1->x[0];
-							dvec[1] = atom2->x[1] - atom1->x[1];
-							dvec[2] = atom2->x[2] - atom1->x[2];
-							d = rvec_Norm_Sqr( dvec );
-							nbr_data->d = SQRT (d);
-							rvec_Copy( nbr_data->dvec, dvec );
-							//ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
-							ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-									-1, g.rel_box[index_grid_3d( i, j, k, &g)] );
-						} 
-						else {
-							dvec[0] = atom1->x[0] - atom2->x[0];
-							dvec[1] = atom1->x[1] - atom2->x[1];
-							dvec[2] = atom1->x[2] - atom2->x[2];
-							d = rvec_Norm_Sqr( dvec );
-							nbr_data->d = SQRT(d);
-							rvec_Copy( nbr_data->dvec, dvec );
-							//ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box );
-							/*
-							   CHANGE ORIGINAL
-							   This is a bug in the original code 
-							   ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-							   -1, g.rel_box[index_grid_3d( i, j, k, &g)] );
-							 */
-							ivec_ScaledSum( nbr_data->rel_box, -1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
-									1, g.rel_box[index_grid_3d( i, j, k, &g)] );
-						}
-
-						if (threadIdx.x == leader)
-							nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
-					}
-
-					m += __THREADS_PER_ATOM__;
-					iterations ++;
-
-					//cleanup
-					nbrgen = false;
-					tnbr [threadIdx.x] = 0;
-				}
-				}
-				++itr;
-				}   
-
-				if (lane_id == 0)
-					Dev_Set_End_Index (l, num_far + nbrssofar[my_bucket], &far_nbrs);
-				//Dev_Set_End_Index( l, num_far, &far_nbrs );
-			}
-
-
-
-			CUDA_GLOBAL void ker_count_total_nbrs (reax_list far_nbrs, int N, int *result)
-			{
-				//strided access
-				extern __shared__ int count[];
-				unsigned int i = threadIdx.x;
-				int my_count = 0;
-				count[i] = 0;
-
-				for (i = threadIdx.x; i < N; i += threadIdx.x + blockDim.x)
-					count[threadIdx.x] += Dev_Num_Entries (i, &far_nbrs);
-
-				__syncthreads ();
-
-				for (int offset = blockDim.x/2; offset > 0; offset >>=1 )
-					if(threadIdx.x < offset)
-						count [threadIdx.x] += count [threadIdx.x + offset];
-
-				__syncthreads ();
-
-				if (threadIdx.x == 0)
-					*result = count [threadIdx.x];
-			}
-
-			extern "C" void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, 
-					storage *workspace, reax_list **lists )
-			{
-				int blocks, num_far;
-				int *d_num_far = (int *) scratch;
+CUDA_GLOBAL void ker_mt_generate_neighbor_lists (    reax_atom *my_atoms, 
+        //CUDA_GLOBAL void __launch_bounds__ (1024) ker_mt_generate_neighbor_lists (    reax_atom *my_atoms, 
+        simulation_box my_ext_box,
+        grid g,
+        reax_list far_nbrs, 
+        int n, int N )
+        {
+        // Builds the far-neighbor list of atom l = warp_id: a group of NB_KER_THREADS_PER_ATOM threads scans each neighbor cell together and stores the accepted pairs in far_nbrs.
+        extern __shared__ int __nbr[];
+        extern __shared__ int __sofar [];
+        bool  nbrgen;
+
+        int __THREADS_PER_ATOM__ = NB_KER_THREADS_PER_ATOM;
+        // warp_id selects the atom, my_bucket the group inside this block; lane_id is taken with a bit mask, so it equals thread_id % __THREADS_PER_ATOM__ only for power-of-two group sizes
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int warp_id = thread_id / __THREADS_PER_ATOM__;
+        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
+        int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
+        // NOTE(review): threads that return here never reach the __syncthreads() below - confirm this is safe on the targeted architectures
+        if (warp_id >= N ) return;
+        // dynamic shared memory layout: tnbr has one slot per thread (indexed by threadIdx.x); nbrssofar starts blockDim.x ints later and has one running count per group (indexed by my_bucket)
+        int *tnbr = __nbr;
+        int *nbrssofar = __nbr + blockDim.x;
+        int max, leader;
+
+        int  i, j, k, l, m, itr, num_far, ll;
+        real d, cutoff, cutoff_ji;
+        ivec c, nbrs_x;
+        rvec dvec;
+        grid_cell *gci, *gcj;
+        far_neighbor_data *nbr_data, *my_start;
+        reax_atom *atom1, *atom2;
+
+        //l = blockIdx.x * blockDim.x  + threadIdx.x;
+        //if (l >= N) return;
+
+        l = warp_id;
+
+        atom1 = &(my_atoms[l]);
+        num_far = Dev_Start_Index (l, &far_nbrs);
+        // my_start: first far_nbr_list slot of atom l (position given by Dev_Start_Index)
+        my_start = &( far_nbrs.select.far_nbr_list [num_far] );
+
+        //get the coordinates of the atom and 
+        //compute the grid cell; atoms with l < n are clamped to the native cell range, all others to the whole grid
+        if (l < n) {
+            for (i = 0; i < 3; i++)
+            {
+                c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
+                if( c[i] >= g.native_end[i] )
+                    c[i] = g.native_end[i] - 1;
+                else if( c[i] < g.native_str[i] )
+                    c[i] = g.native_str[i];
+            }
+        } else {
+            for (i = 0; i < 3; i++)
+            {
+                c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+                if( c[i] < 0 ) c[i] = 0;
+                else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
+            }
+        }
+
+        i = c[0];
+        j = c[1];
+        k = c[2];
+
+        //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
+
+        // clear this thread's flag and (lane 0 only) the group's running neighbor count before the scan starts
+        tnbr[threadIdx.x] = 0;
+        if (lane_id == 0) {
+            nbrssofar [my_bucket] = 0;
+        }
+        __syncthreads ();
+        // walk the neighbor-cell list of cell (i, j, k); an entry whose first coordinate is negative ends the list
+        itr = 0;
+        //while( (gci->nbrs_x[itr][0]) >= 0 ) { 
+        while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0 ) { 
+
+            tnbr[threadIdx.x] = 0;
+            nbrgen = false;
+
+            //ivec_Copy (nbrs_x, gci->nbrs_x[itr] );
+            ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
+            //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
+            // cutoff is the squared cutoff of the atom's own cell, cutoff_ji the squared cutoff of the neighbor cell
+            //cutoff = SQR(gci->cutoff);
+            cutoff = SQR (g.cutoff [index_grid_3d (i, j, k, &g)]);
+            //cutoff_ji = SQR(gcj->cutoff);
+            cutoff_ji = SQR(g.cutoff[ index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
+            //if( ((gci->str <= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff)) 
+            //     || ((gci->str >= gcj->str) && (Dev_DistSqr_to_Special_Point(gci->nbrs_cp[itr],atom1->x)<=cutoff_ji)))
+            if( ((g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) 
+                        && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff)) 
+                    || ((g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) 
+                        && (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff_ji)))
+            {
+                //max = gcj->end - gcj->str;
+                max = g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] - g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)];
+                tnbr[threadIdx.x] = 0;
+                nbrgen = false;
+                //m = lane_id  + gcj->str; //0-31
+                m = lane_id  + g.str[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; //0-31
+                int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1);
+                int iterations = 0;
+                // loopcount = ceil(max / __THREADS_PER_ATOM__): each pass tests one candidate atom m per lane, then m advances by the group size
+                // pick up another atom from the neighbor cell
+                //for( m = gcj->str; m < gcj->end; ++m ) 
+                while (iterations < loopcount) {
+                    tnbr [threadIdx.x] = 0;
+                    nbrgen = false;
+
+                    //if(( l < m ) && (m < gcj->end)) { // prevent recounting same pairs within a gcell 
+                    if(( l < m ) && (m < g.end [index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) { // prevent recounting same pairs within a gcell 
+                        atom2 = &(my_atoms[m]);
+                        dvec[0] = atom2->x[0] - atom1->x[0];
+                        dvec[1] = atom2->x[1] - atom1->x[1];
+                        dvec[2] = atom2->x[2] - atom1->x[2];
+                        d = rvec_Norm_Sqr( dvec );
+                        if( d <= cutoff ) { 
+                            tnbr [threadIdx.x] = 1;
+                            nbrgen = true;
+                        }
+                    }
+
+                    //if(( l > m ) && (m < gcj->end)) {
+                    if(( l > m ) && (m < g.end[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)])) {
+                        atom2 = &(my_atoms[m]);
+                        dvec[0] = atom1->x[0] - atom2->x[0];
+                        dvec[1] = atom1->x[1] - atom2->x[1];
+                        dvec[2] = atom1->x[2] - atom2->x[2];
+                        d = rvec_Norm_Sqr( dvec );
+                        if( d <= cutoff_ji ) { 
+                            tnbr [threadIdx.x] = 1;
+                            nbrgen = true;
+                        }
+                    } 
+
+                    //is neighbor generated
+                    if (nbrgen)
+                    {
+                        //do leader selection here: the leader is the first slot of this group whose flag is set
+                        leader = -1;
+                        for (ll = my_bucket *__THREADS_PER_ATOM__; ll < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; ll++)
+                            if (tnbr[ll]){
+                                leader = ll;
+                                break;
+                            }
+                        // NOTE(review): the flags and sums in tnbr are read by other lanes with no barrier in between; this appears to rely on the lanes of a group executing in lockstep - confirm
+                        //do the reduction: the leader turns the group's flags into an inclusive prefix sum in place
+                        if (threadIdx.x == leader)
+                            for (ll = 1; ll < __THREADS_PER_ATOM__; ll++)
+                                tnbr [my_bucket * __THREADS_PER_ATOM__ + ll] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (ll-1)];
+                    }
+
+                    if (nbrgen)
+                    {
+                        //got the indices: slot = list start + neighbors stored so far + this lane's prefix sum - 1
+                        nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1;
+                        nbr_data->nbr = m;
+                        if (l < m) {
+                            dvec[0] = atom2->x[0] - atom1->x[0];
+                            dvec[1] = atom2->x[1] - atom1->x[1];
+                            dvec[2] = atom2->x[2] - atom1->x[2];
+                            d = rvec_Norm_Sqr( dvec );
+                            nbr_data->d = SQRT (d);
+                            rvec_Copy( nbr_data->dvec, dvec );
+                            //ivec_ScaledSum( nbr_data->rel_box, 1, gcj->rel_box, -1, gci->rel_box );
+                            ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                    -1, g.rel_box[index_grid_3d( i, j, k, &g)] );
+                        } 
+                        else {
+                            dvec[0] = atom1->x[0] - atom2->x[0];
+                            dvec[1] = atom1->x[1] - atom2->x[1];
+                            dvec[2] = atom1->x[2] - atom2->x[2];
+                            d = rvec_Norm_Sqr( dvec );
+                            nbr_data->d = SQRT(d);
+                            rvec_Copy( nbr_data->dvec, dvec );
+                            //ivec_ScaledSum( nbr_data->rel_box, -1, gcj->rel_box, 1, gci->rel_box );
+                            /*
+                               CHANGE ORIGINAL
+                               This is a bug in the original code 
+                               ivec_ScaledSum( nbr_data->rel_box, 1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                               -1, g.rel_box[index_grid_3d( i, j, k, &g)] );
+                             */
+                            ivec_ScaledSum( nbr_data->rel_box, -1, g.rel_box[index_grid_3d( nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)], 
+                                    1, g.rel_box[index_grid_3d( i, j, k, &g)] );
+                        }
+                        // the leader adds this pass's total (last prefix-sum slot of the group) to the group's running count
+                        if (threadIdx.x == leader)
+                            nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
+                    }
+
+                    m += __THREADS_PER_ATOM__;
+                    iterations ++;
+
+                    //cleanup
+                    nbrgen = false;
+                    tnbr [threadIdx.x] = 0;
+                }
+                }
+                ++itr;
+                }   
+                // lane 0 records the end of atom l's list: start index plus the number of neighbors the group stored
+                if (lane_id == 0)
+                    Dev_Set_End_Index (l, num_far + nbrssofar[my_bucket], &far_nbrs);
+                //Dev_Set_End_Index( l, num_far, &far_nbrs );
+            }
+
+
+
+            CUDA_GLOBAL void ker_count_total_nbrs (reax_list far_nbrs, int N, int *result)
+            {
+                // Sums Dev_Num_Entries over atoms 0..N-1 and stores the total in *result.
+                // Expects one block with blockDim.x ints of dynamic shared memory; the halving reduction assumes blockDim.x is a power of two.
+                extern __shared__ int count[];
+                int my_count = 0;
+
+                // thread t handles atoms t, t + blockDim.x, t + 2 * blockDim.x, ... so every atom is visited exactly once
+                for (int i = threadIdx.x; i < N; i += blockDim.x)
+                    my_count += Dev_Num_Entries (i, &far_nbrs);
+                count[threadIdx.x] = my_count;
+                __syncthreads ();
+
+                // each halving step reads partial sums written by other threads, so a barrier is needed after every step
+                for (int offset = blockDim.x/2; offset > 0; offset >>=1 ) {
+                    if(threadIdx.x < offset)
+                        count [threadIdx.x] += count [threadIdx.x + offset];
+                    __syncthreads ();
+                }
+                if (threadIdx.x == 0)
+                    *result = count [0];
+            }
+
+            extern "C" void Cuda_Generate_Neighbor_Lists( reax_system *system, simulation_data *data, 
+                    storage *workspace, reax_list **lists )
+            {
+                int blocks, num_far;
+                int *d_num_far = (int *) scratch;
 #if defined(LOG_PERFORMANCE)
-				real t_start=0, t_elapsed=0;
+                real t_start=0, t_elapsed=0;
 
-				if( system->my_rank == MASTER_NODE )
-					t_start = Get_Time( );
+                if( system->my_rank == MASTER_NODE )
+                    t_start = Get_Time( );
 #endif
 
-				cuda_memset (d_num_far, 0, sizeof (int), "num_far");
-
-				//invoke the kernel here
-				//one thread per atom implementation
-				/*
-				   blocks = (system->N / NBRS_BLOCK_SIZE) + 
-				   ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-				   ker_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE>>>
-				   (system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-				 *(*dev_lists + FAR_NBRS), system->n, system->N);
-				 cudaThreadSynchronize ();
-				 cudaCheckError ();
-				 */
-
-				//Multiple threads per atom implementation
-				blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + 
-					(((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-				ker_mt_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE, 
-							       //sizeof (int) * (NBRS_BLOCK_SIZE + (NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM)) >>>
-							       sizeof (int) *  2 * (NBRS_BLOCK_SIZE) >>>
-								       (system->d_my_atoms, system->my_ext_box, system->d_my_grid,
-									*(*dev_lists + FAR_NBRS), system->n, system->N);
-				cudaThreadSynchronize ();
-				cudaCheckError ();
-
-				/*
-				   ker_count_total_nbrs  <<<1, NBRS_BLOCK_SIZE, sizeof (int) * NBRS_BLOCK_SIZE>>>
-				   (*(*dev_lists + FAR_NBRS), system->N, d_num_far);
-				   cudaThreadSynchronize ();
-				   cudaCheckError ();
-				   copy_host_device (&num_far, d_num_far, sizeof (int), cudaMemcpyDeviceToHost, "num_far");
-				 */
-
-				int *index = (int *) host_scratch;
-				memset (index , 0, 2 * sizeof (int) * system->N);
-				int *end_index = index + system->N;
-
-				copy_host_device (index, (*dev_lists + FAR_NBRS)->index, 
-						sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:index");
-				copy_host_device (end_index, (*dev_lists + FAR_NBRS)->end_index, 
-						sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:end_index");
-
-				num_far = 0;
-				for (int i = 0; i < system->N; i++)
-					num_far = end_index[i] - index[i];
-
-				dev_workspace->realloc.num_far = num_far;
+                cuda_memset (d_num_far, 0, sizeof (int), "num_far");
+
+                //invoke the kernel here
+                //one thread per atom implementation
+                /*
+                   blocks = (system->N / NBRS_BLOCK_SIZE) + 
+                   ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+                   ker_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE>>>
+                   (system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+                 *(*dev_lists + FAR_NBRS), system->n, system->N);
+                 cudaThreadSynchronize ();
+                 cudaCheckError ();
+                 */
+
+                //Multiple threads per atom implementation: NB_KER_THREADS_PER_ATOM threads per atom, rounded up to whole blocks
+                blocks = ((system->N * NB_KER_THREADS_PER_ATOM) / NBRS_BLOCK_SIZE) + 
+                    (((system->N * NB_KER_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+                ker_mt_generate_neighbor_lists <<<blocks, NBRS_BLOCK_SIZE, 
+                                   //sizeof (int) * (NBRS_BLOCK_SIZE + (NBRS_BLOCK_SIZE / NB_KER_THREADS_PER_ATOM)) >>>
+                                   sizeof (int) *  2 * (NBRS_BLOCK_SIZE) >>>
+                                       (system->d_my_atoms, system->my_ext_box, system->d_my_grid,
+                                    *(*dev_lists + FAR_NBRS), system->n, system->N);
+                cudaDeviceSynchronize ();
+                cudaCheckError ();
+
+                /*
+                   ker_count_total_nbrs  <<<1, NBRS_BLOCK_SIZE, sizeof (int) * NBRS_BLOCK_SIZE>>>
+                   (*(*dev_lists + FAR_NBRS), system->N, d_num_far);
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+                   copy_host_device (&num_far, d_num_far, sizeof (int), cudaMemcpyDeviceToHost, "num_far");
+                 */
+                // total the list lengths on the host instead: host_scratch holds system->N start indices followed by system->N end indices
+                int *index = (int *) host_scratch;
+                memset (index , 0, 2 * sizeof (int) * system->N);
+                int *end_index = index + system->N;
+                // NOTE(review): each copy moves (*dev_lists + FAR_NBRS)->n ints while the buffers are laid out for system->N - confirm n never exceeds N
+                copy_host_device (index, (*dev_lists + FAR_NBRS)->index, 
+                        sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:index");
+                copy_host_device (end_index, (*dev_lists + FAR_NBRS)->end_index, 
+                        sizeof (int) * (*dev_lists + FAR_NBRS)->n, cudaMemcpyDeviceToHost, "nbrs:end_index");
+                // accumulate over all atoms so num_far is the total neighbor count, matching what the disabled kernel path above computes
+                num_far = 0;
+                for (int i = 0; i < system->N; i++)
+                    num_far += end_index[i] - index[i];
+
+                dev_workspace->realloc.num_far = num_far;
 
 #if defined(LOG_PERFORMANCE)
-				if( system->my_rank == MASTER_NODE ) {
-					t_elapsed = Get_Timing_Info( t_start );
-					data->timing.nbrs += t_elapsed;
-				}
+                if( system->my_rank == MASTER_NODE ) {
+                    t_elapsed = Get_Timing_Info( t_start );
+                    data->timing.nbrs += t_elapsed;
+                }
 #endif
 
 #if defined(DEBUG_FOCUS)  
-				fprintf( stderr, "p%d @ step%d: nbrs done - num_far=%d\n", 
-						system->my_rank, data->step, num_far );
-				MPI_Barrier( MPI_COMM_WORLD );
+                fprintf( stderr, "p%d @ step%d: nbrs done - num_far=%d\n", 
+                        system->my_rank, data->step, num_far );
+                MPI_Barrier( MPI_COMM_WORLD );
 #endif
-			}
-
-			CUDA_GLOBAL void ker_estimate_neighbors (	reax_atom *my_atoms, 
-					simulation_box my_ext_box,
-					grid g,
-					int n,
-					int N, 
-					int *indices)
-			{
-				int  i, j, k, l, m, itr, num_far;
-				real d, cutoff;
-				rvec dvec, c;
-				ivec nbrs_x;
-				grid_cell *gci, *gcj;
-				far_neighbor_data *nbr_data;//, *my_start;
-				reax_atom *atom1, *atom2;
-
-				l = blockIdx.x * blockDim.x  + threadIdx.x;
-				if (l >= N) return;
-
-				num_far = 0;
-				atom1 = &(my_atoms[l]);
-				indices [l] = 0;
-
-				//if (atom1->orig_id < 0) return;
-
-				//get the coordinates of the atom and 
-				//compute the grid cell
-				if (l < n) {
-					for (i = 0; i < 3; i++)
-					{
-						c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   
-						if( c[i] >= g.native_end[i] )
-							c[i] = g.native_end[i] - 1;
-						else if( c[i] < g.native_str[i] )
-							c[i] = g.native_str[i];
-					}
-				} else {
-					for (i = 0; i < 3; i++)
-					{
-						c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
-						if( c[i] < 0 ) c[i] = 0;
-						else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
-					}
-				}
-
-				i = c[0];
-				j = c[1];
-				k = c[2];
-
-				//gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
-				//cutoff = SQR(gci->cutoff);
-				cutoff = SQR(g.cutoff [index_grid_3d (i, j, k, &g) ]);
-
-				itr = 0;
-				while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { 
-					ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-					//gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-
-					if( //(g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) &&  
-							(Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) 
-					{
-						// pick up another atom from the neighbor cell 
-						for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-								m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
-						{
-							if( l < m ) { // prevent recounting same pairs within a gcell 
-								atom2 = &(my_atoms[m]);
-								dvec[0] = atom2->x[0] - atom1->x[0];
-								dvec[1] = atom2->x[1] - atom1->x[1];
-								dvec[2] = atom2->x[2] - atom1->x[2];
-								d = rvec_Norm_Sqr( dvec );
-								if( d <= cutoff ) { 
-									num_far ++;
-								}
-							}   
-						}
-					}
-					++itr;
-
-				}   
-
-				itr = 0;
-				while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { 
-					ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
-					//gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
-					cutoff = SQR(g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]);
-
-					if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  
-							(Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) 
-					{
-						// pick up another atom from the neighbor cell 
-						for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
-								m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
-						{
-							if( l > m ) { // prevent recounting same pairs within a gcell 
-								atom2 = &(my_atoms[m]);
-								dvec[0] = atom2->x[0] - atom1->x[0];
-								dvec[1] = atom2->x[1] - atom1->x[1];
-								dvec[2] = atom2->x[2] - atom1->x[2];
-								d = rvec_Norm_Sqr( dvec );
-								if( d <= cutoff ) { 
-									num_far ++;
-								}
-							}   
-						}
-					}
-					++itr;
-				}   
-
-				indices [l] = num_far;// * SAFE_ZONE;
-			}
-
-			void Cuda_Estimate_Neighbors( reax_system *system, int *nbr_indices )
-			{
-				int blocks, num_nbrs;
-				int *indices = (int *) scratch;
-				reax_list *far_nbrs;
-
-				cuda_memset (indices, 0, sizeof (int) * system->total_cap, 
-						"neighbors:indices");
-
-				blocks = system->N / DEF_BLOCK_SIZE + 
-					((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-				ker_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>>
-					(system->d_my_atoms, (system->my_ext_box), system->d_my_grid, 
-					 system->n, system->N, indices);
-				cudaThreadSynchronize ();
-				cudaCheckError ();
-
-				copy_host_device (nbr_indices, indices, sizeof (int) * system->total_cap, 
-						cudaMemcpyDeviceToHost, "nbrs:indices");
-			}
-
-			void Cuda_Init_Neighbors_Indices (int *indices, int entries)
-			{
-				reax_list *far_nbrs = *dev_lists + FAR_NBRS;
-
-				copy_host_device (indices, (far_nbrs->index + 1), (entries -1) * sizeof (int), 
-						cudaMemcpyHostToDevice, "nbrs:index");
-				copy_host_device (indices, (far_nbrs->end_index + 1), (entries-1) * sizeof (int), 
-						cudaMemcpyHostToDevice, "nbrs:end_index");
-			}
-
-			void Cuda_Init_HBond_Indices (int *indices, int entries)
-			{
-				reax_list *hbonds = *dev_lists + HBONDS;
-
-				for (int i = 1 ; i < entries; i++)
-					indices [i] += indices [i-1];
-
-				copy_host_device (indices, hbonds->index + 1, (entries-1) * sizeof (int), 
-						cudaMemcpyHostToDevice, "hbonds:index");
-				copy_host_device (indices, hbonds->end_index + 1, (entries-1) * sizeof (int), 
-						cudaMemcpyHostToDevice, "hbonds:end_index");
-			}
-
-			void Cuda_Init_Bond_Indices (int *indices, int entries, int num_intrs)
-			{
-				reax_list *bonds = *dev_lists + BONDS;
-
-				indices[0] = MAX( indices[0]*2, MIN_BONDS);
-				for (int i = 1 ; i < entries; i++) {
-					indices[i] = MAX( indices[i]*2, MIN_BONDS);
-				}
-
-				for (int i = 1 ; i < entries; i++) {
-					indices[i] += indices[i-1];
-				}
-
-				copy_host_device (indices, (bonds->index + 1), (entries - 1) * sizeof (int), 
-						cudaMemcpyHostToDevice, "bonds:index");
-				copy_host_device (indices, (bonds->end_index + 1), (entries - 1) * sizeof (int), 
-						cudaMemcpyHostToDevice, "bonds:end_index");
-
-				for (int i = 1 ; i < entries; i++)
-					if (indices [i] > num_intrs) {
-						fprintf (stderr, "We have a problem here ==> %d index: %d, num_intrs: %d \n", 
-								i, indices[i], num_intrs);	
-						exit (-1);
-					}
-			}
-
-			/*
-
-			   CUDA_GLOBAL void ker_validate_neighbors (reax_atom *my_atoms, 
-			   reax_list far_nbrs, 
-			   int N)
-			   {
-			   int i, j, pj;
-			   far_neighbor_data *nbr_pj;
-			   reax_atom *atom_i;
-			   int start_i, end_i;
-
-			   i = blockIdx.x * blockDim.x + threadIdx.x;
-			   if (i >= N) return;
-
-			   atom_i = &( my_atoms[i] );
-			   start_i = Dev_Start_Index (i, &far_nbrs );
-			   end_i = Dev_End_Index (i, &far_nbrs );
-
-			   for( pj = start_i; pj < end_i; ++pj ) {
-			   nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-			   j = nbr_pj->nbr;
-			   nbr_pj->d = 0;
-			   rvec_MakeZero (nbr_pj->dvec);
-			   }
-			   }
-
-			   void validate_neighbors (reax_system *system)
-			   {
-			   int blocks;
-			   blocks = (system->N / NBRS_BLOCK_SIZE) + 
-			   ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-			   ker_validate_neighbors <<< blocks, NBRS_BLOCK_SIZE>>>
-			   (system->d_my_atoms, *(*dev_lists + FAR_NBRS), system->N);
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-
-			   fprintf (stderr, " Neighbors validated and is fine... \n");
-			   }
-
-			 */
+            }
+
+            CUDA_GLOBAL void ker_estimate_neighbors (    reax_atom *my_atoms, // atoms; thread l reads my_atoms[l] and candidate partners my_atoms[m]
+                    simulation_box my_ext_box, // only my_ext_box.min is read, as the origin for the cell lookup
+                    grid g, // fields read: inv_len, native_str, native_end, ncells, cutoff, nbrs_x, nbrs_cp, str, end
+                    int n, // atoms with l < n are clamped to the native cell range, the others to [0, ncells - 1]
+                    int N, // number of atoms processed; threads with l >= N return immediately
+                    int *indices) // output: indices[l] receives the count computed for atom l
+            { // One thread per atom l: count atoms m that pass the distance test against atom l and store the count in indices[l].
+                int  i, j, k, l, m, itr, num_far;
+                real d, cutoff;
+                rvec dvec, c;
+                ivec nbrs_x;
+                grid_cell *gci, *gcj; // only mentioned in commented-out lines below; unused by the live code
+                far_neighbor_data *nbr_data;//, *my_start;  (nbr_data itself is not used in the live code below)
+                reax_atom *atom1, *atom2;
+
+                l = blockIdx.x * blockDim.x  + threadIdx.x; // flat global thread index, used as the atom index
+                if (l >= N) return;
+
+                num_far = 0;
+                atom1 = &(my_atoms[l]);
+                indices [l] = 0;
+
+                //if (atom1->orig_id < 0) return;
+
+                //get the coordinates of the atom and 
+                //compute the grid cell
+                if (l < n) { // atom index below n: clamp each cell coordinate into [native_str, native_end - 1]
+                    for (i = 0; i < 3; i++)
+                    {
+                        c[i] = (int)((my_atoms[l].x[i]- my_ext_box.min[i])*g.inv_len[i]);   // truncated to int, then stored in c (declared as an rvec)
+                        if( c[i] >= g.native_end[i] )
+                            c[i] = g.native_end[i] - 1;
+                        else if( c[i] < g.native_str[i] )
+                            c[i] = g.native_str[i];
+                    }
+                } else { // remaining atoms: clamp each cell coordinate into [0, ncells - 1] instead
+                    for (i = 0; i < 3; i++)
+                    {
+                        c[i] = (int)((my_atoms[l].x[i] - my_ext_box.min[i]) * g.inv_len[i]);
+                        if( c[i] < 0 ) c[i] = 0;
+                        else if( c[i] >= g.ncells[i] ) c[i] = g.ncells[i] - 1;
+                    }
+                }
+
+                i = c[0]; // from here on i, j, k are the atom's cell coordinates (i is no longer a loop counter)
+                j = c[1];
+                k = c[2];
+
+                //gci = &( g.cells[ index_grid_3d (i, j, k, &g) ] );
+                //cutoff = SQR(gci->cutoff);
+                cutoff = SQR(g.cutoff [index_grid_3d (i, j, k, &g) ]); // SQR of the cutoff stored for the atom's own cell; compared against rvec_Norm_Sqr results below
+
+                itr = 0; // pass 1: count partners m with l < m, using the own-cell cutoff
+                while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { // walk this cell's neighbor-cell list until an entry whose first component is negative
+                    ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
+                    //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
+
+                    if( //(g.str[index_grid_3d (i, j, k, &g)] <= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]) &&  
+                            (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) 
+                    {
+                        // pick up another atom from the neighbor cell 
+                        for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                                m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
+                        {
+                            if( l < m ) { // prevent recounting same pairs within a gcell 
+                                atom2 = &(my_atoms[m]);
+                                dvec[0] = atom2->x[0] - atom1->x[0];
+                                dvec[1] = atom2->x[1] - atom1->x[1];
+                                dvec[2] = atom2->x[2] - atom1->x[2];
+                                d = rvec_Norm_Sqr( dvec );
+                                if( d <= cutoff ) { 
+                                    num_far ++;
+                                }
+                            }   
+                        }
+                    }
+                    ++itr;
+
+                }   
+
+                itr = 0; // pass 2: count partners m with l > m, using each neighbor cell's cutoff
+                while( (g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)][0]) >= 0) { 
+                    ivec_Copy (nbrs_x, g.nbrs_x[index_grid_nbrs (i, j, k, itr, &g)] );
+                    //gcj =  &( g.cells [ index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g) ]);
+                    cutoff = SQR(g.cutoff[index_grid_3d(nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]); // overwrites cutoff with the value stored for the neighbor cell
+
+                    if( g.str[index_grid_3d (i, j, k, &g)] >= g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)] &&  // extra condition vs. pass 1: own cell's str index >= neighbor cell's str index
+                            (Dev_DistSqr_to_Special_Point(g.nbrs_cp[index_grid_nbrs (i, j, k, itr, &g)],atom1->x)<=cutoff) ) 
+                    {
+                        // pick up another atom from the neighbor cell 
+                        for( m = g.str[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; 
+                                m < g.end[index_grid_3d (nbrs_x[0], nbrs_x[1], nbrs_x[2], &g)]; ++m )
+                        {
+                            if( l > m ) { // prevent recounting same pairs within a gcell 
+                                atom2 = &(my_atoms[m]);
+                                dvec[0] = atom2->x[0] - atom1->x[0];
+                                dvec[1] = atom2->x[1] - atom1->x[1];
+                                dvec[2] = atom2->x[2] - atom1->x[2];
+                                d = rvec_Norm_Sqr( dvec );
+                                if( d <= cutoff ) { 
+                                    num_far ++;
+                                }
+                            }   
+                        }
+                    }
+                    ++itr;
+                }   
+
+                indices [l] = num_far;// * SAFE_ZONE;  (total of both passes; the SAFE_ZONE scaling is disabled)
+            }
+
+            void Cuda_Estimate_Neighbors( reax_system *system, int *nbr_indices ) // Launches ker_estimate_neighbors over system->N atoms and copies the resulting per-atom counts into nbr_indices.
+            {
+                int blocks, num_nbrs; // num_nbrs is never used in this function
+                int *indices = (int *) scratch; // `scratch` is declared outside this function; presumably a device scratch buffer -- TODO confirm
+                reax_list *far_nbrs; // never used in this function
+
+                cuda_memset (indices, 0, sizeof (int) * system->total_cap, // presumably zero-fills total_cap ints; the kernel itself only writes entries 0..N-1
+                        "neighbors:indices");
+
+                blocks = system->N / DEF_BLOCK_SIZE + // ceil(N / DEF_BLOCK_SIZE): one extra block when N is not a multiple
+                    ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+                ker_estimate_neighbors <<< blocks, DEF_BLOCK_SIZE >>>
+                    (system->d_my_atoms, (system->my_ext_box), system->d_my_grid, 
+                     system->n, system->N, indices);
+                cudaThreadSynchronize (); // wait for the kernel before checking errors; NOTE(review): deprecated in the CUDA runtime in favor of cudaDeviceSynchronize
+                cudaCheckError ();
+
+                copy_host_device (nbr_indices, indices, sizeof (int) * system->total_cap, // transfers total_cap ints (not just N) with the device-to-host direction flag
+                        cudaMemcpyDeviceToHost, "nbrs:indices");
+            }
+
+            void Cuda_Init_Neighbors_Indices (int *indices, int entries) // Uploads indices[0..entries-2] into slots 1..entries-1 of the FAR_NBRS list's index and end_index arrays.
+            {
+                reax_list *far_nbrs = *dev_lists + FAR_NBRS; // FAR_NBRS entry of the dev_lists array (declared outside this function)
+
+                copy_host_device (indices, (far_nbrs->index + 1), (entries -1) * sizeof (int), // destination starts at slot 1, so slot 0 is not written here
+                        cudaMemcpyHostToDevice, "nbrs:index");
+                copy_host_device (indices, (far_nbrs->end_index + 1), (entries-1) * sizeof (int), // same host data goes to end_index; no running sum is done here, so presumably the caller supplies offsets -- TODO confirm
+                        cudaMemcpyHostToDevice, "nbrs:end_index");
+            }
+
+            void Cuda_Init_HBond_Indices (int *indices, int entries) // Converts indices to running sums in place, then uploads the first entries-1 sums into slots 1..entries-1 of the HBONDS index and end_index arrays.
+            {
+                reax_list *hbonds = *dev_lists + HBONDS; // HBONDS entry of the dev_lists array (declared outside this function)
+
+                for (int i = 1 ; i < entries; i++) // in-place running sum: indices[i] becomes the total of entries 0..i (modifies the caller's array)
+                    indices [i] += indices [i-1];
+
+                copy_host_device (indices, hbonds->index + 1, (entries-1) * sizeof (int), // destination starts at slot 1, so slot 0 is not written here
+                        cudaMemcpyHostToDevice, "hbonds:index");
+                copy_host_device (indices, hbonds->end_index + 1, (entries-1) * sizeof (int), // end_index receives the same values as index
+                        cudaMemcpyHostToDevice, "hbonds:end_index");
+            }
+
+            void Cuda_Init_Bond_Indices (int *indices, int entries, int num_intrs) // Pads per-entry counts, turns them into running sums in place, uploads them to the BONDS list, and exits if a sum exceeds num_intrs.
+            {
+                reax_list *bonds = *dev_lists + BONDS; // BONDS entry of the dev_lists array (declared outside this function)
+
+                indices[0] = MAX( indices[0]*2, MIN_BONDS); // each count is doubled with a floor of MIN_BONDS; entry 0 is handled here, the rest in the loop
+                for (int i = 1 ; i < entries; i++) {
+                    indices[i] = MAX( indices[i]*2, MIN_BONDS);
+                }
+
+                for (int i = 1 ; i < entries; i++) { // in-place running sum: indices[i] becomes the total of padded entries 0..i
+                    indices[i] += indices[i-1];
+                }
+
+                copy_host_device (indices, (bonds->index + 1), (entries - 1) * sizeof (int), // first entries-1 sums go to slots 1..entries-1; slot 0 is not written here
+                        cudaMemcpyHostToDevice, "bonds:index");
+                copy_host_device (indices, (bonds->end_index + 1), (entries - 1) * sizeof (int), // end_index receives the same values as index
+                        cudaMemcpyHostToDevice, "bonds:end_index");
+
+                for (int i = 1 ; i < entries; i++) // capacity check; note it runs after the uploads above and does not examine entry 0
+                    if (indices [i] > num_intrs) {
+                        fprintf (stderr, "We have a problem here ==> %d index: %d, num_intrs: %d \n", 
+                                i, indices[i], num_intrs);    
+                        exit (-1); // terminates the process on the first running sum that exceeds num_intrs
+                    }
+            }
+
+            /*
+
+               CUDA_GLOBAL void ker_validate_neighbors (reax_atom *my_atoms, 
+               reax_list far_nbrs, 
+               int N)
+               {
+               int i, j, pj;
+               far_neighbor_data *nbr_pj;
+               reax_atom *atom_i;
+               int start_i, end_i;
+
+               i = blockIdx.x * blockDim.x + threadIdx.x;
+               if (i >= N) return;
+
+               atom_i = &( my_atoms[i] );
+               start_i = Dev_Start_Index (i, &far_nbrs );
+               end_i = Dev_End_Index (i, &far_nbrs );
+
+               for( pj = start_i; pj < end_i; ++pj ) {
+               nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+               j = nbr_pj->nbr;
+               nbr_pj->d = 0;
+               rvec_MakeZero (nbr_pj->dvec);
+               }
+               }
+
+               void validate_neighbors (reax_system *system)
+               {
+               int blocks;
+               blocks = (system->N / NBRS_BLOCK_SIZE) + 
+               ((system->N % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
+               ker_validate_neighbors <<< blocks, NBRS_BLOCK_SIZE>>>
+               (system->d_my_atoms, *(*dev_lists + FAR_NBRS), system->N);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+
+               fprintf (stderr, " Neighbors validated and is fine... \n");
+               }
+
+             */
diff --git a/PG-PuReMD/src/cuda_nonbonded.cu b/PG-PuReMD/src/cuda_nonbonded.cu
index 6dc60b06..15eae7bc 100644
--- a/PG-PuReMD/src/cuda_nonbonded.cu
+++ b/PG-PuReMD/src/cuda_nonbonded.cu
@@ -30,590 +30,590 @@
 #include "cuda_shuffle.h"
 
 CUDA_GLOBAL void ker_vdW_coulomb_energy( 
-		//CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy( 	
-		reax_atom *my_atoms, 
-		two_body_parameters *tbp,
-		global_parameters gp, 
-		control_params *control, 
-		storage p_workspace, 
-		reax_list p_far_nbrs, 
-		int n, int N, int num_atom_types, 
-		real *data_e_vdW, real *data_e_ele, 
-		rvec *data_ext_press)
-		{
+        //CUDA_GLOBAL void __launch_bounds__ (960) ker_vdW_coulomb_energy(    
+        reax_atom *my_atoms, 
+        two_body_parameters *tbp,
+        global_parameters gp, 
+        control_params *control, 
+        storage p_workspace, 
+        reax_list p_far_nbrs, 
+        int n, int N, int num_atom_types, 
+        real *data_e_vdW, real *data_e_ele, 
+        rvec *data_ext_press)
+        {
 
 #if defined(__SM_35__)
-		real sh_vdw;
-		real sh_ele;
-		rvec sh_force;
+        real sh_vdw;
+        real sh_ele;
+        rvec sh_force;
 
 #else
 
-		extern __shared__ real _vdw[];
-		extern __shared__ real _ele[];
-		extern __shared__ rvec _force [];
+        extern __shared__ real _vdw[];
+        extern __shared__ real _ele[];
+        extern __shared__ rvec _force [];
 
-		real *sh_vdw;
-		real *sh_ele;
-		rvec *sh_force;
+        real *sh_vdw;
+        real *sh_ele;
+        rvec *sh_force;
 
 #endif
 
 
-		int i, j, pj, natoms;
-		int start_i, end_i, orig_i, orig_j;
-		real p_vdW1, p_vdW1i;
-		real powr_vdW1, powgi_vdW1;
-		real tmp, r_ij, fn13, exp1, exp2;
-		real Tap, dTap, dfn13, CEvd, CEclmb, de_core;
-		real dr3gamij_1, dr3gamij_3;
-		real e_ele, e_vdW, e_core;
-		rvec temp, ext_press;
-		two_body_parameters *twbp;
-		far_neighbor_data *nbr_pj;
-		reax_list *far_nbrs;
-		storage *workspace = &( p_workspace );
-		// rtensor temp_rtensor, total_rtensor;
-
-		int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-		int warpid = thread_id / VDW_KER_THREADS_PER_ATOM;
-		int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); 
+        int i, j, pj, natoms;
+        int start_i, end_i, orig_i, orig_j;
+        real p_vdW1, p_vdW1i;
+        real powr_vdW1, powgi_vdW1;
+        real tmp, r_ij, fn13, exp1, exp2;
+        real Tap, dTap, dfn13, CEvd, CEclmb, de_core;
+        real dr3gamij_1, dr3gamij_3;
+        real e_ele, e_vdW, e_core;
+        rvec temp, ext_press;
+        two_body_parameters *twbp;
+        far_neighbor_data *nbr_pj;
+        reax_list *far_nbrs;
+        storage *workspace = &( p_workspace );
+        // rtensor temp_rtensor, total_rtensor;
+
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int warpid = thread_id / VDW_KER_THREADS_PER_ATOM;
+        int laneid = thread_id & (VDW_KER_THREADS_PER_ATOM -1); 
 
 #if defined(__SM_35__)
-		sh_vdw = 0.0;
-		sh_ele = 0.0;
-		rvec_MakeZero ( sh_force );
+        sh_vdw = 0.0;
+        sh_ele = 0.0;
+        rvec_MakeZero ( sh_force );
 #else
-		sh_vdw = _vdw;
-		sh_ele = _vdw + blockDim.x;
-		sh_force = (rvec *)( _vdw + 2*blockDim.x);
+        sh_vdw = _vdw;
+        sh_ele = _vdw + blockDim.x;
+        sh_force = (rvec *)( _vdw + 2*blockDim.x);
 
-		sh_vdw[threadIdx.x] = 0.0;
-		sh_ele[threadIdx.x] = 0.0;
-		rvec_MakeZero ( sh_force [threadIdx.x] );
+        sh_vdw[threadIdx.x] = 0.0;
+        sh_ele[threadIdx.x] = 0.0;
+        rvec_MakeZero ( sh_force [threadIdx.x] );
 #endif
 
-		//i = blockIdx.x * blockDim.x + threadIdx.x;
-		//if (i >= N) return;
-		i = warpid;
-
-		if (i < N)
-		{
-			natoms = n;
-			far_nbrs = &( p_far_nbrs );
-			p_vdW1 = gp.l[28];
-			p_vdW1i = 1.0 / p_vdW1;
-			e_core = 0;
-			e_vdW = 0;
-
-			data_e_vdW [i] = 0;
-			data_e_ele [i] = 0;
-
-			//for( i = 0; i < natoms; ++i ) {
-			start_i = Dev_Start_Index(i, far_nbrs);
-			end_i   = Dev_End_Index(i, far_nbrs);
-			orig_i  = my_atoms[i].orig_id;
-			//fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i );
-
-			//for( pj = start_i; pj < end_i; ++pj )
-			pj = start_i + laneid;
-			while (pj < end_i)
-			{
-
-				nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
-				j = nbr_pj->nbr;
-				orig_j  = my_atoms[j].orig_id;
-
-				if( nbr_pj->d <= control->nonb_cut && 
-						(((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
-						 || ((i > j) && (i < natoms) && (j < natoms)) 
-						 || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
-				{ // ji with j >= n
-					r_ij = nbr_pj->d;
-					twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[j].type, num_atom_types) ]);
-
-					/* Calculate Taper and its derivative */
-					// Tap = nbr_pj->Tap;   -- precomputed during compte_H
-					Tap = workspace->Tap[7] * r_ij + workspace->Tap[6];
-					Tap = Tap * r_ij + workspace->Tap[5];
-					Tap = Tap * r_ij + workspace->Tap[4];
-					Tap = Tap * r_ij + workspace->Tap[3];
-					Tap = Tap * r_ij + workspace->Tap[2];
-					Tap = Tap * r_ij + workspace->Tap[1];
-					Tap = Tap * r_ij + workspace->Tap[0];
-
-					dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6];
-					dTap = dTap * r_ij + 5*workspace->Tap[5];
-					dTap = dTap * r_ij + 4*workspace->Tap[4];
-					dTap = dTap * r_ij + 3*workspace->Tap[3];
-					dTap = dTap * r_ij + 2*workspace->Tap[2];
-					dTap += workspace->Tap[1]/r_ij;
-
-					/*vdWaals Calculations*/
-					if(gp.vdw_type==1 || gp.vdw_type==3)
-					{ // shielding
-						powr_vdW1 = POW(r_ij, p_vdW1);
-						powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-						fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-						exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-						exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-						e_vdW = twbp->D * (exp1 - 2.0 * exp2);	  
-
-						//data_e_vdW [i] += Tap * e_vdW;
-						//	 data_e_vdW [i] += Tap * e_vdW / 2.0;
+        //i = blockIdx.x * blockDim.x + threadIdx.x;
+        //if (i >= N) return;
+        i = warpid;
+
+        if (i < N)
+        {
+            natoms = n;
+            far_nbrs = &( p_far_nbrs );
+            p_vdW1 = gp.l[28];
+            p_vdW1i = 1.0 / p_vdW1;
+            e_core = 0;
+            e_vdW = 0;
+
+            data_e_vdW [i] = 0;
+            data_e_ele [i] = 0;
+
+            //for( i = 0; i < natoms; ++i ) {
+            start_i = Dev_Start_Index(i, far_nbrs);
+            end_i   = Dev_End_Index(i, far_nbrs);
+            orig_i  = my_atoms[i].orig_id;
+            //fprintf( stderr, "i:%d, start_i: %d, end_i: %d\n", i, start_i, end_i );
+
+            //for( pj = start_i; pj < end_i; ++pj )
+            pj = start_i + laneid;
+            while (pj < end_i)
+            {
+
+                nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
+                j = nbr_pj->nbr;
+                orig_j  = my_atoms[j].orig_id;
+
+                if( nbr_pj->d <= control->nonb_cut && 
+                        (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
+                         || ((i > j) && (i < natoms) && (j < natoms)) 
+                         || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
+                { // ji with j >= n
+                    r_ij = nbr_pj->d;
+                    twbp = &(tbp[ index_tbp (my_atoms[i].type, my_atoms[j].type, num_atom_types) ]);
+
+                    /* Calculate Taper and its derivative */
+                    // Tap = nbr_pj->Tap;   -- precomputed during compte_H
+                    Tap = workspace->Tap[7] * r_ij + workspace->Tap[6];
+                    Tap = Tap * r_ij + workspace->Tap[5];
+                    Tap = Tap * r_ij + workspace->Tap[4];
+                    Tap = Tap * r_ij + workspace->Tap[3];
+                    Tap = Tap * r_ij + workspace->Tap[2];
+                    Tap = Tap * r_ij + workspace->Tap[1];
+                    Tap = Tap * r_ij + workspace->Tap[0];
+
+                    dTap = 7*workspace->Tap[7] * r_ij + 6*workspace->Tap[6];
+                    dTap = dTap * r_ij + 5*workspace->Tap[5];
+                    dTap = dTap * r_ij + 4*workspace->Tap[4];
+                    dTap = dTap * r_ij + 3*workspace->Tap[3];
+                    dTap = dTap * r_ij + 2*workspace->Tap[2];
+                    dTap += workspace->Tap[1]/r_ij;
+
+                    /*vdWaals Calculations*/
+                    if(gp.vdw_type==1 || gp.vdw_type==3)
+                    { // shielding
+                        powr_vdW1 = POW(r_ij, p_vdW1);
+                        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+                        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+                        e_vdW = twbp->D * (exp1 - 2.0 * exp2);      
+
+                        //data_e_vdW [i] += Tap * e_vdW;
+                        //     data_e_vdW [i] += Tap * e_vdW / 2.0;
 #if defined(__SM_35__)
-						sh_vdw  += Tap * e_vdW / 2.0;
+                        sh_vdw  += Tap * e_vdW / 2.0;
 #else
-						sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
+                        sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
 #endif
 
-						dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-							POW(r_ij, p_vdW1 - 2.0);
+                        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+                            POW(r_ij, p_vdW1 - 2.0);
 
-						CEvd = dTap * e_vdW - 
-							Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-					}
-					else{ // no shielding
-						exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-						exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                        CEvd = dTap * e_vdW - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+                    }
+                    else{ // no shielding
+                        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
-						e_vdW = twbp->D * (exp1 - 2.0 * exp2);
+                        e_vdW = twbp->D * (exp1 - 2.0 * exp2);
 
-						//data_e_vdW [i] += Tap * e_vdW;
-						//data_e_vdW [i] += Tap * e_vdW / 2.0;
+                        //data_e_vdW [i] += Tap * e_vdW;
+                        //data_e_vdW [i] += Tap * e_vdW / 2.0;
 #if defined(__SM_35__)
-						sh_vdw += Tap * e_vdW / 2.0;
+                        sh_vdw += Tap * e_vdW / 2.0;
 #else
-						sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
+                        sh_vdw [threadIdx.x] += Tap * e_vdW / 2.0;
 #endif
 
-						CEvd = dTap * e_vdW - 
-							Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
-					}
+                        CEvd = dTap * e_vdW - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+                    }
 
-					if(gp.vdw_type==2 || gp.vdw_type==3)
-					{ // innner wall
-						e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    if(gp.vdw_type==2 || gp.vdw_type==3)
+                    { // innner wall
+                        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
 
-						//data_e_vdW [i] += Tap * e_core;
-						//data_e_vdW [i] += Tap * e_core / 2.0;
+                        //data_e_vdW [i] += Tap * e_core;
+                        //data_e_vdW [i] += Tap * e_core / 2.0;
 #if defined(__SM_35__)
-						sh_vdw += Tap * e_core / 2.0;
+                        sh_vdw += Tap * e_core / 2.0;
 #else
-						sh_vdw[ threadIdx.x ] += Tap * e_core / 2.0;
+                        sh_vdw[ threadIdx.x ] += Tap * e_core / 2.0;
 #endif
 
-						de_core = -(twbp->acore/twbp->rcore) * e_core;
-						CEvd += dTap * e_core + Tap * de_core;
-					}
+                        de_core = -(twbp->acore/twbp->rcore) * e_core;
+                        CEvd += dTap * e_core + Tap * de_core;
+                    }
 
-					/*Coulomb Calculations*/
-					dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-					dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+                    /*Coulomb Calculations*/
+                    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
 
-					tmp = Tap / dr3gamij_3;
-					//data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
-					e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
-					//data_e_ele [i] += e_ele;
-					//data_e_ele [i] += e_ele  / 2.0;
+                    tmp = Tap / dr3gamij_3;
+                    //data_e_ele [i] += e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
+                    e_ele = C_ele * my_atoms[i].q * my_atoms[j].q * tmp;
+                    //data_e_ele [i] += e_ele;
+                    //data_e_ele [i] += e_ele  / 2.0;
 #if defined(__SM_35__)
-					sh_ele += e_ele  / 2.0;
+                    sh_ele += e_ele  / 2.0;
 #else
-					sh_ele [ threadIdx.x ] += e_ele  / 2.0;
+                    sh_ele [ threadIdx.x ] += e_ele  / 2.0;
 #endif
 
 
-					CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * 
-						( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-					// fprintf( fout, "%5d %5d %10.6f %10.6f\n",
-					//   MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ),
-					//   MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), 
-					//   CEvd, CEclmb );       	  		  
+                    CEclmb = C_ele * my_atoms[i].q * my_atoms[j].q * 
+                        ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                    // fprintf( fout, "%5d %5d %10.6f %10.6f\n",
+                    //   MIN( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ),
+                    //   MAX( system->my_atoms[i].orig_id, system->my_atoms[j].orig_id ), 
+                    //   CEvd, CEclmb );                  
 
-					if( control->virial == 0 ) {
-						if ( i < j ) 
-							//rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+                    if( control->virial == 0 ) {
+                        if ( i < j ) 
+                            //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
 #if defined (__SM_35__)
-							rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec );
+                            rvec_ScaledAdd( sh_force, -(CEvd + CEclmb), nbr_pj->dvec );
 #else
-						rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force[ threadIdx.x ], -(CEvd + CEclmb), nbr_pj->dvec );
 #endif
-						else 
-							//rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
+                        else 
+                            //rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
 #if defined (__SM_35__)
-							rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec );
+                            rvec_ScaledAdd( sh_force , +(CEvd + CEclmb), nbr_pj->dvec );
 #else
-						rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force [ threadIdx.x ], +(CEvd + CEclmb), nbr_pj->dvec );
 #endif
-						//rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
-					}
-					else { /* NPT, iNPT or sNPT */
-						/* for pressure coupling, terms not related to bond order 
-						   derivatives are added directly into pressure vector/tensor */
-						rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-						rvec_ScaledAdd( workspace->f[i], -1., temp );
-						rvec_Add( workspace->f[j], temp );
-
-						rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-						rvec_Add( data_ext_press [i], ext_press );
-
-						// fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)
-						//   force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n", 
-						//   i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2],
-						//   temp[0], temp[1], temp[2],
-						//   data->ext_press[0], data->ext_press[1], data->ext_press[2] );
-					}
+                        //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+                    }
+                    else { /* NPT, iNPT or sNPT */
+                        /* for pressure coupling, terms not related to bond order 
+                           derivatives are added directly into pressure vector/tensor */
+                        rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                        rvec_ScaledAdd( workspace->f[i], -1., temp );
+                        rvec_Add( workspace->f[j], temp );
+
+                        rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                        rvec_Add( data_ext_press [i], ext_press );
+
+                        // fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)
+                        //   force(%f %f %f) ext_press (%12.6f %12.6f %12.6f)\n", 
+                        //   i, j, nbr_pj->rel_box[0], nbr_pj->rel_box[1], nbr_pj->rel_box[2],
+                        //   temp[0], temp[1], temp[2],
+                        //   data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+                    }
 
 #ifdef TEST_ENERGY
-					// fprintf( out_control->evdw, 
-					// "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", 
-					// workspace->Tap[7],workspace->Tap[6],workspace->Tap[5],
-					// workspace->Tap[4],workspace->Tap[3],workspace->Tap[2], 
-					// workspace->Tap[1], Tap );
-					//fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-					fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-							system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-							r_ij, e_vdW, data->my_en.e_vdW );
-					//fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-					fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-							system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-							r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
-							e_ele, data->my_en.e_ele );
+                    // fprintf( out_control->evdw, 
+                    // "%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f%12.9f\n", 
+                    // workspace->Tap[7],workspace->Tap[6],workspace->Tap[5],
+                    // workspace->Tap[4],workspace->Tap[3],workspace->Tap[2], 
+                    // workspace->Tap[1], Tap );
+                    //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+                    fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
+                            system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                            r_ij, e_vdW, data->my_en.e_vdW );
+                    //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                    fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                            system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                            r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
+                            e_ele, data->my_en.e_ele );
 #endif
 #ifdef TEST_FORCES
-					rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-					rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-					rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-					rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
-				}
+                }
 
-				pj += VDW_KER_THREADS_PER_ATOM;
+                pj += VDW_KER_THREADS_PER_ATOM;
 
-			}
-			//  }
-		} // if i < N
+            }
+            //  }
+        } // if i < N
 
 #if defined( __SM_35__)
-		for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){
-			sh_vdw += shfl( sh_vdw, x);
-			sh_ele += shfl( sh_ele, x );
-			sh_force[0] += shfl( sh_force[0], x );
-			sh_force[1] += shfl( sh_force[1], x );
-			sh_force[2] += shfl( sh_force[2], x );
-		}
-
-		if (laneid == 0) {
-			data_e_vdW[i] += sh_vdw;
-			data_e_ele[i] += sh_ele;
-			rvec_Add (workspace->f[i], sh_force );
-		}
+        for (int x = VDW_KER_THREADS_PER_ATOM >> 1; x >= 1; x/=2){
+            sh_vdw += shfl( sh_vdw, x);
+            sh_ele += shfl( sh_ele, x );
+            sh_force[0] += shfl( sh_force[0], x );
+            sh_force[1] += shfl( sh_force[1], x );
+            sh_force[2] += shfl( sh_force[2], x );
+        }
+
+        if (laneid == 0) {
+            data_e_vdW[i] += sh_vdw;
+            data_e_ele[i] += sh_ele;
+            rvec_Add (workspace->f[i], sh_force );
+        }
 
 #else
 
-		__syncthreads ();
-
-		if (laneid < 16) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-		}
-		__syncthreads ();
-		if (laneid < 8) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-		}
-		__syncthreads ();
-		if (laneid < 4) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-		}
-		__syncthreads ();
-		if (laneid < 2) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-		}
-		__syncthreads ();
-		if (laneid < 1) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-		}
-		__syncthreads ();
-		if (laneid == 0) {
-			data_e_vdW[i] += sh_vdw[threadIdx.x];
-			data_e_ele[i] += sh_ele[threadIdx.x];
-			rvec_Add (workspace->f[i], sh_force [ threadIdx.x ]);
-		}
+        __syncthreads ();
+
+        if (laneid < 16) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
+        }
+        __syncthreads ();
+        if (laneid < 8) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
+        }
+        __syncthreads ();
+        if (laneid < 4) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
+        }
+        __syncthreads ();
+        if (laneid < 2) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
+        }
+        __syncthreads ();
+        if (laneid < 1) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
+        }
+        __syncthreads ();
+        if (laneid == 0) {
+            data_e_vdW[i] += sh_vdw[threadIdx.x];
+            data_e_ele[i] += sh_ele[threadIdx.x];
+            rvec_Add (workspace->f[i], sh_force [ threadIdx.x ]);
+        }
 #endif
 
-		}
+        }
 
 
 CUDA_GLOBAL void ker_tabulated_vdW_coulomb_energy( reax_atom *my_atoms, 
-		global_parameters gp, 
-		control_params *control, 
-		storage p_workspace, 
-		reax_list p_far_nbrs, 
-		LR_lookup_table *t_LR,
-		int n, int N, int num_atom_types, 
-		int step, int prev_steps, 
-		int energy_update_freq, 
-		real *data_e_vdW, real *data_e_ele, 
-		rvec *data_ext_press)
+        global_parameters gp, 
+        control_params *control, 
+        storage p_workspace, 
+        reax_list p_far_nbrs, 
+        LR_lookup_table *t_LR,
+        int n, int N, int num_atom_types, 
+        int step, int prev_steps, 
+        int energy_update_freq, 
+        real *data_e_vdW, real *data_e_ele, 
+        rvec *data_ext_press)
 {
-	int i, j, pj, r, natoms, steps, update_freq, update_energies;
-	int type_i, type_j, tmin, tmax;
-	int start_i, end_i, orig_i, orig_j;
-	real r_ij, base, dif;
-	real e_vdW, e_ele;
-	real CEvd, CEclmb;
-	rvec temp, ext_press;
-	far_neighbor_data *nbr_pj;
-	reax_list *far_nbrs;
-	LR_lookup_table *t;
-
-	storage *workspace = &( p_workspace );
-
-	natoms = n;
-	far_nbrs = &( p_far_nbrs );
-	steps = step - prev_steps;
-	update_freq = energy_update_freq;
-	update_energies = update_freq > 0 && steps % update_freq == 0;
-	e_ele = e_vdW = 0;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	data_e_vdW [i] = 0;
-	data_e_ele [i] = 0;
-
-	//for( i = 0; i < natoms; ++i ) {
-	type_i  = my_atoms[i].type;
-	start_i = Dev_Start_Index(i,far_nbrs);
-	end_i   = Dev_End_Index(i,far_nbrs);
-	orig_i  = my_atoms[i].orig_id;
-
-	for( pj = start_i; pj < end_i; ++pj ) {
-		nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
-		j = nbr_pj->nbr;
-		orig_j  = my_atoms[j].orig_id;
-
-		//if( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) ) {
-		if( nbr_pj->d <= control->nonb_cut && 
-				(((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
-				 || ((i > j) && (i < natoms) && (j < natoms)) 
-				 || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
-		{ // ji with j >= n
-			j = nbr_pj->nbr;
-			type_j = my_atoms[j].type;
-			r_ij   = nbr_pj->d;
-			tmin  = MIN( type_i, type_j );
-			tmax  = MAX( type_i, type_j );
-
-			t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]);	
-
-			// table = &( LR[type_i][type_j] ); 
-
-			/* Cubic Spline Interpolation */
-			r = (int)(r_ij * t->inv_dx);
-			if( r == 0 )  ++r;
-			base = (real)(r+1) * t->dx;
-			dif = r_ij - base;
-			//fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
-
-			if( update_energies ) {
-				e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-					t->vdW[r].a;
-
-				e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-					t->ele[r].a;
-				e_ele *= my_atoms[i].q * my_atoms[j].q;
-
-				//data_e_vdW [i] += e_vdW;
-				data_e_vdW [i] += e_vdW / 2.0;
-				//data_e_ele [i] += e_ele;
-				data_e_ele [i] += e_ele / 2.0;
-			}	
-
-			CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-				t->CEvd[r].a;
-
-			CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-				t->CEclmb[r].a;
-			CEclmb *= my_atoms[i].q * my_atoms[j].q;
-
-			if( control->virial == 0 ) {
-				if ( i < j ) 
-					rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-				else 
-					rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
-				//rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
-				//rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
-			}
-			else { // NPT, iNPT or sNPT
-				/* for pressure coupling, terms not related to bond order derivatives
-				   are added directly into pressure vector/tensor */
-				rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-				rvec_ScaledAdd( workspace->f[i], -1., temp );
-				rvec_Add( workspace->f[j], temp );
-
-				rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-				rvec_Add( data_ext_press [i], ext_press );
-			}
+    int i, j, pj, r, natoms, steps, update_freq, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i, orig_i, orig_j;
+    real r_ij, base, dif;
+    real e_vdW, e_ele;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    reax_list *far_nbrs;
+    LR_lookup_table *t;
+
+    storage *workspace = &( p_workspace );
+
+    natoms = n;
+    far_nbrs = &( p_far_nbrs );
+    steps = step - prev_steps;
+    update_freq = energy_update_freq;
+    update_energies = update_freq > 0 && steps % update_freq == 0;
+    e_ele = e_vdW = 0;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    data_e_vdW [i] = 0;
+    data_e_ele [i] = 0;
+
+    //for( i = 0; i < natoms; ++i ) {
+    type_i  = my_atoms[i].type;
+    start_i = Dev_Start_Index(i,far_nbrs);
+    end_i   = Dev_End_Index(i,far_nbrs);
+    orig_i  = my_atoms[i].orig_id;
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &(far_nbrs->select.far_nbr_list[pj]);
+        j = nbr_pj->nbr;
+        orig_j  = my_atoms[j].orig_id;
+
+        //if( nbr_pj->d <= control->nonb_cut && (j < natoms || orig_i < orig_j) ) {
+        if( nbr_pj->d <= control->nonb_cut && 
+                (((i < j) && (i < natoms) && (j < natoms || orig_i < orig_j))
+                 || ((i > j) && (i < natoms) && (j < natoms)) 
+                 || (i > j && i >= natoms && j < natoms && orig_j < orig_i)))
+        { // ji with j >= n
+            j = nbr_pj->nbr;
+            type_j = my_atoms[j].type;
+            r_ij   = nbr_pj->d;
+            tmin  = MIN( type_i, type_j );
+            tmax  = MAX( type_i, type_j );
+
+            t = &( t_LR[ index_lr (tmin, tmax, num_atom_types) ]);    
+
+            // table = &( LR[type_i][type_j] ); 
+
+            /* Cubic Spline Interpolation */
+            r = (int)(r_ij * t->inv_dx);
+            if( r == 0 )  ++r;
+            base = (real)(r+1) * t->dx;
+            dif = r_ij - base;
+            //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
+
+            if( update_energies ) {
+                e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
+                    t->vdW[r].a;
+
+                e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                    t->ele[r].a;
+                e_ele *= my_atoms[i].q * my_atoms[j].q;
+
+                //data_e_vdW [i] += e_vdW;
+                data_e_vdW [i] += e_vdW / 2.0;
+                //data_e_ele [i] += e_ele;
+                data_e_ele [i] += e_ele / 2.0;
+            }    
+
+            CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
+                t->CEvd[r].a;
+
+            CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
+                t->CEclmb[r].a;
+            CEclmb *= my_atoms[i].q * my_atoms[j].q;
+
+            if( control->virial == 0 ) {
+                if ( i < j ) 
+                    rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+                else 
+                    rvec_ScaledAdd( workspace->f[i], +(CEvd + CEclmb), nbr_pj->dvec );
+                //rvec_ScaledAdd( workspace->f[i], -(CEvd + CEclmb), nbr_pj->dvec );
+                //rvec_ScaledAdd( workspace->f[j], +(CEvd + CEclmb), nbr_pj->dvec );
+            }
+            else { // NPT, iNPT or sNPT
+                /* for pressure coupling, terms not related to bond order derivatives
+                   are added directly into pressure vector/tensor */
+                rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                rvec_ScaledAdd( workspace->f[i], -1., temp );
+                rvec_Add( workspace->f[j], temp );
+
+                rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                rvec_Add( data_ext_press [i], ext_press );
+            }
 
 #ifdef TEST_ENERGY
-			//fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-			fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
-					system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
-					r_ij, e_vdW, data->my_en.e_vdW );
-			//fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-			fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-					system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
-					r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
-					e_ele, data->my_en.e_ele );
+            //fprintf( out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+            fprintf( out_control->evdw, "%6d%6d%12.4f%12.4f%12.4f\n",
+                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id, 
+                    r_ij, e_vdW, data->my_en.e_vdW );
+            //fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+            fprintf( out_control->ecou, "%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                    system->my_atoms[i].orig_id, system->my_atoms[j].orig_id,
+                    r_ij, system->my_atoms[i].q, system->my_atoms[j].q, 
+                    e_ele, data->my_en.e_ele );
 #endif
 #ifdef TEST_FORCES
-			rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-			rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-			rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-			rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+            rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
-		}
-	}
-	//  }
+        }
+    }
+    //  }
 }
 
 CUDA_GLOBAL void ker_pol_energy (reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		int n, 
-		real *data_e_pol)
+        single_body_parameters *sbp, 
+        int n, 
+        real *data_e_pol)
 {
-	int type_i;
-	real q;
+    int type_i;
+    real q;
 
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n) return;
 
-	data_e_pol [i] = 0;
+    data_e_pol [i] = 0;
 
-	//for( i = 0; i < system->n; i++ ) {
-	q = my_atoms[i].q;
-	type_i = my_atoms[i].type;
+    //for( i = 0; i < system->n; i++ ) {
+    q = my_atoms[i].q;
+    type_i = my_atoms[i].type;
 
-	data_e_pol[i] += 
-		KCALpMOL_to_EV * (sbp[type_i].chi * q + 
-				(sbp[type_i].eta / 2.) * SQR(q));
-	//}
+    data_e_pol[i] += 
+        KCALpMOL_to_EV * (sbp[type_i].chi * q + 
+                (sbp[type_i].eta / 2.) * SQR(q));
+    //}
 }
 
 void Cuda_Compute_Polarization_Energy( reax_system *system, simulation_data *data )
 {
-	int blocks;
-	real *spad = (real *) scratch;
-	cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "pol_energy");
-
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>>
-		( system->d_my_atoms, system->reax_param.d_sbp, 
-		  system->n, spad );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//Reduction for polarization energy
-	k_reduction <<< blocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
-		( spad, spad + system->n, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_reduction <<< 1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>>
-		( spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_pol, blocks);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks;
+    real *spad = (real *) scratch;
+    cuda_memset (spad, 0, sizeof (real) * 2 * system->n, "pol_energy");
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    ker_pol_energy <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, 
+          system->n, spad );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //Reduction for polarization energy
+    k_reduction <<< blocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
+        ( spad, spad + system->n, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<< 1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2>>>
+        ( spad + system->n, &((simulation_data *)data->d_simulation_data)->my_en.e_pol, blocks);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Cuda_NonBonded_Energy ( reax_system *system, control_params *control, 
-		storage *workspace, simulation_data *data,  reax_list **lists,
-		output_controls *out_control, bool isTabulated )
+        storage *workspace, simulation_data *data,  reax_list **lists,
+        output_controls *out_control, bool isTabulated )
 {
-	int blocks;
-	int rblocks;
-	int size = (2 * system->N + 2 * system->N ) * sizeof (real) + 
-		2 * system->N * sizeof (rvec);
-
-	rvec *spad_rvec;
-	real *spad = (real *) scratch;
-	cuda_memset (spad, 0, size, "pol_energy");
-
-	rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-	blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) 
-		+ (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
-
-	if (!isTabulated) {
-		ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>>
-			( system->d_my_atoms, system->reax_param.d_tbp, 
-			  system->reax_param.d_gp, (control_params *)control->d_control_params, 
-			  *(dev_workspace), *(*dev_lists + FAR_NBRS), 
-			  system->n, system->N, system->reax_param.num_atom_types, 
-			  spad, spad + 2 * system->N, (rvec *)(spad + 4 * system->N));
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-	} else {
-		ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>>
-			( system->d_my_atoms, system->reax_param.d_gp, 
-			  (control_params *)control->d_control_params, 
-			  *(dev_workspace), *(*dev_lists + FAR_NBRS), 
-			  d_LR, system->n, system->N,
-			  system->reax_param.num_atom_types, 
-			  data->step, data->prev_steps, 
-			  out_control->energy_update_freq,
-			  spad, spad + 2 * system->N, 
-			  (rvec *)(spad + 4 * system->N));
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-	}
-
-	//reduction for  vdw
-	k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
-		( spad, spad + system->N, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>>
-		( spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, rblocks); 
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//reduction for  ele
-	k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
-		( spad + 2 * system->N, spad + 3 * system->N, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>>
-		( spad + 3 * system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ele, rblocks);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//reduction for ext_press
-	spad_rvec = (rvec *) (spad + 4 * system->N);
-	k_reduction_rvec <<< rblocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>>
-		( spad_rvec, spad_rvec + system->N, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
-		( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, rblocks);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	Cuda_Compute_Polarization_Energy( system, data );
+    int blocks;
+    int rblocks;
+    int size = (2 * system->N + 2 * system->N ) * sizeof (real) + 
+        2 * system->N * sizeof (rvec);
+
+    rvec *spad_rvec;
+    real *spad = (real *) scratch;
+    cuda_memset (spad, 0, size, "pol_energy");
+
+    rblocks = system->N / DEF_BLOCK_SIZE + ((system->N % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+    blocks = ((system->N * VDW_KER_THREADS_PER_ATOM) / DEF_BLOCK_SIZE) 
+        + (((system->N * VDW_KER_THREADS_PER_ATOM) % DEF_BLOCK_SIZE == 0) ? 0 : 1);
+
+    if (!isTabulated) {
+        ker_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE, DEF_BLOCK_SIZE * (2 * sizeof(real) + sizeof(rvec)) >>>
+            ( system->d_my_atoms, system->reax_param.d_tbp, 
+              system->reax_param.d_gp, (control_params *)control->d_control_params, 
+              *(dev_workspace), *(*dev_lists + FAR_NBRS), 
+              system->n, system->N, system->reax_param.num_atom_types, 
+              spad, spad + 2 * system->N, (rvec *)(spad + 4 * system->N));
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    } else {
+        ker_tabulated_vdW_coulomb_energy <<< blocks, DEF_BLOCK_SIZE >>>
+            ( system->d_my_atoms, system->reax_param.d_gp, 
+              (control_params *)control->d_control_params, 
+              *(dev_workspace), *(*dev_lists + FAR_NBRS), 
+              d_LR, system->n, system->N,
+              system->reax_param.num_atom_types, 
+              data->step, data->prev_steps, 
+              out_control->energy_update_freq,
+              spad, spad + 2 * system->N, 
+              (rvec *)(spad + 4 * system->N));
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+
+    //reduction for  vdw
+    k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
+        ( spad, spad + system->N, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>>
+        ( spad + system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_vdW, rblocks); 
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //reduction for  ele
+    k_reduction <<< rblocks, DEF_BLOCK_SIZE, sizeof (real) * DEF_BLOCK_SIZE >>>
+        ( spad + 2 * system->N, spad + 3 * system->N, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<< 1, BLOCKS_POW_2_N, sizeof (real) * BLOCKS_POW_2_N>>>
+        ( spad + 3 * system->N, &((simulation_data *)data->d_simulation_data)->my_en.e_ele, rblocks);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //reduction for ext_press
+    spad_rvec = (rvec *) (spad + 4 * system->N);
+    k_reduction_rvec <<< rblocks, DEF_BLOCK_SIZE, sizeof (rvec) * DEF_BLOCK_SIZE >>>
+        ( spad_rvec, spad_rvec + system->N, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction_rvec <<< 1, BLOCKS_POW_2_N, sizeof (rvec) * BLOCKS_POW_2_N>>>
+        ( spad_rvec + system->N, &((simulation_data *)data->d_simulation_data)->my_ext_press, rblocks);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    Cuda_Compute_Polarization_Energy( system, data );
 }
diff --git a/PG-PuReMD/src/cuda_post_evolve.cu b/PG-PuReMD/src/cuda_post_evolve.cu
index ebcb22fa..b8008e85 100644
--- a/PG-PuReMD/src/cuda_post_evolve.cu
+++ b/PG-PuReMD/src/cuda_post_evolve.cu
@@ -5,31 +5,31 @@
 #include "cuda_utils.h"
 
 CUDA_GLOBAL void ker_post_evolve (reax_atom *my_atoms, 
-		simulation_data *data, int n)
+        simulation_data *data, int n)
 {
-	rvec diff, cross;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
+    rvec diff, cross;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
 
-	//for( i = 0; i < system->n; i++ ) { 
-	/* remove translational vel */
-	rvec_ScaledAdd( my_atoms[i].v, -1., data->vcm );
+    //for( i = 0; i < system->n; i++ ) { 
+    /* remove translational vel */
+    rvec_ScaledAdd( my_atoms[i].v, -1., data->vcm );
 
-	/* remove rotational */
-	rvec_ScaledSum( diff, 1., my_atoms[i].x, -1., data->xcm );
-	rvec_Cross( cross, data->avcm, diff );
-	rvec_ScaledAdd( my_atoms[i].v, -1., cross );
-	//}  
+    /* remove rotational */
+    rvec_ScaledSum( diff, 1., my_atoms[i].x, -1., data->xcm );
+    rvec_Cross( cross, data->avcm, diff );
+    rvec_ScaledAdd( my_atoms[i].v, -1., cross );
+    //}  
 }
 
 void post_evolve_velocities (reax_system *system, simulation_data *data)
 {
-	int blocks;
+    int blocks;
 
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		((system->n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
-	ker_post_evolve <<< blocks, DEF_BLOCK_SIZE >>>
-		(system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        ((system->n % DEF_BLOCK_SIZE) == 0 ? 0 : 1);
+    ker_post_evolve <<< blocks, DEF_BLOCK_SIZE >>>
+        (system->d_my_atoms, (simulation_data *)data->d_simulation_data, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
diff --git a/PG-PuReMD/src/cuda_qEq.cu b/PG-PuReMD/src/cuda_qEq.cu
index 271a190e..b2094583 100644
--- a/PG-PuReMD/src/cuda_qEq.cu
+++ b/PG-PuReMD/src/cuda_qEq.cu
@@ -27,95 +27,95 @@
 
 #include "validation.h"
 
-CUDA_GLOBAL void ker_init_matvec( 	reax_atom *my_atoms, 
-		single_body_parameters *sbp, 
-		storage p_workspace, int n  )
+CUDA_GLOBAL void ker_init_matvec( reax_atom *my_atoms, 
+        single_body_parameters *sbp, 
+        storage p_workspace, int n  )
 {
-	storage *workspace = &( p_workspace );
-	reax_atom *atom;
+    storage *workspace = &( p_workspace );
+    reax_atom *atom;
 
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
 
-	//for( i = 0; i < system->n; ++i ) {
-	atom = &( my_atoms[i] );
+    //for( i = 0; i < system->n; ++i ) {
+    atom = &( my_atoms[i] );
 
-	/* init pre-conditioner for H and init solution vectors */
-	workspace->Hdia_inv[i] = 1. / sbp[ atom->type ].eta;
-	workspace->b_s[i] = -sbp[ atom->type ].chi;
-	workspace->b_t[i] = -1.0;
-	workspace->b[i][0] = -sbp[ atom->type ].chi;
-	workspace->b[i][1] = -1.0;
+    /* init pre-conditioner for H and init solution vectors */
+    workspace->Hdia_inv[i] = 1. / sbp[ atom->type ].eta;
+    workspace->b_s[i] = -sbp[ atom->type ].chi;
+    workspace->b_t[i] = -1.0;
+    workspace->b[i][0] = -sbp[ atom->type ].chi;
+    workspace->b[i][1] = -1.0;
 
-	workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
+    workspace->x[i][1] = atom->t[2] + 3 * ( atom->t[0] - atom->t[1] );
 
-	/* cubic extrapolation for s and t */
-	workspace->x[i][0] = 4*(atom->s[0]+atom->s[2])-(6*atom->s[1]+atom->s[3]);
-	//}
+    /* cubic extrapolation for s and t */
+    workspace->x[i][0] = 4*(atom->s[0]+atom->s[2])-(6*atom->s[1]+atom->s[3]);
+    //}
 }
 
 void Cuda_Init_MatVec ( reax_system *system, storage *workspace )
 {
-	int blocks;
+    int blocks;
 
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		(( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
 
-	ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>>
-		( system->d_my_atoms, system->reax_param.d_sbp, 
-		  *dev_workspace, system->n );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    ker_init_matvec <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, system->reax_param.d_sbp, 
+          *dev_workspace, system->n );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void cuda_charges_x (reax_system *system, rvec2 my_sum)
 {
-	int blocks;
-	rvec2 *output = (rvec2 *) scratch;
-	cuda_memset (output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q");
+    int blocks;
+    rvec2 *output = (rvec2 *) scratch;
+    cuda_memset (output, 0, sizeof (rvec2) * 2 * system->n, "cuda_charges_x:q");
 
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		(( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
 
-	k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
-		( dev_workspace->x, output, system->n );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    k_reduction_rvec2 <<< blocks, DEF_BLOCK_SIZE, sizeof (rvec2) * DEF_BLOCK_SIZE >>>
+        ( dev_workspace->x, output, system->n );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
-		( output, output + system->n, blocks );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    k_reduction_rvec2 <<< 1, BLOCKS_POW_2, sizeof (rvec2) * BLOCKS_POW_2 >>>
+        ( output, output + system->n, blocks );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device (my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x");
+    copy_host_device (my_sum, output + system->n, sizeof (rvec2), cudaMemcpyDeviceToHost, "charges:x");
 }
 
 CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace, 
-		real u, real *q, int n)
+        real u, real *q, int n)
 {
-	storage *workspace = &( p_workspace );
-	reax_atom *atom;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n) return;
-
-	//for( i = 0; i < system->n; ++i ) {
-	atom = &( my_atoms[i] );
-
-	//atom->q = workspace->s[i] - u * workspace->t[i];
-	q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
-
-	atom->s[3] = atom->s[2];
-	atom->s[2] = atom->s[1];
-	atom->s[1] = atom->s[0];
-	//atom->s[0] = workspace->s[i];
-	atom->s[0] = workspace->x[i][0];
-
-	atom->t[3] = atom->t[2];
-	atom->t[2] = atom->t[1];
-	atom->t[1] = atom->t[0];
-	//atom->t[0] = workspace->t[i];
-	atom->t[0] = workspace->x[i][1];
-	//}
+    storage *workspace = &( p_workspace );
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    //for( i = 0; i < system->n; ++i ) {
+    atom = &( my_atoms[i] );
+
+    //atom->q = workspace->s[i] - u * workspace->t[i];
+    q[i] = atom->q = workspace->x[i][0] - u * workspace->x[i][1];
+
+    atom->s[3] = atom->s[2];
+    atom->s[2] = atom->s[1];
+    atom->s[1] = atom->s[0];
+    //atom->s[0] = workspace->s[i];
+    atom->s[0] = workspace->x[i][0];
+
+    atom->t[3] = atom->t[2];
+    atom->t[2] = atom->t[1];
+    atom->t[1] = atom->t[0];
+    //atom->t[0] = workspace->t[i];
+    atom->t[0] = workspace->x[i][1];
+    //}
 }
 
 //TODO if we use the function argument (output), we are getting 
@@ -128,22 +128,22 @@ CUDA_GLOBAL void ker_calculate_st (reax_atom *my_atoms, storage p_workspace,
 
 extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *output, real u)
 {
-	int blocks;
-	real *tmp = (real *) scratch;
-	real *tmp_output = (real *) host_scratch;
-
-	cuda_memset (tmp, 0, sizeof (real) * system->n, "charges:q");
-	memset (tmp_output, 0, sizeof (real) * system->n);
-
-	blocks = system->n / DEF_BLOCK_SIZE + 
-		(( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-	ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>>
-		( system->d_my_atoms, *dev_workspace, u, tmp, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device (output, tmp, sizeof (real) * system->n, 
-			cudaMemcpyDeviceToHost, "charges:q");
+    int blocks;
+    real *tmp = (real *) scratch;
+    real *tmp_output = (real *) host_scratch;
+
+    cuda_memset (tmp, 0, sizeof (real) * system->n, "charges:q");
+    memset (tmp_output, 0, sizeof (real) * system->n);
+
+    blocks = system->n / DEF_BLOCK_SIZE + 
+        (( system->n % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+    ker_calculate_st <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, *dev_workspace, u, tmp, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (output, tmp, sizeof (real) * system->n, 
+            cudaMemcpyDeviceToHost, "charges:q");
 }
 //TODO
 //TODO
@@ -155,23 +155,23 @@ extern "C" void cuda_charges_st (reax_system *system, storage *workspace, real *
 
 CUDA_GLOBAL void ker_update_q (reax_atom *my_atoms, real *q, int n, int N)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= (N-n)) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= (N-n)) return;
 
-	//for( i = system->n; i < system->N; ++i )
-	my_atoms[i + n].q = q[i + n];
+    //for( i = system->n; i < system->N; ++i )
+    my_atoms[i + n].q = q[i + n];
 }
 
 void cuda_charges_updateq (reax_system *system, real *q) 
 {
-	int blocks;
-	real *dev_q = (real *) scratch;
-	copy_host_device (q, dev_q, system->N * sizeof (real), 
-			cudaMemcpyHostToDevice, "charges:q");
-	blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
-		(( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-	ker_update_q <<< blocks, DEF_BLOCK_SIZE >>>
-		( system->d_my_atoms, dev_q, system->n, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int blocks;
+    real *dev_q = (real *) scratch;
+    copy_host_device (q, dev_q, system->N * sizeof (real), 
+            cudaMemcpyHostToDevice, "charges:q");
+    blocks = (system->N - system->n) / DEF_BLOCK_SIZE + 
+        (( (system->N - system->n) % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+    ker_update_q <<< blocks, DEF_BLOCK_SIZE >>>
+        ( system->d_my_atoms, dev_q, system->n, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
diff --git a/PG-PuReMD/src/cuda_reset_tools.cu b/PG-PuReMD/src/cuda_reset_tools.cu
index 084da6b0..850a7c5d 100644
--- a/PG-PuReMD/src/cuda_reset_tools.cu
+++ b/PG-PuReMD/src/cuda_reset_tools.cu
@@ -4,159 +4,159 @@
 #include "dev_list.h"
 
 CUDA_GLOBAL void ker_reset_hbond_list (reax_atom *my_atoms, 
-		reax_list hbonds, 
-		int N)
+        reax_list hbonds, 
+        int N)
 {
-	int Hindex = 0;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	Hindex = my_atoms[i].Hindex;
-	if (Hindex > 1) {
-		Dev_Set_End_Index ( Hindex, Dev_Start_Index (Hindex, &hbonds), &hbonds);
-	}
+    int Hindex = 0; // slot of this atom in the hbonds list, read from my_atoms below
+    int i = blockIdx.x * blockDim.x + threadIdx.x; // flat global thread index, one atom per thread
+    if (i >= N) return; // surplus threads in the last block have no atom
+
+    Hindex = my_atoms[i].Hindex;
+    if (Hindex > -1) { // slots start at 0 (ker_reset_hindex assigns Hindex = i); the old "> 1" test skipped slots 0 and 1
+        Dev_Set_End_Index ( Hindex, Dev_Start_Index (Hindex, &hbonds), &hbonds); // end index := start index
+    }
 }
 
 CUDA_GLOBAL void ker_reset_bond_list (reax_atom *my_atoms, 
-		reax_list bonds, 
-		int N)
+        reax_list bonds, 
+        int N)
 {
-	int Hindex = 0;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    // one thread per atom; the bond list is indexed directly by i (my_atoms is not read here)
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return; // surplus threads in the last block have no atom
 
-	Dev_Set_End_Index ( i, Dev_Start_Index (i, &bonds), &bonds);
+    Dev_Set_End_Index ( i, Dev_Start_Index (i, &bonds), &bonds); // end index := start index (presumably marks the list empty)
 }
 
 extern "C"
 {
 
-	void Cuda_Reset_Workspace (reax_system *system, storage *workspace)
-	{
-		cuda_memset ( dev_workspace->total_bond_order, 0, system->total_cap * sizeof (real), "total_bond_order");
-		cuda_memset ( dev_workspace->dDeltap_self, 0, system->total_cap * sizeof (rvec), "dDeltap_self");
-		cuda_memset ( dev_workspace->CdDelta, 0, system->total_cap * sizeof (real), "CdDelta");
-		cuda_memset ( dev_workspace->f, 0, system->total_cap * sizeof (rvec), "f");
-	}
-
-	CUDA_GLOBAL void ker_reset_hindex (reax_atom *my_atoms, int N)
-	{
-		int Hindex = 0;
-		int i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		my_atoms[i].Hindex = i;
-	}
-
-	void Cuda_Reset_Atoms( reax_system* system, control_params *control )
-	{
-		int i;
-		reax_atom *atom;
-		int blocks;
-
-		/*
-		   if( control->hbond_cut > 0 ) 
-		//TODO
-		for( i = 0; i < system->N; ++i ) { 
-		atom = &(system->my_atoms[i]);
-		//if( system->reax_param.sbp[ atom->type ].p_hbond == 1 ) 
-		atom->Hindex = system->numH++;
-		//else atom->Hindex = -1; 
-		}   
-		//TODO
-		 */
-		////////////////////////////////
-		////////////////////////////////
-		////////////////////////////////
-		////////////////////////////////
-		// FIX - 3 - Commented out this line for Hydrogen Bond fix
-		// FIX - HBOND ISSUE
-		// FIX - HBOND ISSUE
-		// FIX - HBOND ISSUE
-		// COMMENTED OUT THIS LINE BELOW
-		//system->numH = system->N;
-		// FIX - HBOND ISSUE
-		// FIX - HBOND ISSUE
-		// FIX - HBOND ISSUE
-		////////////////////////////////
-		////////////////////////////////
-		////////////////////////////////
-		////////////////////////////////
-		////////////////////////////////
-
-
-		blocks = system->N / DEF_BLOCK_SIZE + 
-			((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-		ker_reset_hindex <<<blocks, DEF_BLOCK_SIZE>>>
-			(system->d_my_atoms, system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-	}
-
-	int Cuda_Reset_Neighbor_Lists( reax_system *system, control_params *control,
-			storage *workspace, reax_list **lists )
-	{
-		int i, total_bonds, Hindex, total_hbonds;
-		reax_list *bonds, *hbonds;
-		int blocks;
-
-		if (system->N > 0) {
-			bonds = *dev_lists + BONDS;
-			total_bonds = 0;
-
-			//cuda_memset (bonds->index, 0, sizeof (int) * system->total_cap, "bonds:index");
-			//cuda_memset (bonds->end_index, 0, sizeof (int) * system->total_cap, "bonds:end_index");
-			blocks = system->N / DEF_BLOCK_SIZE + 
-				((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-			ker_reset_bond_list <<<blocks, DEF_BLOCK_SIZE>>>
-				(system->d_my_atoms, *(*dev_lists + BONDS), system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			total_bonds = 0;// TODO compute the total bonds here.
-
-			/* is reallocation needed? */
-			if( total_bonds >= bonds->num_intrs * DANGER_ZONE ) { 
-				workspace->realloc.bonds = 1;
-				if( total_bonds >= bonds->num_intrs ) { 
-					fprintf(stderr, "p%d: not enough space for bonds! total=%d allocated=%d\n", 
-							system->my_rank, total_bonds, bonds->num_intrs );
-					return FAILURE;
-				}   
-			}   
-		}
-
-		//HBonds processing
-		//FIX - 4 - Added additional check
-		if( (control->hbond_cut > 0) && (system->numH > 0)) { 
-			hbonds = (*dev_lists) + HBONDS;
-			total_hbonds = 0;
-
-			/* reset start-end indexes */
-			//TODO
-			blocks = system->N / DEF_BLOCK_SIZE + 
-				((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
-			ker_reset_hbond_list <<<blocks, DEF_BLOCK_SIZE>>>
-				(system->d_my_atoms, *(*dev_lists + HBONDS), system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			//TODO compute the total hbonds here
-			total_hbonds = 0;
-
-			/* is reallocation needed? */
-			if( total_hbonds >= hbonds->num_intrs * 0.90/*DANGER_ZONE*/ ) { 
-				workspace->realloc.hbonds = 1;
-				if( total_hbonds >= hbonds->num_intrs ) {
-					fprintf(stderr, "p%d: not enough space for hbonds! total=%d allocated=%d\n",
-							system->my_rank, total_hbonds, hbonds->num_intrs );
-					return FAILURE;
-				}
-			}
-		}
-
-		return SUCCESS;
-	}
+    void Cuda_Reset_Workspace (reax_system *system, storage *workspace) // zero four arrays of dev_workspace
+    { // the workspace argument is not referenced; each array is cleared for system->total_cap entries
+        cuda_memset ( dev_workspace->total_bond_order, 0, system->total_cap * sizeof (real), "total_bond_order");
+        cuda_memset ( dev_workspace->dDeltap_self, 0, system->total_cap * sizeof (rvec), "dDeltap_self");
+        cuda_memset ( dev_workspace->CdDelta, 0, system->total_cap * sizeof (real), "CdDelta");
+        cuda_memset ( dev_workspace->f, 0, system->total_cap * sizeof (rvec), "f");
+    }
+
+    CUDA_GLOBAL void ker_reset_hindex (reax_atom *my_atoms, int N)
+    {
+        // store each atom's own index in its Hindex field, one thread per atom
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i >= N) return; // surplus threads in the last block have no atom
+
+        my_atoms[i].Hindex = i;
+    }
+
+    /* Cuda_Reset_Atoms: reassign the Hindex field of all system->N device atoms.
+     *
+     * Launches ker_reset_hindex over system->d_my_atoms, which stores each
+     * atom's own index in its Hindex field, then waits for the kernel and
+     * checks for CUDA errors.
+     *
+     * system:  supplies N (atom count) and d_my_atoms (device atom array).
+     * control: not referenced by the active code (only by the disabled
+     *          host loop kept below for reference).
+     */
+    void Cuda_Reset_Atoms( reax_system* system, control_params *control )
+    {
+        int blocks;
+
+        /* Disabled host-side version, kept for reference. It needs the locals
+           "int i; reax_atom *atom;" if it is ever re-enabled.
+
+           if( control->hbond_cut > 0 ) 
+        //TODO
+        for( i = 0; i < system->N; ++i ) { 
+        atom = &(system->my_atoms[i]);
+        //if( system->reax_param.sbp[ atom->type ].p_hbond == 1 ) 
+        atom->Hindex = system->numH++;
+        //else atom->Hindex = -1; 
+        }   
+        //TODO
+         */
+
+        // FIX - 3 - HBOND ISSUE: the assignment below was commented out as
+        // part of the hydrogen bond fix; system->numH is left untouched here.
+        //system->numH = system->N;
+
+        /* one thread per atom: blocks = ceil(N / DEF_BLOCK_SIZE) */
+        blocks = system->N / DEF_BLOCK_SIZE + 
+            ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1);
+
+        /* a zero-block launch is an invalid configuration, so skip it when N == 0 */
+        if (blocks > 0) {
+            ker_reset_hindex <<<blocks, DEF_BLOCK_SIZE>>>
+                (system->d_my_atoms, system->N);
+        }
+        /* cudaThreadSynchronize is deprecated; cudaDeviceSynchronize replaces it */
+        cudaDeviceSynchronize ();
+        cudaCheckError ();
+    }
+
+    /* Reset every atom's device bond list, and its hbond list when control->hbond_cut > 0 and system->numH > 0.
+     * Returns FAILURE if a list's total (currently always 0, see the TODOs) reaches its num_intrs, else SUCCESS. */
+    int Cuda_Reset_Neighbor_Lists( reax_system *system, control_params *control,
+            storage *workspace, reax_list **lists ) /* lists is not referenced; dev_lists is read instead */
+    {
+        int total_bonds, total_hbonds;
+        reax_list *bonds, *hbonds;
+        int blocks;
+
+        if (system->N > 0) {
+            bonds = *dev_lists + BONDS;
+
+            //cuda_memset (bonds->index, 0, sizeof (int) * system->total_cap, "bonds:index");
+            //cuda_memset (bonds->end_index, 0, sizeof (int) * system->total_cap, "bonds:end_index");
+            blocks = system->N / DEF_BLOCK_SIZE + 
+                ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); /* ceil(N / DEF_BLOCK_SIZE), one thread per atom */
+            ker_reset_bond_list <<<blocks, DEF_BLOCK_SIZE>>>
+                (system->d_my_atoms, *(*dev_lists + BONDS), system->N);
+            cudaDeviceSynchronize (); /* replaces the deprecated cudaThreadSynchronize */
+            cudaCheckError ();
+
+            total_bonds = 0;// TODO compute the total bonds here.
+
+            /* is reallocation needed? flag it in the danger zone, fail once the list is full */
+            if( total_bonds >= bonds->num_intrs * DANGER_ZONE ) { 
+                workspace->realloc.bonds = 1;
+                if( total_bonds >= bonds->num_intrs ) { 
+                    fprintf(stderr, "p%d: not enough space for bonds! total=%d allocated=%d\n", 
+                            system->my_rank, total_bonds, bonds->num_intrs );
+                    return FAILURE;
+                }   
+            }   
+        }
+
+        //HBonds processing
+        //FIX - 4 - Added additional check
+        if( (control->hbond_cut > 0) && (system->numH > 0)) { 
+            hbonds = (*dev_lists) + HBONDS;
+
+            /* reset start-end indexes */
+            //TODO
+            blocks = system->N / DEF_BLOCK_SIZE + 
+                ((system->N % DEF_BLOCK_SIZE == 0 ) ? 0 : 1); /* ceil(N / DEF_BLOCK_SIZE), one thread per atom */
+            ker_reset_hbond_list <<<blocks, DEF_BLOCK_SIZE>>>
+                (system->d_my_atoms, *(*dev_lists + HBONDS), system->N);
+            cudaDeviceSynchronize (); /* replaces the deprecated cudaThreadSynchronize */
+            cudaCheckError ();
+
+            //TODO compute the total hbonds here
+            total_hbonds = 0;
+
+            /* is reallocation needed? same scheme as for bonds, with a literal 0.90 threshold */
+            if( total_hbonds >= hbonds->num_intrs * 0.90/*DANGER_ZONE*/ ) { 
+                workspace->realloc.hbonds = 1;
+                if( total_hbonds >= hbonds->num_intrs ) {
+                    fprintf(stderr, "p%d: not enough space for hbonds! total=%d allocated=%d\n",
+                            system->my_rank, total_hbonds, hbonds->num_intrs );
+                    return FAILURE;
+                }
+            }
+        }
+
+        return SUCCESS;
+    }
 
 }
diff --git a/PG-PuReMD/src/cuda_torsion_angles.cu b/PG-PuReMD/src/cuda_torsion_angles.cu
index 42ffe859..e9a9b1f0 100644
--- a/PG-PuReMD/src/cuda_torsion_angles.cu
+++ b/PG-PuReMD/src/cuda_torsion_angles.cu
@@ -29,609 +29,609 @@
 #define MIN_SINE 1e-10
 
 CUDA_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij,
-		rvec dvec_jk, real r_jk,
-		rvec dvec_kl, real r_kl,
-		rvec dvec_li, real r_li,
-		three_body_interaction_data *p_ijk, 
-		three_body_interaction_data *p_jkl, 
-		rvec dcos_omega_di, rvec dcos_omega_dj, 
-		rvec dcos_omega_dk, rvec dcos_omega_dl, 
-		output_controls *out_control )
+        rvec dvec_jk, real r_jk,
+        rvec dvec_kl, real r_kl,
+        rvec dvec_li, real r_li,
+        three_body_interaction_data *p_ijk, 
+        three_body_interaction_data *p_jkl, 
+        rvec dcos_omega_di, rvec dcos_omega_dj, 
+        rvec dcos_omega_dk, rvec dcos_omega_dl, 
+        output_controls *out_control )
 {
-	real unnorm_cos_omega, unnorm_sin_omega, omega;
-	real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
-	real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe;
-	real arg, poem, tel;
-	rvec cross_jk_kl;
-
-	sin_ijk = SIN( p_ijk->theta );
-	cos_ijk = COS( p_ijk->theta );
-	sin_jkl = SIN( p_jkl->theta );
-	cos_jkl = COS( p_jkl->theta );
-
-	/* omega */
-	unnorm_cos_omega = -rvec_Dot(dvec_ij, dvec_jk) * rvec_Dot(dvec_jk, dvec_kl) + 
-		SQR( r_jk ) *  rvec_Dot( dvec_ij, dvec_kl );
-
-	rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
-	unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
-
-	omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
-
-
-	/* derivatives */
-	/* coef for adjusments to cos_theta's */
-	/* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li;
-	   coshd = cos_ijk, coshe = cos_jkl;
-	   sinhd = sin_ijk, sinhe = sin_jkl; */
-	htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk );
-	htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl;
-	htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk );
-	hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl );
-	hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk );
-	hnra = r_kl * sin_ijk * sin_jkl;
-	hnrc = r_ij * sin_ijk * sin_jkl;
-	hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
-	hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
-
-
-	poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-	if( poem < 1e-20 ) poem = 1e-20;
-
-	tel  = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) - 
-		2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
-				r_jk * r_kl * cos_jkl );
-
-	arg  = tel / poem;
-	if( arg >  1.0 ) arg =  1.0;
-	if( arg < -1.0 ) arg = -1.0;
-
-
-	/* fprintf( out_control->etor, 
-	   "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
-	   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
-	   fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
-	   dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij );
-	   fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
-	   -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk );
-	   fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
-	   -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl );
-	   fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n",
-	   r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
-	   fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
-	   poem, tel, arg ); */
-	/* fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
-	   -p_ijk->dcos_dk[0]/sin_ijk, -p_ijk->dcos_dk[1]/sin_ijk, 
-	   -p_ijk->dcos_dk[2]/sin_ijk );
-	   fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
-	   -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl, 
-	   -p_jkl->dcos_dk[2]/sin_jkl );*/
-
-	if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE;
-	else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
-	if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE;
-	else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
-
-	// dcos_omega_di
-	rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
-	rvec_ScaledAdd( dcos_omega_di,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
-	rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di );
-
-	// dcos_omega_dj
-	rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
-			-htrb / r_jk, dvec_jk );
-	rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dj );
-	rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_di );
-	rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
-
-	// dcos_omega_dk
-	rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc)/r_kl, dvec_kl,  
-			htrb / r_jk, dvec_jk );
-	rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_di );
-	rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dj );
-	rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
-
-	// dcos_omega_dl
-	rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc)/r_kl, dvec_kl, 1., dvec_li );
-	rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dk );
-	rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
-
-	return omega;  
+    real unnorm_cos_omega, unnorm_sin_omega, omega; // omega is the return value; out_control is only used by the disabled debug prints
+    real sin_ijk, cos_ijk, sin_jkl, cos_jkl; // sine/cosine of p_ijk->theta and p_jkl->theta
+    real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe; // coefficients used in the derivative terms below
+    real arg, poem, tel; // arg = tel / poem, clamped to [-1, 1]
+    rvec cross_jk_kl;
+
+    sin_ijk = SIN( p_ijk->theta );
+    cos_ijk = COS( p_ijk->theta );
+    sin_jkl = SIN( p_jkl->theta );
+    cos_jkl = COS( p_jkl->theta );
+
+    /* omega: the returned angle, taken as atan2 of the two unnormalized terms below */
+    unnorm_cos_omega = -rvec_Dot(dvec_ij, dvec_jk) * rvec_Dot(dvec_jk, dvec_kl) + 
+        SQR( r_jk ) *  rvec_Dot( dvec_ij, dvec_kl );
+
+    rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
+    unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
+
+    omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
+
+
+    /* derivatives: fill the outputs dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl */
+    /* coef for adjustments to cos_theta's */
+    /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li;
+       coshd = cos_ijk, coshe = cos_jkl;
+       sinhd = sin_ijk, sinhe = sin_jkl; */
+    htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk );
+    htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl;
+    htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk );
+    hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl );
+    hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk );
+    hnra = r_kl * sin_ijk * sin_jkl;
+    hnrc = r_ij * sin_ijk * sin_jkl;
+    hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
+    hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
+
+
+    poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
+    if( poem < 1e-20 ) poem = 1e-20; // floor poem: it is the divisor in arg and in every 2.0 / poem scaling
+
+    tel  = SQR( r_ij ) + SQR( r_jk ) + SQR( r_kl ) - SQR( r_li ) - 
+        2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
+                r_jk * r_kl * cos_jkl );
+
+    arg  = tel / poem; // presumably cos(omega) expressed through the four distances -- TODO confirm
+    if( arg >  1.0 ) arg =  1.0; // clamp arg to [-1, 1]
+    if( arg < -1.0 ) arg = -1.0;
+
+
+    /* fprintf( out_control->etor, 
+       "%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f%12.6f\n",
+       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
+       fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
+       dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij );
+       fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
+       -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk );
+       fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
+       -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl );
+       fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n",
+       r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
+       fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
+       poem, tel, arg ); */
+    /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
+       -p_ijk->dcos_dk[0]/sin_ijk, -p_ijk->dcos_dk[1]/sin_ijk, 
+       -p_ijk->dcos_dk[2]/sin_ijk );
+       fprintf( out_control->etor, "%12.6f%12.6f%12.6f\n",
+       -p_jkl->dcos_dk[0]/sin_jkl, -p_jkl->dcos_dk[1]/sin_jkl, 
+       -p_jkl->dcos_dk[2]/sin_jkl );*/
+
+    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; // keep |sin_ijk| >= MIN_SINE: it is a divisor below
+    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
+    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; // same for sin_jkl; the h* terms and poem above used the unclamped sines
+    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
+
+    // dcos_omega_di
+    rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_di,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
+    rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di ); // in-place scaling by the common 2.0 / poem factor
+
+    // dcos_omega_dj
+    rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
+            -htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_dj );
+    rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_di );
+    rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
+
+    // dcos_omega_dk
+    rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc)/r_kl, dvec_kl,  
+            htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd)/sin_ijk, p_ijk->dcos_di );
+    rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dj );
+    rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
+
+    // dcos_omega_dl
+    rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc)/r_kl, dvec_kl, 1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe)/sin_jkl, p_jkl->dcos_dk );
+    rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
+
+    return omega;  
 }
 
 
 
 CUDA_GLOBAL void Cuda_Torsion_Angles( reax_atom *my_atoms, 
-		global_parameters gp, 
-		four_body_header *d_fbp, 
-		control_params *control, 
-		reax_list p_bonds, reax_list p_thb_intrs, 
-		storage p_workspace, 
-		int n, int num_atom_types, 
-		real *data_e_tor, real *data_e_con, 
-		rvec *data_ext_press )
+        global_parameters gp, 
+        four_body_header *d_fbp, 
+        control_params *control, 
+        reax_list p_bonds, reax_list p_thb_intrs, 
+        storage p_workspace, 
+        int n, int num_atom_types, 
+        real *data_e_tor, real *data_e_con, 
+        rvec *data_ext_press )
 {
-	int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms;
-	int type_i, type_j, type_k, type_l;
-	int start_j, end_j, start_k, end_k;
-	int start_pj, end_pj, start_pk, end_pk;
-	int num_frb_intrs = 0;
-
-	real Delta_j, Delta_k;
-	real r_ij, r_jk, r_kl, r_li;
-	real BOA_ij, BOA_jk, BOA_kl;
-
-	real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
-	real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
-	real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
-	real fn10, f11_DjDk, dfn11, fn12;
-	real theta_ijk, theta_jkl;
-	real sin_ijk, sin_jkl;
-	real cos_ijk, cos_jkl;
-	real tan_ijk_i, tan_jkl_i;
-	real omega, cos_omega, cos2omega, cos3omega;
-	rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
-	real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
-	real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
-	real Cconj, CEconj1, CEconj2, CEconj3;
-	real CEconj4, CEconj5, CEconj6;
-	real e_tor, e_con;
-	rvec dvec_li;
-	rvec force, ext_press;
-	ivec rel_box_jl;
-	// rtensor total_rtensor, temp_rtensor;
-	four_body_header *fbh;
-	four_body_parameters *fbp;
-	bond_data *pbond_ij, *pbond_jk, *pbond_kl;
-	bond_order_data *bo_ij, *bo_jk, *bo_kl;
-	three_body_interaction_data *p_ijk, *p_jkl;
-
-	reax_list *bonds = &( p_bonds );
-	reax_list *thb_intrs = &( p_thb_intrs );
-	storage *workspace = &( p_workspace );
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= n) return;
-
-	real p_tor2 = gp.l[23];
-	real p_tor3 = gp.l[24];
-	real p_tor4 = gp.l[25];
-	real p_cot2 = gp.l[27];
-	// char  fname[100];
-	// FILE *ftor;
-
-	// sprintf( fname, "tor%d.out", system->my_rank );
-	// ftor = fopen( fname, "w" );
-
-	//natoms = system->n;
-
-	//for( j = 0; j < natoms; ++j ) {
-	type_j = my_atoms[j].type;
-	Delta_j = workspace->Delta_boc[j];
-	start_j = Dev_Start_Index(j, bonds);
-	end_j = Dev_End_Index(j, bonds);
-
-	for( pk = start_j; pk < end_j; ++pk ) {
-		pbond_jk = &( bonds->select.bond_list[pk] );
-		k = pbond_jk->nbr;
-		bo_jk = &( pbond_jk->bo_data );
-		BOA_jk = bo_jk->BO - control->thb_cut;
-
-		/* see if there are any 3-body interactions involving j&k
-		   where j is the central atom. Otherwise there is no point in
-		   trying to form a 4-body interaction out of this neighborhood */
-		if( my_atoms[j].orig_id < my_atoms[k].orig_id && 
-				bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) ) {
-			start_k = Dev_Start_Index(k, bonds);
-			end_k = Dev_End_Index(k, bonds);	    	       
-			pj = pbond_jk->sym_index; // pj points to j on k's list
-
-			/* do the same check as above: 
-			   are there any 3-body interactions involving k&j 
-			   where k is the central atom */
-			if( Dev_Num_Entries(pj, thb_intrs) ) {
-				type_k = my_atoms[k].type;
-				Delta_k = workspace->Delta_boc[k];
-				r_jk = pbond_jk->d;
-
-				start_pk = Dev_Start_Index(pk, thb_intrs );
-				end_pk = Dev_End_Index(pk, thb_intrs );
-				start_pj = Dev_Start_Index(pj, thb_intrs );
-				end_pj = Dev_End_Index(pj, thb_intrs );		
-
-				exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
-				exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
-				exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
-				exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
-				exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
-				f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
-
-
-				/* pick i up from j-k interaction where j is the central atom */
-				for( pi = start_pk; pi < end_pk; ++pi ) {
-					p_ijk = &( thb_intrs->select.three_body_list[pi] );
-					pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
-					pbond_ij = &( bonds->select.bond_list[pij] );
-					bo_ij = &( pbond_ij->bo_data );
-
-
-					if( bo_ij->BO > control->thb_cut/*0*/ ) {
-						i = p_ijk->thb;
-						type_i = my_atoms[i].type;
-						r_ij = pbond_ij->d;
-						BOA_ij = bo_ij->BO - control->thb_cut;
-
-						theta_ijk = p_ijk->theta;
-						sin_ijk = SIN( theta_ijk );
-						cos_ijk = COS( theta_ijk );
-						//tan_ijk_i = 1. / TAN( theta_ijk );
-						if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
-							tan_ijk_i = cos_ijk / MIN_SINE;
-						else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
-							tan_ijk_i = cos_ijk / -MIN_SINE;
-						else tan_ijk_i = cos_ijk / sin_ijk;
-
-						exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-						exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
-
-
-						/* pick l up from j-k interaction where k is the central atom */
-						for( pl = start_pj; pl < end_pj; ++pl ) {
-							p_jkl = &( thb_intrs->select.three_body_list[pl] );
-							l = p_jkl->thb;
-							plk = p_jkl->pthb; //pointer to l on k's bond_list!
-							pbond_kl = &( bonds->select.bond_list[plk] );
-							bo_kl = &( pbond_kl->bo_data );
-							type_l = my_atoms[l].type;
-							fbh = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)]);
-							fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
-
-
-							if( i != l && fbh->cnt && 
-									bo_kl->BO > control->thb_cut/*0*/ &&
-									bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
-								++num_frb_intrs;
-								r_kl = pbond_kl->d;
-								BOA_kl = bo_kl->BO - control->thb_cut;
-
-								theta_jkl = p_jkl->theta;
-								sin_jkl = SIN( theta_jkl );
-								cos_jkl = COS( theta_jkl );
-								//tan_jkl_i = 1. / TAN( theta_jkl );
-								if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
-									tan_jkl_i = cos_jkl / MIN_SINE;
-								else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
-									tan_jkl_i = cos_jkl / -MIN_SINE;
-								else tan_jkl_i = cos_jkl /sin_jkl;
-
-								rvec_ScaledSum( dvec_li, 1., my_atoms[i].x, 
-										-1., my_atoms[l].x );
-								r_li = rvec_Norm( dvec_li );				 
-
-
-								/* omega and its derivative */
-								omega = Calculate_Omega( pbond_ij->dvec, r_ij, 
-										pbond_jk->dvec, r_jk, 
-										pbond_kl->dvec, r_kl,
-										dvec_li, r_li,
-										p_ijk, p_jkl,
-										dcos_omega_di, dcos_omega_dj,
-										dcos_omega_dk, dcos_omega_dl,
-										NULL);
-
-								cos_omega = COS( omega );
-								cos2omega = COS( 2. * omega );
-								cos3omega = COS( 3. * omega );
-								/* end omega calculations */
-
-								/* torsion energy */
-								exp_tor1 = EXP( fbp->p_tor1 * 
-										SQR(2.0 - bo_jk->BO_pi - f11_DjDk) );
-								exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-								exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
-								fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
-									(1.0 - exp_tor2_kl);
-
-								CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
-										fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-										fbp->V3 * (1.0 + cos3omega) );
-
-								data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV;
-
-								dfn11 = (-p_tor3 * exp_tor3_DjDk +
-										(p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-										(2.0 + exp_tor3_DjDk) * exp_tor34_inv) * 
-									exp_tor34_inv;
-
-								CEtors1 = sin_ijk * sin_jkl * CV;
-
-								CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
-									(2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
-									sin_ijk * sin_jkl; 
-								CEtors3 = CEtors2 * dfn11;
-
-								CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
-									(1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
-								CEtors5 = CEtors1 * p_tor2 * 
-									(1.0 - exp_tor2_ij) * exp_tor2_jk * (1.0 - exp_tor2_kl);
-								CEtors6 = CEtors1 * p_tor2 * 
-									(1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * exp_tor2_kl;
-
-								cmn = -fn10 * CV;
-								CEtors7 = cmn * sin_jkl * tan_ijk_i;
-								CEtors8 = cmn * sin_ijk * tan_jkl_i;
-
-								CEtors9 = fn10 * sin_ijk * sin_jkl * 
-									(0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-									 1.5 * fbp->V3 * (cos2omega + 2.0 * SQR(cos_omega)));
-								/* end  of torsion energy */
-
-
-								/* 4-body conjugation energy */
-								fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-								data_e_con [j] += e_con =
-									fbp->p_cot1 * fn12 * 
-									(1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
-
-								Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
-									(1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
-
-								CEconj1 = Cconj * (BOA_ij - 1.5e0);
-								CEconj2 = Cconj * (BOA_jk - 1.5e0);
-								CEconj3 = Cconj * (BOA_kl - 1.5e0);
-
-								CEconj4 = -fbp->p_cot1 * fn12 * 
-									(SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-								CEconj5 = -fbp->p_cot1 * fn12 * 
-									(SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-								CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
-									cos_omega * sin_ijk * sin_jkl;
-								/* end 4-body conjugation energy */
-
-								/* forces */
-								/*
-								   bo_jk->Cdbopi += CEtors2;
-								   workspace->CdDelta[j] += CEtors3;
-								   workspace->CdDelta[k] += CEtors3;
-								   bo_ij->Cdbo += (CEtors4 + CEconj1);
-								   bo_jk->Cdbo += (CEtors5 + CEconj2);
-								   bo_kl->Cdbo += (CEtors6 + CEconj3);
-								 */
-								bo_jk->Cdbopi += CEtors2;
-								workspace->CdDelta[j] += CEtors3;
-								pbond_jk->ta_CdDelta += CEtors3;
-								bo_ij->Cdbo += (CEtors4 + CEconj1);
-								bo_jk->Cdbo += (CEtors5 + CEconj2);
-								atomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3));
-
-								if( control->virial == 0 ) {
-									/* dcos_theta_ijk */
-									//rvec_ScaledAdd( workspace->f[i], 
-									atomic_rvecScaledAdd( pbond_ij->ta_f, 
-											CEtors7 + CEconj4, p_ijk->dcos_dk );
-									rvec_ScaledAdd( workspace->f[j], 
-											CEtors7 + CEconj4, p_ijk->dcos_dj );
-									//rvec_ScaledAdd( workspace->f[k], 
-									atomic_rvecScaledAdd( pbond_jk->ta_f,
-											CEtors7 + CEconj4, p_ijk->dcos_di );
-
-									/* dcos_theta_jkl */
-									rvec_ScaledAdd( workspace->f[j], 
-											CEtors8 + CEconj5, p_jkl->dcos_di );
-									//rvec_ScaledAdd( workspace->f[k], 
-									atomic_rvecScaledAdd( pbond_jk->ta_f,
-											CEtors8 + CEconj5, p_jkl->dcos_dj );
-									//rvec_ScaledAdd( workspace->f[l], 
-									atomic_rvecScaledAdd( pbond_kl->ta_f, 
-											CEtors8 + CEconj5, p_jkl->dcos_dk );
-
-									/* dcos_omega */
-									//rvec_ScaledAdd( workspace->f[i], 
-									atomic_rvecScaledAdd( pbond_ij->ta_f,
-											CEtors9 + CEconj6, dcos_omega_di );
-									rvec_ScaledAdd( workspace->f[j], 
-											CEtors9 + CEconj6, dcos_omega_dj );
-									//rvec_ScaledAdd( workspace->f[k], 
-									atomic_rvecScaledAdd( pbond_jk->ta_f,
-											CEtors9 + CEconj6, dcos_omega_dk );
-									//rvec_ScaledAdd( workspace->f[l], 
-									atomic_rvecScaledAdd( pbond_kl->ta_f,
-											CEtors9 + CEconj6, dcos_omega_dl );
-								}
-								else {
-									ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
-
-									/* dcos_theta_ijk */
-									rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-									//rvec_Add( workspace->f[i], force );
-									atomic_rvecAdd( pbond_ij->ta_f, force );
-									rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-									rvec_Add( data_ext_press [j], ext_press );
-
-									rvec_ScaledAdd( workspace->f[j], 
-											CEtors7 + CEconj4, p_ijk->dcos_dj );
-
-									rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-									//rvec_Add( workspace->f[k], force );
-									atomic_rvecAdd( pbond_jk->ta_f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									rvec_Add( data_ext_press[j], ext_press );
-
-
-									/* dcos_theta_jkl */
-									rvec_ScaledAdd( workspace->f[j], 
-											CEtors8 + CEconj5, p_jkl->dcos_di );
-
-									rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-									//rvec_Add( workspace->f[k], force );
-									atomic_rvecAdd( pbond_jk->ta_f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									rvec_Add( data_ext_press [j], ext_press );
-
-									rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-									//rvec_Add( workspace->f[l], force );
-									rvec_Add( pbond_kl->ta_f, force );
-									rvec_iMultiply( ext_press, rel_box_jl, force );
-									rvec_Add( data_ext_press [j], ext_press );
-
-
-									/* dcos_omega */				      
-									rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-									//rvec_Add( workspace->f[i], force );
-									atomic_rvecAdd( pbond_ij->ta_f, force );
-									rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-									rvec_Add( data_ext_press [j], ext_press );
-
-									rvec_ScaledAdd( workspace->f[j], 
-											CEtors9 + CEconj6, dcos_omega_dj );
-
-									rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-									//rvec_Add( workspace->f[k], force );
-									rvec_Add( pbond_jk->ta_f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									rvec_Add( data_ext_press [j], ext_press );
-
-									rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-									//rvec_Add( workspace->f[l], force );
-									rvec_Add( pbond_kl->ta_f, force );
-									rvec_iMultiply( ext_press, rel_box_jl, force );
-									rvec_Add( data_ext_press [j], ext_press );
-								}
+    int i, j, k, l, pi, pj, pk, pl, pij, plk, natoms;
+    int type_i, type_j, type_k, type_l;
+    int start_j, end_j, start_k, end_k;
+    int start_pj, end_pj, start_pk, end_pk;
+    int num_frb_intrs = 0;
+
+    real Delta_j, Delta_k;
+    real r_ij, r_jk, r_kl, r_li;
+    real BOA_ij, BOA_jk, BOA_kl;
+
+    real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
+    real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
+    real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
+    real fn10, f11_DjDk, dfn11, fn12;
+    real theta_ijk, theta_jkl;
+    real sin_ijk, sin_jkl;
+    real cos_ijk, cos_jkl;
+    real tan_ijk_i, tan_jkl_i;
+    real omega, cos_omega, cos2omega, cos3omega;
+    rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
+    real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
+    real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
+    real Cconj, CEconj1, CEconj2, CEconj3;
+    real CEconj4, CEconj5, CEconj6;
+    real e_tor, e_con;
+    rvec dvec_li;
+    rvec force, ext_press;
+    ivec rel_box_jl;
+    // rtensor total_rtensor, temp_rtensor;
+    four_body_header *fbh;
+    four_body_parameters *fbp;
+    bond_data *pbond_ij, *pbond_jk, *pbond_kl;
+    bond_order_data *bo_ij, *bo_jk, *bo_kl;
+    three_body_interaction_data *p_ijk, *p_jkl;
+
+    reax_list *bonds = &( p_bonds );
+    reax_list *thb_intrs = &( p_thb_intrs );
+    storage *workspace = &( p_workspace );
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= n) return;
+
+    real p_tor2 = gp.l[23];
+    real p_tor3 = gp.l[24];
+    real p_tor4 = gp.l[25];
+    real p_cot2 = gp.l[27];
+    // char  fname[100];
+    // FILE *ftor;
+
+    // sprintf( fname, "tor%d.out", system->my_rank );
+    // ftor = fopen( fname, "w" );
+
+    //natoms = system->n;
+
+    //for( j = 0; j < natoms; ++j ) {
+    type_j = my_atoms[j].type;
+    Delta_j = workspace->Delta_boc[j];
+    start_j = Dev_Start_Index(j, bonds);
+    end_j = Dev_End_Index(j, bonds);
+
+    for( pk = start_j; pk < end_j; ++pk ) {
+        pbond_jk = &( bonds->select.bond_list[pk] );
+        k = pbond_jk->nbr;
+        bo_jk = &( pbond_jk->bo_data );
+        BOA_jk = bo_jk->BO - control->thb_cut;
+
+        /* see if there are any 3-body interactions involving j&k
+           where j is the central atom. Otherwise there is no point in
+           trying to form a 4-body interaction out of this neighborhood */
+        if( my_atoms[j].orig_id < my_atoms[k].orig_id && 
+                bo_jk->BO > control->thb_cut/*0*/ && Dev_Num_Entries(pk, thb_intrs) ) {
+            start_k = Dev_Start_Index(k, bonds);
+            end_k = Dev_End_Index(k, bonds);               
+            pj = pbond_jk->sym_index; // pj points to j on k's list
+
+            /* do the same check as above: 
+               are there any 3-body interactions involving k&j 
+               where k is the central atom */
+            if( Dev_Num_Entries(pj, thb_intrs) ) {
+                type_k = my_atoms[k].type;
+                Delta_k = workspace->Delta_boc[k];
+                r_jk = pbond_jk->d;
+
+                start_pk = Dev_Start_Index(pk, thb_intrs );
+                end_pk = Dev_End_Index(pk, thb_intrs );
+                start_pj = Dev_Start_Index(pj, thb_intrs );
+                end_pj = Dev_End_Index(pj, thb_intrs );        
+
+                exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
+                exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
+                exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
+                exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
+                exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
+                f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+
+
+                /* pick i up from j-k interaction where j is the central atom */
+                for( pi = start_pk; pi < end_pk; ++pi ) {
+                    p_ijk = &( thb_intrs->select.three_body_list[pi] );
+                    pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
+                    pbond_ij = &( bonds->select.bond_list[pij] );
+                    bo_ij = &( pbond_ij->bo_data );
+
+
+                    if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                        i = p_ijk->thb;
+                        type_i = my_atoms[i].type;
+                        r_ij = pbond_ij->d;
+                        BOA_ij = bo_ij->BO - control->thb_cut;
+
+                        theta_ijk = p_ijk->theta;
+                        sin_ijk = SIN( theta_ijk );
+                        cos_ijk = COS( theta_ijk );
+                        //tan_ijk_i = 1. / TAN( theta_ijk );
+                        if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                            tan_ijk_i = cos_ijk / MIN_SINE;
+                        else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                            tan_ijk_i = cos_ijk / -MIN_SINE;
+                        else tan_ijk_i = cos_ijk / sin_ijk;
+
+                        exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
+                        exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
+
+
+                        /* pick l up from j-k interaction where k is the central atom */
+                        for( pl = start_pj; pl < end_pj; ++pl ) {
+                            p_jkl = &( thb_intrs->select.three_body_list[pl] );
+                            l = p_jkl->thb;
+                            plk = p_jkl->pthb; //pointer to l on k's bond_list!
+                            pbond_kl = &( bonds->select.bond_list[plk] );
+                            bo_kl = &( pbond_kl->bo_data );
+                            type_l = my_atoms[l].type;
+                            fbh = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)]);
+                            fbp = &(d_fbp[index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
+
+
+                            if( i != l && fbh->cnt && 
+                                    bo_kl->BO > control->thb_cut/*0*/ &&
+                                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                                ++num_frb_intrs;
+                                r_kl = pbond_kl->d;
+                                BOA_kl = bo_kl->BO - control->thb_cut;
+
+                                theta_jkl = p_jkl->theta;
+                                sin_jkl = SIN( theta_jkl );
+                                cos_jkl = COS( theta_jkl );
+                                //tan_jkl_i = 1. / TAN( theta_jkl );
+                                if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                    tan_jkl_i = cos_jkl / MIN_SINE;
+                                else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                    tan_jkl_i = cos_jkl / -MIN_SINE;
+                                else tan_jkl_i = cos_jkl /sin_jkl;
+
+                                rvec_ScaledSum( dvec_li, 1., my_atoms[i].x, 
+                                        -1., my_atoms[l].x );
+                                r_li = rvec_Norm( dvec_li );                 
+
+
+                                /* omega and its derivative */
+                                omega = Calculate_Omega( pbond_ij->dvec, r_ij, 
+                                        pbond_jk->dvec, r_jk, 
+                                        pbond_kl->dvec, r_kl,
+                                        dvec_li, r_li,
+                                        p_ijk, p_jkl,
+                                        dcos_omega_di, dcos_omega_dj,
+                                        dcos_omega_dk, dcos_omega_dl,
+                                        NULL);
+
+                                cos_omega = COS( omega );
+                                cos2omega = COS( 2. * omega );
+                                cos3omega = COS( 3. * omega );
+                                /* end omega calculations */
+
+                                /* torsion energy */
+                                exp_tor1 = EXP( fbp->p_tor1 * 
+                                        SQR(2.0 - bo_jk->BO_pi - f11_DjDk) );
+                                exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
+                                exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl - 1.5) );
+                                fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
+                                    (1.0 - exp_tor2_kl);
+
+                                CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
+                                        fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                        fbp->V3 * (1.0 + cos3omega) );
+
+                                data_e_tor [j] += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+
+                                dfn11 = (-p_tor3 * exp_tor3_DjDk +
+                                        (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                        (2.0 + exp_tor3_DjDk) * exp_tor34_inv) * 
+                                    exp_tor34_inv;
+
+                                CEtors1 = sin_ijk * sin_jkl * CV;
+
+                                CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 *
+                                    (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) *
+                                    sin_ijk * sin_jkl; 
+                                CEtors3 = CEtors2 * dfn11;
+
+                                CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
+                                    (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+                                CEtors5 = CEtors1 * p_tor2 * 
+                                    (1.0 - exp_tor2_ij) * exp_tor2_jk * (1.0 - exp_tor2_kl);
+                                CEtors6 = CEtors1 * p_tor2 * 
+                                    (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * exp_tor2_kl;
+
+                                cmn = -fn10 * CV;
+                                CEtors7 = cmn * sin_jkl * tan_ijk_i;
+                                CEtors8 = cmn * sin_ijk * tan_jkl_i;
+
+                                CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                    (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                     1.5 * fbp->V3 * (cos2omega + 2.0 * SQR(cos_omega)));
+                                /* end  of torsion energy */
+
+
+                                /* 4-body conjugation energy */
+                                fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
+                                data_e_con [j] += e_con =
+                                    fbp->p_cot1 * fn12 * 
+                                    (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+
+                                Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
+                                    (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jkl);
+
+                                CEconj1 = Cconj * (BOA_ij - 1.5e0);
+                                CEconj2 = Cconj * (BOA_jk - 1.5e0);
+                                CEconj3 = Cconj * (BOA_kl - 1.5e0);
+
+                                CEconj4 = -fbp->p_cot1 * fn12 * 
+                                    (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                CEconj5 = -fbp->p_cot1 * fn12 * 
+                                    (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
+                                    cos_omega * sin_ijk * sin_jkl;
+                                /* end 4-body conjugation energy */
+
+                                /* forces */
+                                /*
+                                   bo_jk->Cdbopi += CEtors2;
+                                   workspace->CdDelta[j] += CEtors3;
+                                   workspace->CdDelta[k] += CEtors3;
+                                   bo_ij->Cdbo += (CEtors4 + CEconj1);
+                                   bo_jk->Cdbo += (CEtors5 + CEconj2);
+                                   bo_kl->Cdbo += (CEtors6 + CEconj3);
+                                 */
+                                bo_jk->Cdbopi += CEtors2;
+                                workspace->CdDelta[j] += CEtors3;
+                                pbond_jk->ta_CdDelta += CEtors3;
+                                bo_ij->Cdbo += (CEtors4 + CEconj1);
+                                bo_jk->Cdbo += (CEtors5 + CEconj2);
+                                atomicAdd ( &pbond_kl->ta_Cdbo, (CEtors6 + CEconj3));
+
+                                if( control->virial == 0 ) {
+                                    /* dcos_theta_ijk */
+                                    //rvec_ScaledAdd( workspace->f[i], 
+                                    atomic_rvecScaledAdd( pbond_ij->ta_f, 
+                                            CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    rvec_ScaledAdd( workspace->f[j], 
+                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                    //rvec_ScaledAdd( workspace->f[k], 
+                                    atomic_rvecScaledAdd( pbond_jk->ta_f,
+                                            CEtors7 + CEconj4, p_ijk->dcos_di );
+
+                                    /* dcos_theta_jkl */
+                                    rvec_ScaledAdd( workspace->f[j], 
+                                            CEtors8 + CEconj5, p_jkl->dcos_di );
+                                    //rvec_ScaledAdd( workspace->f[k], 
+                                    atomic_rvecScaledAdd( pbond_jk->ta_f,
+                                            CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    //rvec_ScaledAdd( workspace->f[l], 
+                                    atomic_rvecScaledAdd( pbond_kl->ta_f, 
+                                            CEtors8 + CEconj5, p_jkl->dcos_dk );
+
+                                    /* dcos_omega */
+                                    //rvec_ScaledAdd( workspace->f[i], 
+                                    atomic_rvecScaledAdd( pbond_ij->ta_f,
+                                            CEtors9 + CEconj6, dcos_omega_di );
+                                    rvec_ScaledAdd( workspace->f[j], 
+                                            CEtors9 + CEconj6, dcos_omega_dj );
+                                    //rvec_ScaledAdd( workspace->f[k], 
+                                    atomic_rvecScaledAdd( pbond_jk->ta_f,
+                                            CEtors9 + CEconj6, dcos_omega_dk );
+                                    //rvec_ScaledAdd( workspace->f[l], 
+                                    atomic_rvecScaledAdd( pbond_kl->ta_f,
+                                            CEtors9 + CEconj6, dcos_omega_dl );
+                                }
+                                else {
+                                    ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
+
+                                    /* dcos_theta_ijk */
+                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    //rvec_Add( workspace->f[i], force );
+                                    atomic_rvecAdd( pbond_ij->ta_f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                    rvec_Add( data_ext_press [j], ext_press );
+
+                                    rvec_ScaledAdd( workspace->f[j], 
+                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
+
+                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                    //rvec_Add( workspace->f[k], force );
+                                    atomic_rvecAdd( pbond_jk->ta_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( data_ext_press[j], ext_press );
+
+
+                                    /* dcos_theta_jkl */
+                                    rvec_ScaledAdd( workspace->f[j], 
+                                            CEtors8 + CEconj5, p_jkl->dcos_di );
+
+                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    //rvec_Add( workspace->f[k], force );
+                                    atomic_rvecAdd( pbond_jk->ta_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( data_ext_press [j], ext_press );
+
+                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                    //rvec_Add( workspace->f[l], force );
+                                    rvec_Add( pbond_kl->ta_f, force );
+                                    rvec_iMultiply( ext_press, rel_box_jl, force );
+                                    rvec_Add( data_ext_press [j], ext_press );
+
+
+                                    /* dcos_omega */                      
+                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
+                                    //rvec_Add( workspace->f[i], force );
+                                    atomic_rvecAdd( pbond_ij->ta_f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                    rvec_Add( data_ext_press [j], ext_press );
+
+                                    rvec_ScaledAdd( workspace->f[j], 
+                                            CEtors9 + CEconj6, dcos_omega_dj );
+
+                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
+                                    //rvec_Add( workspace->f[k], force );
+                                    rvec_Add( pbond_jk->ta_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( data_ext_press [j], ext_press );
+
+                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
+                                    //rvec_Add( workspace->f[l], force );
+                                    rvec_Add( pbond_kl->ta_f, force );
+                                    rvec_iMultiply( ext_press, rel_box_jl, force );
+                                    rvec_Add( data_ext_press [j], ext_press );
+                                }
 
 #ifdef TEST_ENERGY
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   r_ij, r_jk, r_kl, cos_ijk, cos_jkl, sin_ijk, sin_jkl );
-								   fprintf( out_control->etor, "%12.8f\n", dfn11 ); */
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   CEtors2, CEtors3, CEtors4, CEtors5, CEtors6, 
-								   CEtors7, CEtors8, CEtors9 ); */
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); */
-
-								/* fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n",
-								   fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
-
-								fprintf(out_control->etor, 
-										//"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e\n", 
-										"%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f\n", 
-										system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, 
-										system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, 
-										RAD2DEG(omega), BOA_jk, e_tor, data->my_en.e_tor );
-
-								fprintf(out_control->econ, 
-										//"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", 
-										"%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", 
-										system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, 
-										system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, 
-										RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-										e_con, data->my_en.e_con );
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   r_ij, r_jk, r_kl, cos_ijk, cos_jkl, sin_ijk, sin_jkl );
+                                   fprintf( out_control->etor, "%12.8f\n", dfn11 ); */
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   CEtors2, CEtors3, CEtors4, CEtors5, CEtors6, 
+                                   CEtors7, CEtors8, CEtors9 ); */
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 ); */
+
+                                /* fprintf( out_control->etor, "%12.6f%12.6f%12.6f%12.6f\n",
+                                   fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
+
+                                fprintf(out_control->etor, 
+                                        //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e\n", 
+                                        "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f\n", 
+                                        system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, 
+                                        system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, 
+                                        RAD2DEG(omega), BOA_jk, e_tor, data->my_en.e_tor );
+
+                                fprintf(out_control->econ, 
+                                        //"%6d%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n", 
+                                        "%6d%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n", 
+                                        system->my_atoms[i].orig_id,system->my_atoms[j].orig_id, 
+                                        system->my_atoms[k].orig_id,system->my_atoms[l].orig_id, 
+                                        RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
+                                        e_con, data->my_en.e_con );
 #endif
 
 #ifdef TEST_FORCES
-								/* Torsion Forces */
-								Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.0, 
-										workspace->f_tor, workspace->f_tor );
-								Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
-								Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
-								Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
-								Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
-								Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
-
-								rvec_ScaledAdd( workspace->f_tor[i], 
-										CEtors7, p_ijk->dcos_dk );
-								rvec_ScaledAdd( workspace->f_tor[j], 
-										CEtors7, p_ijk->dcos_dj );
-								rvec_ScaledAdd( workspace->f_tor[k], 
-										CEtors7, p_ijk->dcos_di );
-
-								rvec_ScaledAdd( workspace->f_tor[j], 
-										CEtors8, p_jkl->dcos_di );
-								rvec_ScaledAdd( workspace->f_tor[k], 
-										CEtors8, p_jkl->dcos_dj );
-								rvec_ScaledAdd( workspace->f_tor[l], 
-										CEtors8, p_jkl->dcos_dk );
-
-								rvec_ScaledAdd( workspace->f_tor[i], 
-										CEtors9, dcos_omega_di );
-								rvec_ScaledAdd( workspace->f_tor[j], 
-										CEtors9, dcos_omega_dj );
-								rvec_ScaledAdd( workspace->f_tor[k], 
-										CEtors9, dcos_omega_dk );
-								rvec_ScaledAdd( workspace->f_tor[l], 
-										CEtors9, dcos_omega_dl );
-
-								/* Conjugation Forces */
-								Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
-								Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
-								Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
-
-								rvec_ScaledAdd( workspace->f_con[i], 
-										CEconj4, p_ijk->dcos_dk );
-								rvec_ScaledAdd( workspace->f_con[j], 
-										CEconj4, p_ijk->dcos_dj );
-								rvec_ScaledAdd( workspace->f_con[k], 
-										CEconj4, p_ijk->dcos_di );
-
-								rvec_ScaledAdd( workspace->f_con[j], 
-										CEconj5, p_jkl->dcos_di );
-								rvec_ScaledAdd( workspace->f_con[k], 
-										CEconj5, p_jkl->dcos_dj );
-								rvec_ScaledAdd( workspace->f_con[l], 
-										CEconj5, p_jkl->dcos_dk );
-
-								rvec_ScaledAdd( workspace->f_con[i], 
-										CEconj6, dcos_omega_di );
-								rvec_ScaledAdd( workspace->f_con[j], 
-										CEconj6, dcos_omega_dj );
-								rvec_ScaledAdd( workspace->f_con[k], 
-										CEconj6, dcos_omega_dk );
-								rvec_ScaledAdd( workspace->f_con[l], 
-										CEconj6, dcos_omega_dl );
+                                /* Torsion Forces */
+                                Add_dBOpinpi2( system, lists, j, pk, CEtors2, 0.0, 
+                                        workspace->f_tor, workspace->f_tor );
+                                Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
+                                Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
+                                Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
+                                Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
+                                Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
+
+                                rvec_ScaledAdd( workspace->f_tor[i], 
+                                        CEtors7, p_ijk->dcos_dk );
+                                rvec_ScaledAdd( workspace->f_tor[j], 
+                                        CEtors7, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( workspace->f_tor[k], 
+                                        CEtors7, p_ijk->dcos_di );
+
+                                rvec_ScaledAdd( workspace->f_tor[j], 
+                                        CEtors8, p_jkl->dcos_di );
+                                rvec_ScaledAdd( workspace->f_tor[k], 
+                                        CEtors8, p_jkl->dcos_dj );
+                                rvec_ScaledAdd( workspace->f_tor[l], 
+                                        CEtors8, p_jkl->dcos_dk );
+
+                                rvec_ScaledAdd( workspace->f_tor[i], 
+                                        CEtors9, dcos_omega_di );
+                                rvec_ScaledAdd( workspace->f_tor[j], 
+                                        CEtors9, dcos_omega_dj );
+                                rvec_ScaledAdd( workspace->f_tor[k], 
+                                        CEtors9, dcos_omega_dk );
+                                rvec_ScaledAdd( workspace->f_tor[l], 
+                                        CEtors9, dcos_omega_dl );
+
+                                /* Conjugation Forces */
+                                Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
+                                Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
+                                Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
+
+                                rvec_ScaledAdd( workspace->f_con[i], 
+                                        CEconj4, p_ijk->dcos_dk );
+                                rvec_ScaledAdd( workspace->f_con[j], 
+                                        CEconj4, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( workspace->f_con[k], 
+                                        CEconj4, p_ijk->dcos_di );
+
+                                rvec_ScaledAdd( workspace->f_con[j], 
+                                        CEconj5, p_jkl->dcos_di );
+                                rvec_ScaledAdd( workspace->f_con[k], 
+                                        CEconj5, p_jkl->dcos_dj );
+                                rvec_ScaledAdd( workspace->f_con[l], 
+                                        CEconj5, p_jkl->dcos_dk );
+
+                                rvec_ScaledAdd( workspace->f_con[i], 
+                                        CEconj6, dcos_omega_di );
+                                rvec_ScaledAdd( workspace->f_con[j], 
+                                        CEconj6, dcos_omega_dj );
+                                rvec_ScaledAdd( workspace->f_con[k], 
+                                        CEconj6, dcos_omega_dk );
+                                rvec_ScaledAdd( workspace->f_con[l], 
+                                        CEconj6, dcos_omega_dl );
 #endif
-							} // pl check ends
-						} // pl loop ends
-					} // pi check ends
-				} // pi loop ends
-			} // k-j neighbor check ends
-		} // j<k && j-k neighbor check ends
-	} // pk loop ends
-	//  } // j loop
+                            } // pl check ends
+                        } // pl loop ends
+                    } // pi check ends
+                } // pi loop ends
+            } // k-j neighbor check ends
+        } // j<k && j-k neighbor check ends
+    } // pk loop ends
+    //  } // j loop
 }
 
 CUDA_GLOBAL void Cuda_Torsion_Angles_PostProcess ( reax_atom *my_atoms, 
-		storage p_workspace, 
-		reax_list p_bonds, int N )
+        storage p_workspace, // workspace whose CdDelta[i] and f[i] entries are accumulated into below
+        reax_list p_bonds, int N ) // p_bonds: bond list to walk; N: threads with index >= N exit immediately
 {
-	int i, pj;
+    int i, pj; // i: index handled by this thread; pj: cursor over i's entries in the bond list
 
-	bond_data *pbond;
-	bond_data *sym_index_bond;
-	bond_order_data *bo_data;
+    bond_data *pbond;
+    bond_data *sym_index_bond;
+    bond_order_data *bo_data;
 
-	reax_list *bonds = &p_bonds;
-	storage *workspace = &p_workspace;
+    reax_list *bonds = &p_bonds;
+    storage *workspace = &p_workspace;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= N) return;
+    i = blockIdx.x * blockDim.x + threadIdx.x; // flat 1D global thread index
+    if ( i >= N) return; // bounds guard for threads past the end of the range
 
-	for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+    for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ // every bond entry of i
 
-		pbond = &(bonds->select.bond_list[pj]);
-		bo_data = &pbond->bo_data;
-		sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); 
+        pbond = &(bonds->select.bond_list[pj]);
+        bo_data = &pbond->bo_data;
+        sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); // entry at pbond->sym_index; presumably the same bond as stored in the neighbor's list -- confirm
 
-		workspace->CdDelta [i] += sym_index_bond->ta_CdDelta;
+        workspace->CdDelta [i] += sym_index_bond->ta_CdDelta; // gather the ta_CdDelta term stored on the symmetric entry
 
-		bo_data->Cdbo += pbond->ta_Cdbo;
+        bo_data->Cdbo += pbond->ta_Cdbo; // gather the ta_Cdbo term stored on this entry itself
 
-		//update f vector
-		//rvec_Add (my_atoms [i].f, sym_index_bond->ta_f ); 
-		rvec_Add (workspace->f[i], sym_index_bond->ta_f ); 
-	}
+        // update the f vector of i with the ta_f term stored on the symmetric entry
+        //rvec_Add (my_atoms [i].f, sym_index_bond->ta_f ); 
+        rvec_Add (workspace->f[i], sym_index_bond->ta_f ); 
+    }
 }
diff --git a/PG-PuReMD/src/cuda_utils.cu b/PG-PuReMD/src/cuda_utils.cu
index 3f304e59..dcd8d61f 100644
--- a/PG-PuReMD/src/cuda_utils.cu
+++ b/PG-PuReMD/src/cuda_utils.cu
@@ -2,114 +2,114 @@
 
 extern "C" void cuda_malloc (void **ptr, int size, int memset, char *msg) {
 
-	cudaError_t retVal = cudaSuccess;
-
-	retVal = cudaMalloc (ptr, size);
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "Failed to allocate memory on device for the res: %s...  exiting with code: %d size: %d \n", 
-				msg, retVal, size);
-		exit (-1);
-	}  
-
-	if (memset) {
-		retVal = cudaMemset (*ptr, 0, size);
-		if (retVal != cudaSuccess) {
-			fprintf (stderr, "Failed to memset memory on device for resource %s\n", 
-					msg);
-			exit (-1);
-		}
-	}  
+    cudaError_t retVal = cudaSuccess;
+
+    retVal = cudaMalloc (ptr, size);
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "Failed to allocate memory on device for the res: %s...  exiting with code: %d size: %d \n", 
+                msg, retVal, size);
+        exit (-1);
+    }  
+
+    if (memset) {
+        retVal = cudaMemset (*ptr, 0, size);
+        if (retVal != cudaSuccess) {
+            fprintf (stderr, "Failed to memset memory on device for resource %s\n", 
+                    msg);
+            exit (-1);
+        }
+    }  
 }
 
 extern "C" void cuda_free (void *ptr, char *msg) {
 
-	cudaError_t retVal = cudaSuccess;
-	if (!ptr) return;
+    cudaError_t retVal = cudaSuccess;
+    if (!ptr) return;
 
-	retVal = cudaFree (ptr);
+    retVal = cudaFree (ptr);
 
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "Failed to release memory on device for res %s... exiting with code %d -- Address %ld\n", 
-				msg, retVal, (long int) ptr);
-		return;
-	}  
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "Failed to release memory on device for res %s... exiting with code %d -- Address %ld\n", 
+                msg, retVal, (long int) ptr);
+        return;
+    }  
 }
 
 extern "C" void cuda_memset (void *ptr, int data, size_t count, char *msg){
-	cudaError_t retVal = cudaSuccess;
-
-	retVal = cudaMemset (ptr, data, count);
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "Failed to memset memory on device for %s, cuda code %d\n", 
-				msg, retVal);
-		exit (-1);
-	}
+    cudaError_t retVal = cudaSuccess;
+
+    retVal = cudaMemset (ptr, data, count);
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "Failed to memset memory on device for %s, cuda code %d\n", 
+                msg, retVal);
+        exit (-1);
+    }
 }
 
 extern "C" void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, char *msg)
 {
-	cudaError_t	retVal = cudaErrorNotReady;
-
-	if (dir == cudaMemcpyHostToDevice)
-		retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
-	else
-		retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
-
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "could not copy resource %s from host to device: reason %d \n",
-				msg, retVal);
-		exit (-1);
-	}
+    cudaError_t    retVal = cudaErrorNotReady;
+
+    if (dir == cudaMemcpyHostToDevice)
+        retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
+    else
+        retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
+
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "could not copy resource %s from host to device: reason %d \n",
+                msg, retVal);
+        exit (-1);
+    }
 }
 
 extern "C" void copy_device (void *dest, void *src, int size, char *msg)
 {
-	cudaError_t	retVal = cudaErrorNotReady;
-
-	retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "could not copy resource %s from device to device: reason %d \n",
-				msg, retVal);
-		exit (-1);
-	}
+    cudaError_t    retVal = cudaErrorNotReady;
+
+    retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "could not copy resource %s from device to device: reason %d \n",
+                msg, retVal);
+        exit (-1);
+    }
 }
 
 extern "C" void compute_blocks ( int *blocks, int *block_size, int count )
 {
-	*block_size = CUDA_BLOCK_SIZE;
-	*blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1);
+    *block_size = CUDA_BLOCK_SIZE;
+    *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1);
 }
 
 extern "C" void compute_matvec_blocks ( int *blocks, int count )
 {
-	*blocks = ((count * MATVEC_KER_THREADS_PER_ROW) / MATVEC_BLOCK_SIZE) + 
-		(((count * MATVEC_KER_THREADS_PER_ROW) % MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
+    *blocks = ((count * MATVEC_KER_THREADS_PER_ROW) / MATVEC_BLOCK_SIZE) + 
+        (((count * MATVEC_KER_THREADS_PER_ROW) % MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
 }
 
 extern "C" void compute_nearest_pow_2 (int blocks, int *result)
 {
-	int power = 1;
-	while (power < blocks) power *= 2;
+    int power = 1;
+    while (power < blocks) power *= 2;
 
-	*result = power;
+    *result = power;
 }
 
 void print_info ()
 {
-	size_t total, free;
-	cudaMemGetInfo (&free, &total);
-	if (cudaGetLastError () != cudaSuccess )
-	{
-		fprintf (stderr, "Error on the memory call \n");
-		return;
-	}
-
-	fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", 
-			total, total/(1024*1024), total/ (1024*1024*1024), 
-			free, free/(1024*1024), free/ (1024*1024*1024) );
+    size_t total, free;
+    cudaMemGetInfo (&free, &total);
+    if (cudaGetLastError () != cudaSuccess )
+    {
+        fprintf (stderr, "Error on the memory call \n");
+        return;
+    }
+
+    fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", 
+            total, total/(1024*1024), total/ (1024*1024*1024), 
+            free, free/(1024*1024), free/ (1024*1024*1024) );
 }
 
 extern "C" void print_device_mem_usage ()
 {
-	print_info ();
+    print_info ();
 }
diff --git a/PG-PuReMD/src/cuda_valence_angles.cu b/PG-PuReMD/src/cuda_valence_angles.cu
index 18dfb16c..b7e62c90 100644
--- a/PG-PuReMD/src/cuda_valence_angles.cu
+++ b/PG-PuReMD/src/cuda_valence_angles.cu
@@ -29,586 +29,586 @@
 /* this is a 3-body interaction in which the main role is 
    played by j which sits in the middle of the other two. */
 CUDA_GLOBAL void Cuda_Valence_Angles( reax_atom *my_atoms, 
-		global_parameters gp, 
-		single_body_parameters *sbp, 
-		three_body_header *d_thbh, 
-		control_params *control, 
-		storage p_workspace, 
-		reax_list p_bonds, reax_list p_thb_intrs, 
-		int n, int N, int num_atom_types, 
-		real *data_e_ang, real *data_e_pen, real *data_e_coa, 
-		rvec *my_ext_press
-		)
+        global_parameters gp, 
+        single_body_parameters *sbp, 
+        three_body_header *d_thbh, 
+        control_params *control, 
+        storage p_workspace, 
+        reax_list p_bonds, reax_list p_thb_intrs, 
+        int n, int N, int num_atom_types, 
+        real *data_e_ang, real *data_e_pen, real *data_e_coa, 
+        rvec *my_ext_press
+        )
 {
-	int i, j, pi, k, pk, t;
-	int type_i, type_j, type_k;
-	int start_j, end_j, start_pk, end_pk;
-	int cnt, num_thb_intrs;
-
-	real temp, temp_bo_jt, pBOjt7;
-	real p_val1, p_val2, p_val3, p_val4, p_val5;
-	real p_val6, p_val7, p_val8, p_val9, p_val10;
-	real p_pen1, p_pen2, p_pen3, p_pen4;
-	real p_coa1, p_coa2, p_coa3, p_coa4;
-	real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-	real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-	real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
-	real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-	real CEpen1, CEpen2, CEpen3;
-	real e_ang, e_coa, e_pen;
-	real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-	real Cf7ij, Cf7jk, Cf8j, Cf9j;
-	real f7_ij, f7_jk, f8_Dj, f9_Dj;
-	real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-	real r_ij, r_jk;
-	real BOA_ij, BOA_jk;
-	rvec force, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-
-	three_body_header *thbh;
-	three_body_parameters *thbp;
-	three_body_interaction_data *p_ijk, *p_kji;
-	bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-	bond_order_data *bo_ij, *bo_jk, *bo_jt;
-
-	reax_list *bonds = &( p_bonds );
-	reax_list *thb_intrs =  &( p_thb_intrs );
-	storage *workspace = &( p_workspace );
-
-	/* global parameters used in these calculations */
-	p_val6 = gp.l[14];
-	p_val8 = gp.l[33];
-	p_val9 = gp.l[16];
-	p_val10 = gp.l[17];
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= N) return;
-
-	//num_thb_intrs = j * THREE_BODY_OFFSET;
-
-	//for( j = 0; j < system->N; ++j ) {
-	// fprintf( out_control->eval, "j: %d\n", j );
-	type_j = my_atoms[j].type;
-	start_j = Dev_Start_Index(j, bonds);
-	end_j = Dev_End_Index(j, bonds);
-
-	p_val3 = sbp[ type_j ].p_val3;
-	p_val5 = sbp[ type_j ].p_val5;
-
-	SBOp = 0, prod_SBO = 1;
-	for( t = start_j; t < end_j; ++t ) {
-		bo_jt = &(bonds->select.bond_list[t].bo_data);
-		SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-		temp = SQR( bo_jt->BO );
-		temp *= temp; 
-		temp *= temp;
-		prod_SBO *= EXP( -temp );
-	}
-
-	/* modifications to match Adri's code - 09/01/09 */
-	if( workspace->vlpex[j] >= 0 ){
-		vlpadj = 0;
-		dSBO2 = prod_SBO - 1;
-	}
-	else{
-		vlpadj = workspace->nlp[j];
-		dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-	}
-
-	SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-	dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
-
-	if( SBO <= 0 )
-		SBO2 = 0, CSBO2 = 0;
-	else if( SBO > 0 && SBO <= 1 ) {
-		SBO2 = POW( SBO, p_val9 );
-		CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-	}
-	else if( SBO > 1 && SBO < 2 ) {
-		SBO2 = 2 - POW( 2-SBO, p_val9 );
-		CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-	}
-	else 
-		SBO2 = 2, CSBO2 = 0;  
-
-	expval6 = EXP( p_val6 * workspace->Delta_boc[j] );    
-
-	for( pi = start_j; pi < end_j; ++pi ) {
-
-		//num_thb_intrs = pi * THREE_BODY_OFFSET;
-		//Dev_Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-		num_thb_intrs = Dev_Start_Index (pi, thb_intrs);
-
-		pbond_ij = &(bonds->select.bond_list[pi]);
-		bo_ij = &(pbond_ij->bo_data);
-		BOA_ij = bo_ij->BO - control->thb_cut;
-
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-		//TODO REMOVE THIS
-
-		if( BOA_ij/*bo_ij->BO*/ > 0.0 && 
-				( j < n || pbond_ij->nbr < n ) ) {
-			//if( BOA_ij/*bo_ij->BO*/ > 0.0) {
-			i = pbond_ij->nbr;
-			r_ij = pbond_ij->d;	 
-			type_i = my_atoms[i].type;
-			// fprintf( out_control->eval, "i: %d\n", i );
-
-
-			/* first copy 3-body intrs from previously computed ones where i>k.
-			   in the second for-loop below, 
-			   we compute only new 3-body intrs where i < k */
-
-			for( pk = start_j; pk < pi; ++pk ) {
-				// fprintf( out_control->eval, "pk: %d\n", pk );
-				start_pk = Dev_Start_Index( pk, thb_intrs );
-				end_pk = Dev_End_Index( pk, thb_intrs );
-
-				for( t = start_pk; t < end_pk; ++t )
-					if( thb_intrs->select.three_body_list[t].thb == i ) {
-						p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] );
-						p_kji = &(thb_intrs->select.three_body_list[t]);
-
-						p_ijk->thb = bonds->select.bond_list[pk].nbr;
-						p_ijk->pthb  = pk;
-						p_ijk->theta = p_kji->theta;			  
-						rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-						rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-						rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-
-						++num_thb_intrs;
-						break;
-					}
-			}
-
-
-
-			/* and this is the second for loop mentioned above */
-			for( pk = pi+1; pk < end_j; ++pk ) {
-				//for( pk = start_j; pk < end_j; ++pk ) {
-				if (pk == pi) continue;
-				pbond_jk = &(bonds->select.bond_list[pk]);
-				bo_jk    = &(pbond_jk->bo_data);
-				BOA_jk   = bo_jk->BO - control->thb_cut;
-				k        = pbond_jk->nbr;
-				type_k   = my_atoms[k].type;
-				p_ijk    = &( thb_intrs->select.three_body_list[num_thb_intrs] );
-
-				//CHANGE ORIGINAL
-				if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
-				//if ((BOA_jk <= 0) ) continue;
-				//CHANGE ORIGINAL
-
-				Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-						pbond_jk->dvec, pbond_jk->d,
-						&theta, &cos_theta );
-
-				Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-						pbond_jk->dvec, pbond_jk->d, 
-						&(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-						&(p_ijk->dcos_dk) );
-				p_ijk->thb = k;
-				p_ijk->pthb = pk;
-				p_ijk->theta = theta;
-
-				sin_theta = SIN( theta );
-				if( sin_theta < 1.0e-5 )
-					sin_theta = 1.0e-5;
-
-				++num_thb_intrs;
-
-
-				if( (j < n) && (BOA_jk > 0.0) && 
-						(bo_ij->BO * bo_jk->BO > SQR(control->thb_cut)/*0*/) ) {
-					r_jk = pbond_jk->d;		      
-					thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
-
-					/* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id )
-					   fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-					   system->my_atoms[i].orig_id, 
-					   system->my_atoms[j].orig_id, 
-					   system->my_atoms[k].orig_id,
-					   bo_ij->BO, bo_jk->BO, p_ijk->theta );
-					   else 
-					   fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-					   system->my_atoms[k].orig_id,
-					   system->my_atoms[j].orig_id, 
-					   system->my_atoms[i].orig_id, 
-					   bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
-
-					for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-						// fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n", 
-						//          i+1, j+1, k+1 );
-
-						if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
-							thbp = &( thbh->prm[cnt] );			     
-
-							/* ANGLE ENERGY */
-							p_val1 = thbp->p_val1;
-							p_val2 = thbp->p_val2;
-							p_val4 = thbp->p_val4;
-							p_val7 = thbp->p_val7;
-							theta_00 = thbp->theta_00;
-
-							exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-							f7_ij = 1.0 - exp3ij;
-							Cf7ij = p_val3 * p_val4 * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-
-							exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-							f7_jk = 1.0 - exp3jk;
-							Cf7jk = p_val3 * p_val4 * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-
-							expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-							trm8 = 1.0 + expval6 + expval7;
-							f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-							Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-								( p_val6 * expval6 * trm8 - 
-								  (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) );
-
-							theta_0 = 180.0 - theta_00 * (1.0 - 
-									EXP(-p_val10 * (2.0 - SBO2)));
-							theta_0 = DEG2RAD( theta_0 );		      
-
-							expval2theta  = EXP( -p_val2 * SQR(theta_0 - theta) );
-							if( p_val1 >= 0 ) 
-								expval12theta = p_val1 * (1.0 - expval2theta);
-							else // To avoid linear Me-H-Me angles (6/6/06)
-								expval12theta = p_val1 * -expval2theta;
-
-							CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-							CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-							CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-							CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-								expval2theta * (theta_0 - theta);
-
-							Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-								exp( -p_val10 * (2.0 - SBO2) );
-
-							CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-							CEval6 = CEval5 * dSBO1;
-							CEval7 = CEval5 * dSBO2;
-							CEval8 = -CEval4 / sin_theta;
-
-							data_e_ang [j] += e_ang = 
-								f7_ij * f7_jk * f8_Dj * expval12theta;
-							/* END ANGLE ENERGY*/
-
-
-							/* PENALTY ENERGY */
-							p_pen1 = thbp->p_pen1;
-							p_pen2 = gp.l[19];
-							p_pen3 = gp.l[20];
-							p_pen4 = gp.l[21];
-
-							exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-							exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-							exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-							exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-							trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-							f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-							Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 - 
-									(2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + 
-										p_pen4 * exp_pen4 ) ) / 
-								SQR( trm_pen34 );
-
-							data_e_pen [j] += e_pen = 
-								p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-
-							CEpen1 = e_pen * Cf9j / f9_Dj;
-							temp   = -2.0 * p_pen2 * e_pen;
-							CEpen2 = temp * (BOA_ij - 2.0);
-							CEpen3 = temp * (BOA_jk - 2.0);
-							/* END PENALTY ENERGY */
-
-
-							/* COALITION ENERGY */
-							p_coa1 = thbp->p_coa1;
-							p_coa2 = gp.l[2];
-							p_coa3 = gp.l[38];
-							p_coa4 = gp.l[30];
-
-							exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-							data_e_coa [j] += e_coa = 
-								p_coa1 / (1. + exp_coa2) *
-								EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
-								EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
-								EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-								EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-
-							CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-							CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-							CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
-							CEcoa4 = -2 * p_coa3 * 
-								(workspace->total_bond_order[i]-BOA_ij) * e_coa;
-							CEcoa5 = -2 * p_coa3 * 
-								(workspace->total_bond_order[k]-BOA_jk) * e_coa;
-							/* END COALITION ENERGY */
-
-							/* FORCES */
-							/*
-							   bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-							   bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-							   workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
-							   workspace->CdDelta[i] += CEcoa4;
-							   workspace->CdDelta[k] += CEcoa5;		      
-							 */
-							bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
-							bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
-							workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
-							pbond_ij->va_CdDelta += CEcoa4;
-							pbond_jk->va_CdDelta += CEcoa5;
-
-
-							for( t = start_j; t < end_j; ++t ) {
-								pbond_jt = &( bonds->select.bond_list[t] );
-								bo_jt = &(pbond_jt->bo_data);
-								temp_bo_jt = bo_jt->BO;
-								temp = CUBE( temp_bo_jt );
-								pBOjt7 = temp * temp * temp_bo_jt; 
-
-								// fprintf( out_control->eval, "%6d%12.8f\n", 
-								// workspace->reverse_map[bonds->select.bond_list[t].nbr],
-								// (CEval6 * pBOjt7) );
-
-								bo_jt->Cdbo += (CEval6 * pBOjt7);
-								bo_jt->Cdbopi += CEval5;
-								bo_jt->Cdbopi2 += CEval5;
-							}		      
-
-
-							if( control->virial == 0 ) {
-								/*
-								   rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di );
-								   rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-								   rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk );
-								 */
-
-								rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
-								rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-								rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
-							}
-							else {
-								/* terms not related to bond order derivatives are
-								   added directly into forces and pressure vector/tensor */
-								rvec_Scale( force, CEval8, p_ijk->dcos_di );
-								//rvec_Add( workspace->f[i], force );
-								rvec_Add( pbond_ij->va_f, force );
-								rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-								//rvec_Add( data->my_ext_press, ext_press );
-								rvec_Add( my_ext_press [j], ext_press );
-
-								rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
-
-								rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-								//rvec_Add( workspace->f[k], force );
-								rvec_Add( pbond_jk->va_f, force );
-								rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-								rvec_Add( my_ext_press [j], ext_press );
-							}
+    int i, j, pi, k, pk, t;
+    int type_i, type_j, type_k;
+    int start_j, end_j, start_pk, end_pk;
+    int cnt, num_thb_intrs;
+
+    real temp, temp_bo_jt, pBOjt7;
+    real p_val1, p_val2, p_val3, p_val4, p_val5;
+    real p_val6, p_val7, p_val8, p_val9, p_val10;
+    real p_pen1, p_pen2, p_pen3, p_pen4;
+    real p_coa1, p_coa2, p_coa3, p_coa4;
+    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
+    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    real CEpen1, CEpen2, CEpen3;
+    real e_ang, e_coa, e_pen;
+    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    real Cf7ij, Cf7jk, Cf8j, Cf9j;
+    real f7_ij, f7_jk, f8_Dj, f9_Dj;
+    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    real r_ij, r_jk;
+    real BOA_ij, BOA_jk;
+    rvec force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk, *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+
+    reax_list *bonds = &( p_bonds );
+    reax_list *thb_intrs =  &( p_thb_intrs );
+    storage *workspace = &( p_workspace );
+
+    /* global parameters used in these calculations */
+    p_val6 = gp.l[14];
+    p_val8 = gp.l[33];
+    p_val9 = gp.l[16];
+    p_val10 = gp.l[17];
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+
+    //num_thb_intrs = j * THREE_BODY_OFFSET;
+
+    //for( j = 0; j < system->N; ++j ) {
+    // fprintf( out_control->eval, "j: %d\n", j );
+    type_j = my_atoms[j].type;
+    start_j = Dev_Start_Index(j, bonds);
+    end_j = Dev_End_Index(j, bonds);
+
+    p_val3 = sbp[ type_j ].p_val3;
+    p_val5 = sbp[ type_j ].p_val5;
+
+    SBOp = 0, prod_SBO = 1;
+    for( t = start_j; t < end_j; ++t ) {
+        bo_jt = &(bonds->select.bond_list[t].bo_data);
+        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+        temp = SQR( bo_jt->BO );
+        temp *= temp; 
+        temp *= temp;
+        prod_SBO *= EXP( -temp );
+    }
+
+    /* modifications to match Adri's code - 09/01/09 */
+    if( workspace->vlpex[j] >= 0 ){
+        vlpadj = 0;
+        dSBO2 = prod_SBO - 1;
+    }
+    else{
+        vlpadj = workspace->nlp[j];
+        dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+    }
+
+    SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+    dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+
+    if( SBO <= 0 )
+        SBO2 = 0, CSBO2 = 0;
+    else if( SBO > 0 && SBO <= 1 ) {
+        SBO2 = POW( SBO, p_val9 );
+        CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
+    }
+    else if( SBO > 1 && SBO < 2 ) {
+        SBO2 = 2 - POW( 2-SBO, p_val9 );
+        CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
+    }
+    else 
+        SBO2 = 2, CSBO2 = 0;  
+
+    expval6 = EXP( p_val6 * workspace->Delta_boc[j] );    
+
+    for( pi = start_j; pi < end_j; ++pi ) {
+
+        //num_thb_intrs = pi * THREE_BODY_OFFSET;
+        //Dev_Set_Start_Index( pi, num_thb_intrs, thb_intrs );
+        num_thb_intrs = Dev_Start_Index (pi, thb_intrs);
+
+        pbond_ij = &(bonds->select.bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+
+        //TODO REMOVE THIS
+        //TODO REMOVE THIS
+        //TODO REMOVE THIS
+        //TODO REMOVE THIS
+        //TODO REMOVE THIS
+
+        if( BOA_ij/*bo_ij->BO*/ > 0.0 && 
+                ( j < n || pbond_ij->nbr < n ) ) {
+            //if( BOA_ij/*bo_ij->BO*/ > 0.0) {
+            i = pbond_ij->nbr;
+            r_ij = pbond_ij->d;     
+            type_i = my_atoms[i].type;
+            // fprintf( out_control->eval, "i: %d\n", i );
+
+
+            /* first copy 3-body intrs from previously computed ones where i>k.
+               in the second for-loop below, 
+               we compute only new 3-body intrs where i < k */
+
+            for( pk = start_j; pk < pi; ++pk ) {
+                // fprintf( out_control->eval, "pk: %d\n", pk );
+                start_pk = Dev_Start_Index( pk, thb_intrs );
+                end_pk = Dev_End_Index( pk, thb_intrs );
+
+                for( t = start_pk; t < end_pk; ++t )
+                    if( thb_intrs->select.three_body_list[t].thb == i ) {
+                        p_ijk = &(thb_intrs->select.three_body_list[num_thb_intrs] );
+                        p_kji = &(thb_intrs->select.three_body_list[t]);
+
+                        p_ijk->thb = bonds->select.bond_list[pk].nbr;
+                        p_ijk->pthb  = pk;
+                        p_ijk->theta = p_kji->theta;              
+                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+
+                        ++num_thb_intrs;
+                        break;
+                    }
+            }
+
+
+
+            /* and this is the second for loop mentioned above */
+            for( pk = pi+1; pk < end_j; ++pk ) {
+                //for( pk = start_j; pk < end_j; ++pk ) {
+                if (pk == pi) continue;
+                pbond_jk = &(bonds->select.bond_list[pk]);
+                bo_jk    = &(pbond_jk->bo_data);
+                BOA_jk   = bo_jk->BO - control->thb_cut;
+                k        = pbond_jk->nbr;
+                type_k   = my_atoms[k].type;
+                p_ijk    = &( thb_intrs->select.three_body_list[num_thb_intrs] );
+
+                //CHANGE ORIGINAL
+                if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
+                //if ((BOA_jk <= 0) ) continue;
+                //CHANGE ORIGINAL
+
+                Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
+                        pbond_jk->dvec, pbond_jk->d,
+                        &theta, &cos_theta );
+
+                Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
+                        pbond_jk->dvec, pbond_jk->d, 
+                        &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
+                        &(p_ijk->dcos_dk) );
+                p_ijk->thb = k;
+                p_ijk->pthb = pk;
+                p_ijk->theta = theta;
+
+                sin_theta = SIN( theta );
+                if( sin_theta < 1.0e-5 )
+                    sin_theta = 1.0e-5;
+
+                ++num_thb_intrs;
+
+
+                if( (j < n) && (BOA_jk > 0.0) && 
+                        (bo_ij->BO * bo_jk->BO > SQR(control->thb_cut)/*0*/) ) {
+                    r_jk = pbond_jk->d;              
+                    thbh = &( d_thbh[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
+
+                    /* if( system->my_atoms[i].orig_id < system->my_atoms[k].orig_id )
+                       fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                       system->my_atoms[i].orig_id, 
+                       system->my_atoms[j].orig_id, 
+                       system->my_atoms[k].orig_id,
+                       bo_ij->BO, bo_jk->BO, p_ijk->theta );
+                       else 
+                       fprintf( fval, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                       system->my_atoms[k].orig_id,
+                       system->my_atoms[j].orig_id, 
+                       system->my_atoms[i].orig_id, 
+                       bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+
+                    for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
+                        // fprintf( out_control->eval, "%6d%6d%6d -- exists in thbp\n", 
+                        //          i+1, j+1, k+1 );
+
+                        if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                            thbp = &( thbh->prm[cnt] );                 
+
+                            /* ANGLE ENERGY */
+                            p_val1 = thbp->p_val1;
+                            p_val2 = thbp->p_val2;
+                            p_val4 = thbp->p_val4;
+                            p_val7 = thbp->p_val7;
+                            theta_00 = thbp->theta_00;
+
+                            exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                            f7_ij = 1.0 - exp3ij;
+                            Cf7ij = p_val3 * p_val4 * POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+
+                            exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                            f7_jk = 1.0 - exp3jk;
+                            Cf7jk = p_val3 * p_val4 * POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+
+                            expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                            trm8 = 1.0 + expval6 + expval7;
+                            f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                            Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                ( p_val6 * expval6 * trm8 - 
+                                  (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7 ) );
+
+                            theta_0 = 180.0 - theta_00 * (1.0 - 
+                                    EXP(-p_val10 * (2.0 - SBO2)));
+                            theta_0 = DEG2RAD( theta_0 );              
+
+                            expval2theta  = EXP( -p_val2 * SQR(theta_0 - theta) );
+                            if( p_val1 >= 0 ) 
+                                expval12theta = p_val1 * (1.0 - expval2theta);
+                            else // To avoid linear Me-H-Me angles (6/6/06)
+                                expval12theta = p_val1 * -expval2theta;
+
+                            CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                            CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                            CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                            CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
+                                expval2theta * (theta_0 - theta);
+
+                            Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
+                                exp( -p_val10 * (2.0 - SBO2) );
+
+                            CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                            CEval6 = CEval5 * dSBO1;
+                            CEval7 = CEval5 * dSBO2;
+                            CEval8 = -CEval4 / sin_theta;
+
+                            data_e_ang [j] += e_ang = 
+                                f7_ij * f7_jk * f8_Dj * expval12theta;
+                            /* END ANGLE ENERGY*/
+
+
+                            /* PENALTY ENERGY */
+                            p_pen1 = thbp->p_pen1;
+                            p_pen2 = gp.l[19];
+                            p_pen3 = gp.l[20];
+                            p_pen4 = gp.l[21];
+
+                            exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                            exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                            exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                            exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                            trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                            f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                            Cf9j = ( -p_pen3 * exp_pen3 * trm_pen34 - 
+                                    (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 + 
+                                        p_pen4 * exp_pen4 ) ) / 
+                                SQR( trm_pen34 );
+
+                            data_e_pen [j] += e_pen = 
+                                p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+
+                            CEpen1 = e_pen * Cf9j / f9_Dj;
+                            temp   = -2.0 * p_pen2 * e_pen;
+                            CEpen2 = temp * (BOA_ij - 2.0);
+                            CEpen3 = temp * (BOA_jk - 2.0);
+                            /* END PENALTY ENERGY */
+
+
+                            /* COALITION ENERGY */
+                            p_coa1 = thbp->p_coa1;
+                            p_coa2 = gp.l[2];
+                            p_coa3 = gp.l[38];
+                            p_coa4 = gp.l[30];
+
+                            exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+                            data_e_coa [j] += e_coa = 
+                                p_coa1 / (1. + exp_coa2) *
+                                EXP( -p_coa3 * SQR(workspace->total_bond_order[i]-BOA_ij) ) *
+                                EXP( -p_coa3 * SQR(workspace->total_bond_order[k]-BOA_jk) ) *
+                                EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
+                                EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+
+                            CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                            CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                            CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
+                            CEcoa4 = -2 * p_coa3 * 
+                                (workspace->total_bond_order[i]-BOA_ij) * e_coa;
+                            CEcoa5 = -2 * p_coa3 * 
+                                (workspace->total_bond_order[k]-BOA_jk) * e_coa;
+                            /* END COALITION ENERGY */
+
+                            /* FORCES */
+                            /*
+                               bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+                               bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+                               workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
+                               workspace->CdDelta[i] += CEcoa4;
+                               workspace->CdDelta[k] += CEcoa5;              
+                             */
+                            bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
+                            bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
+                            workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
+                            pbond_ij->va_CdDelta += CEcoa4;
+                            pbond_jk->va_CdDelta += CEcoa5;
+
+
+                            for( t = start_j; t < end_j; ++t ) {
+                                pbond_jt = &( bonds->select.bond_list[t] );
+                                bo_jt = &(pbond_jt->bo_data);
+                                temp_bo_jt = bo_jt->BO;
+                                temp = CUBE( temp_bo_jt );
+                                pBOjt7 = temp * temp * temp_bo_jt; 
+
+                                // fprintf( out_control->eval, "%6d%12.8f\n", 
+                                // workspace->reverse_map[bonds->select.bond_list[t].nbr],
+                                // (CEval6 * pBOjt7) );
+
+                                bo_jt->Cdbo += (CEval6 * pBOjt7);
+                                bo_jt->Cdbopi += CEval5;
+                                bo_jt->Cdbopi2 += CEval5;
+                            }              
+
+
+                            if( control->virial == 0 ) {
+                                /*
+                                   rvec_ScaledAdd( workspace->f[i], CEval8, p_ijk->dcos_di );
+                                   rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+                                   rvec_ScaledAdd( workspace->f[k], CEval8, p_ijk->dcos_dk );
+                                 */
+
+                                rvec_ScaledAdd( pbond_ij->va_f, CEval8, p_ijk->dcos_di );
+                                rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( pbond_jk->va_f, CEval8, p_ijk->dcos_dk );
+                            }
+                            else {
+                                /* terms not related to bond order derivatives are
+                                   added directly into forces and pressure vector/tensor */
+                                rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                                //rvec_Add( workspace->f[i], force );
+                                rvec_Add( pbond_ij->va_f, force );
+                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                //rvec_Add( data->my_ext_press, ext_press );
+                                rvec_Add( my_ext_press [j], ext_press );
+
+                                rvec_ScaledAdd( workspace->f[j], CEval8, p_ijk->dcos_dj );
+
+                                rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                                //rvec_Add( workspace->f[k], force );
+                                rvec_Add( pbond_jk->va_f, force );
+                                rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                rvec_Add( my_ext_press [j], ext_press );
+                            }
 
 #ifdef TEST_ENERGY
-							/*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
-							  p_val3, p_val4, BOA_ij, BOA_jk );
-							  fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n",
-							  workspace->Delta_e[j], workspace->vlpex[j],
-							  dSBO1, dSBO2, vlpadj );
-							  fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
-							  f7_ij, f7_jk, f8_Dj, expval12theta );
-							  fprintf( out_control->eval, 
-							  "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-							  CEval1, CEval2, CEval3, CEval4, 
-							  CEval5, CEval6, CEval7, CEval8 );
-
-							  fprintf( out_control->eval, 
-							  "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-							  p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta,
-							  p_ijk->dcos_di[2]/sin_theta, 
-							  p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta,
-							  p_ijk->dcos_dj[2]/sin_theta, 
-							  p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta,
-							  p_ijk->dcos_dk[2]/sin_theta);
-
-							  fprintf( out_control->eval, 
-							  "%6d%6d%6d%15.8f%15.8f\n",
-							  system->my_atoms[i].orig_id, 
-							  system->my_atoms[j].orig_id, 
-							  system->my_atoms[k].orig_id,
-							  RAD2DEG(theta), e_ang );*/
-
-							fprintf( out_control->eval, 
-									//"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-									"%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-									system->my_atoms[i].orig_id, 
-									system->my_atoms[j].orig_id, 
-									system->my_atoms[k].orig_id,
-									RAD2DEG(theta), theta_0, BOA_ij, BOA_jk,
-									e_ang, data->my_en.e_ang );
-
-							fprintf( out_control->epen, 
-									//"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-									"%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-									system->my_atoms[i].orig_id,
-									system->my_atoms[j].orig_id,
-									system->my_atoms[k].orig_id,
-									RAD2DEG(theta), BOA_ij, BOA_jk, e_pen, 
-									data->my_en.e_pen );
-
-							fprintf( out_control->ecoa, 
-									//"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-									"%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
-									system->my_atoms[i].orig_id, 
-									system->my_atoms[j].orig_id, 
-									system->my_atoms[k].orig_id,
-									RAD2DEG(theta), BOA_ij, BOA_jk, 
-									e_coa, data->my_en.e_coa );
+                            /*fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
+                              p_val3, p_val4, BOA_ij, BOA_jk );
+                              fprintf(out_control->eval, "%13.8f%13.8f%13.8f%13.8f%13.8f\n",
+                              workspace->Delta_e[j], workspace->vlpex[j],
+                              dSBO1, dSBO2, vlpadj );
+                              fprintf( out_control->eval, "%12.8f%12.8f%12.8f%12.8f\n",
+                              f7_ij, f7_jk, f8_Dj, expval12theta );
+                              fprintf( out_control->eval, 
+                              "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                              CEval1, CEval2, CEval3, CEval4, 
+                              CEval5, CEval6, CEval7, CEval8 );
+
+                              fprintf( out_control->eval, 
+                              "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                              p_ijk->dcos_di[0]/sin_theta, p_ijk->dcos_di[1]/sin_theta,
+                              p_ijk->dcos_di[2]/sin_theta, 
+                              p_ijk->dcos_dj[0]/sin_theta, p_ijk->dcos_dj[1]/sin_theta,
+                              p_ijk->dcos_dj[2]/sin_theta, 
+                              p_ijk->dcos_dk[0]/sin_theta, p_ijk->dcos_dk[1]/sin_theta,
+                              p_ijk->dcos_dk[2]/sin_theta);
+
+                              fprintf( out_control->eval, 
+                              "%6d%6d%6d%15.8f%15.8f\n",
+                              system->my_atoms[i].orig_id, 
+                              system->my_atoms[j].orig_id, 
+                              system->my_atoms[k].orig_id,
+                              RAD2DEG(theta), e_ang );*/
+
+                            fprintf( out_control->eval, 
+                                    //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                                    "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                                    system->my_atoms[i].orig_id, 
+                                    system->my_atoms[j].orig_id, 
+                                    system->my_atoms[k].orig_id,
+                                    RAD2DEG(theta), theta_0, BOA_ij, BOA_jk,
+                                    e_ang, data->my_en.e_ang );
+
+                            fprintf( out_control->epen, 
+                                    //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                                    "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                                    system->my_atoms[i].orig_id,
+                                    system->my_atoms[j].orig_id,
+                                    system->my_atoms[k].orig_id,
+                                    RAD2DEG(theta), BOA_ij, BOA_jk, e_pen, 
+                                    data->my_en.e_pen );
+
+                            fprintf( out_control->ecoa, 
+                                    //"%6d%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                                    "%6d%6d%6d%12.4f%12.4f%12.4f%12.4f%12.4f\n",
+                                    system->my_atoms[i].orig_id, 
+                                    system->my_atoms[j].orig_id, 
+                                    system->my_atoms[k].orig_id,
+                                    RAD2DEG(theta), BOA_ij, BOA_jk, 
+                                    e_coa, data->my_en.e_coa );
 #endif
 
 #ifdef TEST_FORCES            /* angle forces */
-							Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-							Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-							Add_dDelta( system, lists, j, 
-									CEval3 + CEval7, workspace->f_ang );
-
-							for( t = start_j; t < end_j; ++t ) {
-								pbond_jt = &( bonds->select.bond_list[t] );
-								bo_jt = &(pbond_jt->bo_data);
-								temp_bo_jt = bo_jt->BO;
-								temp = CUBE( temp_bo_jt );
-								pBOjt7 = temp * temp * temp_bo_jt; 
-
-								Add_dBO( system, lists, j, t, pBOjt7 * CEval6, 
-										workspace->f_ang );
-								Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5, 
-										workspace->f_ang, workspace->f_ang );
-							}
-
-							rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-							rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-							rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-							/* end angle forces */
-
-							/* penalty forces */
-							Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-							Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-							Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-							/* end penalty forces */
-
-							/* coalition forces */
-							Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4, 
-									workspace->f_coa );
-							Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5, 
-									workspace->f_coa );
-							Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-							Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-							Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-							/* end coalition forces */
+                            Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                            Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                            Add_dDelta( system, lists, j, 
+                                    CEval3 + CEval7, workspace->f_ang );
+
+                            for( t = start_j; t < end_j; ++t ) {
+                                pbond_jt = &( bonds->select.bond_list[t] );
+                                bo_jt = &(pbond_jt->bo_data);
+                                temp_bo_jt = bo_jt->BO;
+                                temp = CUBE( temp_bo_jt );
+                                pBOjt7 = temp * temp * temp_bo_jt; 
+
+                                Add_dBO( system, lists, j, t, pBOjt7 * CEval6, 
+                                        workspace->f_ang );
+                                Add_dBOpinpi2( system, lists, j, t, CEval5, CEval5, 
+                                        workspace->f_ang, workspace->f_ang );
+                            }
+
+                            rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                            rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                            rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                            /* end angle forces */
+
+                            /* penalty forces */
+                            Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                            Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                            Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                            /* end penalty forces */
+
+                            /* coalition forces */
+                            Add_dBO( system, lists, j, pi, CEcoa1 - CEcoa4, 
+                                    workspace->f_coa );
+                            Add_dBO( system, lists, j, pk, CEcoa2 - CEcoa5, 
+                                    workspace->f_coa );
+                            Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                            Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                            Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                            /* end coalition forces */
 #endif
-						}
-					}
-				}
-			}
-			}
+                        }
+                    }
+                }
+            }
+            }
 
-			Dev_Set_End_Index(pi, num_thb_intrs, thb_intrs );
-		}
-		// } CUDA Commented
-	}
+            Dev_Set_End_Index(pi, num_thb_intrs, thb_intrs );
+        }
+        // } CUDA Commented
+    }
 
-	CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess (   reax_atom *atoms, control_params *control,
-			storage p_workspace, 
-			reax_list p_bonds, int N )
-	{
-		int i, pj;
+    CUDA_GLOBAL void Cuda_Valence_Angles_PostProcess (   reax_atom *atoms, control_params *control, /* atoms is referenced only in a commented-out line below; control is not referenced */
+            storage p_workspace, 
+            reax_list p_bonds, int N ) /* N: number of atom indices to process; threads with index >= N exit immediately */
+    { /* For atom i, adds the va_CdDelta and va_f stored on the sym_index partner of each of i's bond entries into workspace->CdDelta[i] and workspace->f[i]. */
+        int i, pj;
 
-		bond_data *pbond;
-		bond_data *sym_index_bond;
-		reax_list *bonds = &p_bonds;
-		storage *workspace = &p_workspace;
+        bond_data *pbond;
+        bond_data *sym_index_bond;
+        reax_list *bonds = &p_bonds; /* the by-value kernel arguments are accessed through local pointers */
+        storage *workspace = &p_workspace;
 
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if ( i >= N) return;
+        i = blockIdx.x * blockDim.x + threadIdx.x; /* flat global thread index, used as the atom index */
+        if ( i >= N) return; /* surplus threads do nothing */
 
-		for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){
+        for( pj = Dev_Start_Index(i, bonds); pj < Dev_End_Index(i, bonds); ++pj ){ /* walk the bond-list slots of atom i */
 
-			pbond = &(bonds->select.bond_list[pj]);
-			sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); 
+            pbond = &(bonds->select.bond_list[pj]);
+            sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); /* presumably the same bond as stored in the neighbour's list -- TODO confirm against the code that sets sym_index */
 
-			workspace->CdDelta [i] += sym_index_bond->va_CdDelta;
+            workspace->CdDelta [i] += sym_index_bond->va_CdDelta; /* fold the scalar stored on the partner entry into atom i's total */
 
-			//rvec_Add (atoms[i].f, sym_index_bond->va_f );
-			rvec_Add (workspace->f[i], sym_index_bond->va_f );
-		}
-	}
+            //rvec_Add (atoms[i].f, sym_index_bond->va_f );
+            rvec_Add (workspace->f[i], sym_index_bond->va_f ); /* fold the vector stored on the partner entry into atom i's force */
+        }
+    }
 
 
-	// THREE BODY ESTIMATION HERE
-	CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, 
-			control_params *control, 
-			reax_list p_bonds, 
-			int n, int N, 
-			int *count
-			)
-	{
-		int i, j, pi, k, pk, t;
-		int type_i, type_j, type_k;
-		int start_j, end_j;
-		int cnt, num_thb_intrs;
+    // THREE BODY ESTIMATION HERE
+    CUDA_GLOBAL void Estimate_Cuda_Valence_Angles( reax_atom *my_atoms, 
+            control_params *control, /* supplies thb_cut, the bond-order threshold used below */
+            reax_list p_bonds, /* bond list, passed by value */
+            int n, int N, /* threads cover j in [0, N); the i-j bond is considered only if j < n or its neighbour index < n (n vs. N presumably separates owned atoms from the full range -- TODO confirm) */
+            int *count /* output: one entry per bond-list slot */
+            )
+    { /* For every bond slot pi of central atom j, writes to count[pi] the number of other bonds pk of j whose bond order exceeds thb_cut; slots whose own bond fails the tests get 0. */
+        int i, j, pi, k, pk, t; /* k and t are never assigned in this kernel */
+        int type_i, type_j, type_k; /* type_k is never assigned; type_i and type_j are assigned but not read */
+        int start_j, end_j;
+        int cnt, num_thb_intrs; /* cnt is unused */
 
-		real r_ij, r_jk;
-		real BOA_ij, BOA_jk;
+        real r_ij, r_jk; /* r_jk is unused; r_ij is assigned but not read */
+        real BOA_ij, BOA_jk;
 
-		bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-		bond_order_data *bo_ij, *bo_jk, *bo_jt;
+        bond_data *pbond_ij, *pbond_jk, *pbond_jt; /* pbond_jt is unused */
+        bond_order_data *bo_ij, *bo_jk, *bo_jt; /* bo_jt is unused */
 
-		reax_list *bonds = &( p_bonds );
+        reax_list *bonds = &( p_bonds );
 
-		j = blockIdx.x * blockDim.x + threadIdx.x;
-		if (j >= N) return;
+        j = blockIdx.x * blockDim.x + threadIdx.x; /* flat global thread index, used as the central atom index */
+        if (j >= N) return; /* surplus threads do nothing */
 
-		type_j = my_atoms[j].type;
-		start_j = Dev_Start_Index(j, bonds);
-		end_j = Dev_End_Index(j, bonds);
+        type_j = my_atoms[j].type;
+        start_j = Dev_Start_Index(j, bonds); /* [start_j, end_j) bounds the loops over atom j's bond-list slots */
+        end_j = Dev_End_Index(j, bonds);
 
 
-		for( pi = start_j; pi < end_j; ++pi ) {
+        for( pi = start_j; pi < end_j; ++pi ) { /* pi: slot of the candidate i-j bond */
 
-			num_thb_intrs = 0;
-			count[ pi ] = 0;
+            num_thb_intrs = 0;
+            count[ pi ] = 0; /* provisional value; overwritten with the final tally at the end of this iteration */
 
-			pbond_ij = &(bonds->select.bond_list[pi]);
-			bo_ij = &(pbond_ij->bo_data);
-			BOA_ij = bo_ij->BO - control->thb_cut;
+            pbond_ij = &(bonds->select.bond_list[pi]);
+            bo_ij = &(pbond_ij->bo_data);
+            BOA_ij = bo_ij->BO - control->thb_cut; /* bond order in excess of the cutoff */
 
 
-			if( BOA_ij/*bo_ij->BO*/ > 0.0 && 
-					( j < n || pbond_ij->nbr < n ) ) {
-				//if( BOA_ij/*bo_ij->BO*/ > 0.0) {
-				i = pbond_ij->nbr;
-				r_ij = pbond_ij->d;	 
-				type_i = my_atoms[i].type;
+            if( BOA_ij/*bo_ij->BO*/ > 0.0 && 
+                    ( j < n || pbond_ij->nbr < n ) ) { /* require a bond above the cutoff with at least one endpoint index below n */
+                //if( BOA_ij/*bo_ij->BO*/ > 0.0) {
+                i = pbond_ij->nbr; /* i, r_ij and type_i are stored here but not used further in this kernel */
+                r_ij = pbond_ij->d;     
+                type_i = my_atoms[i].type;
 
 
-				for( pk = start_j; pk < end_j; ++pk ) {
-					if (pk == pi) continue;
+                for( pk = start_j; pk < end_j; ++pk ) { /* pk: slot of the candidate j-k bond */
+                    if (pk == pi) continue; /* the second bond must be a different slot from the first */
 
-					pbond_jk = &(bonds->select.bond_list[pk]);
-					bo_jk    = &(pbond_jk->bo_data);
-					BOA_jk   = bo_jk->BO - control->thb_cut;
+                    pbond_jk = &(bonds->select.bond_list[pk]);
+                    bo_jk    = &(pbond_jk->bo_data);
+                    BOA_jk   = bo_jk->BO - control->thb_cut;
 
-					//CHANGE ORIGINAL
-					//if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
-					if ((BOA_jk <= 0) ) continue;
-					//CHANGE ORIGINAL
+                    //CHANGE ORIGINAL
+                    //if ((BOA_jk <= 0) || ((j >= n) && (k >= n))) continue;
+                    if ((BOA_jk <= 0) ) continue; /* only the bond-order test is applied to the j-k bond; the index test of the commented-out line is not */
+                    //CHANGE ORIGINAL
 
-					++num_thb_intrs;
-				}
+                    ++num_thb_intrs;
+                }
 
-			}
+            }
 
-			count[ pi ] = num_thb_intrs;
-			}
-		}
+            count[ pi ] = num_thb_intrs; /* number of accepted j-k partners for slot pi (0 if the i-j bond was rejected) */
+            }
+        }
 
diff --git a/PG-PuReMD/src/dev_alloc.cu b/PG-PuReMD/src/dev_alloc.cu
index 72ae58e7..b0a76a21 100644
--- a/PG-PuReMD/src/dev_alloc.cu
+++ b/PG-PuReMD/src/dev_alloc.cu
@@ -7,403 +7,403 @@
 extern "C"
 {
 
-	int dev_alloc_control (control_params *control)
-	{
-		cuda_malloc ((void **)&control->d_control_params, sizeof (control_params), 1, "control_params");
-		copy_host_device (control, control->d_control_params, sizeof (control_params), cudaMemcpyHostToDevice, "control_params");
-	}
-
-	CUDA_GLOBAL void Init_Nbrs(ivec *nbrs, int N)
-	{
-		int index = blockIdx.x * blockDim.x + threadIdx.x;
-		if (index >= N) return;
-
-		nbrs[index][0] = -1; 
-		nbrs[index][1] = -1; 
-		nbrs[index][2] = -1; 
-	}
-
-
-	int dev_alloc_grid (reax_system *system)
-	{
-		int total;
-		grid_cell local_cell;
-		grid *host = &system->my_grid;
-		grid *device = &system->d_my_grid;
-		ivec *nbrs_x = (ivec *) scratch;
-
-		total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-		ivec_Copy (device->ncells, host->ncells);
-		rvec_Copy (device->cell_len, host->cell_len);
-		rvec_Copy (device->inv_len, host->inv_len);
-
-		ivec_Copy (device->bond_span, host->bond_span );
-		ivec_Copy (device->nonb_span, host->nonb_span );
-		ivec_Copy (device->vlist_span, host->vlist_span );
-
-		ivec_Copy (device->native_cells, host->native_cells );
-		ivec_Copy (device->native_str, host->native_str );
-		ivec_Copy (device->native_end, host->native_end );
-
-		device->ghost_cut = host->ghost_cut;
-		ivec_Copy (device->ghost_span, host->ghost_span );
-		ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span );
-		ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span );
-		ivec_Copy (device->ghost_bond_span, host->ghost_bond_span );
-
-		cuda_malloc ((void **) &device->str, sizeof (int) * total, 1, "grid:str");
-		cuda_malloc ((void **) &device->end, sizeof (int) * total, 1, "grid:end");
-		cuda_malloc ((void **) &device->cutoff, sizeof (real) * total, 1, "grid:cutoff");
-		cuda_malloc ((void **) &device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, 1, "grid:nbrs_x");
-		cuda_malloc ((void **) &device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, 1, "grid:nbrs_cp");
-		cuda_malloc ((void **) &device->rel_box, sizeof (ivec) * total, 1, "grid:rel_box");
-
-		/*
-		   int block_size = 512;
-		   int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); 
-
-		   Init_Nbrs <<<blocks, block_size>>>
-		   (nbrs_x, host->max_nbrs );
-		   cudaThreadSynchronize (); 
-		   cudaCheckError ();
-
-		   cuda_malloc ((void **)& device->cells, 
-		   sizeof (grid_cell) * total, 
-		   1, "grid:cells");
-		   fprintf (stderr, " Device cells address --> %ld \n", device->cells );
-		   cuda_malloc ((void **) &device->order, sizeof (ivec) * (host->total + 1), 1, "grid:order");
-
-		   local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0;
-		   fprintf (stderr, "Total cells to be allocated -- > %d \n", total );
-		   for (int i = 0; i < total; i++) {
-		//fprintf (stderr, "Address of the local atom -> %ld  \n", &local_cell);
-
-		cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * host->max_atoms, 
-		1, "alloc:grid:cells:atoms");
-		//fprintf (stderr, "Allocated address of the atoms --> %ld  (%d)\n", local_cell.atoms, host->max_atoms );
-
-		cuda_malloc ((void **) &local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, 
-		1, "alloc:grid:cells:nbrs_x" );
-		copy_device (local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof (ivec), "grid:nbrs_x");	
-		//fprintf (stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x);
-
-		cuda_malloc ((void **) &local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, 
-		1, "alloc:grid:cells:nbrs_cp" );
-		//fprintf (stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp);
-
-		//cuda_malloc ((void **) &local_cell.nbrs, sizeof (grid_cell *) * host->max_nbrs , 
-		//				1, "alloc:grid:cells:nbrs" );
-		//fprintf (stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs);
-
-		copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-alloc");
-		}
-		 */
-
-		return SUCCESS;
-	}
-
-	int dev_dealloc_grid_cell_atoms (reax_system *system)
-	{
-		int total;
-		grid_cell local_cell;
-		grid *host = &system->my_grid;
-		grid *device = &system->d_my_grid;
-
-		total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-
-
-		for (int i = 0; i < total; i++) {
-			copy_host_device (&local_cell, &device->cells[i], 
-					sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc");
-			cuda_free (local_cell.atoms, "grid_cell:atoms" );
-		}
-	}
-
-	int dev_alloc_grid_cell_atoms (reax_system *system, int cap)
-	{
-		int total;
-		grid_cell local_cell;
-		grid *host = &system->my_grid;
-		grid *device = &system->d_my_grid;
-
-		total = host->ncells[0] * host->ncells[1] * host->ncells[2];
-
-		for (int i = 0; i < total; i++) {
-			copy_host_device (&local_cell, &device->cells[i], 
-					sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc");
-			cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * cap, 
-					1, "realloc:grid:cells:atoms");
-			copy_host_device (&local_cell, &device->cells[i], 
-					sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc");
-		}
-	}
-
-
-	int dev_alloc_system (reax_system *system)
-	{
-		cuda_malloc ( (void **) &system->d_my_atoms, system->total_cap * sizeof (reax_atom), 1, "system:d_my_atoms");  
-		//fprintf (stderr, "p:%d - allocated atoms : %d (%ld, %ld) \n", system->my_rank, system->total_cap, 
-		//																					system->my_atoms, system->d_my_atoms);
-
-		//simulation boxes
-		cuda_malloc ( (void **) &system->d_big_box, sizeof (simulation_box), 1, "system:d_big_box");
-		cuda_malloc ( (void **) &system->d_my_box, sizeof (simulation_box), 1, "system:d_my_box");
-		cuda_malloc ( (void **) &system->d_my_ext_box, sizeof (simulation_box), 1, "d_my_ext_box");
-
-		//interaction parameters
-		cuda_malloc ((void **) &system->reax_param.d_sbp, system->reax_param.num_atom_types * sizeof (single_body_parameters),
-				1, "system:d_sbp");
-
-		cuda_malloc ((void **) &system->reax_param.d_tbp, pow (system->reax_param.num_atom_types, 2) * sizeof (two_body_parameters), 
-				1, "system:d_tbp");
-
-		cuda_malloc ((void **) &system->reax_param.d_thbp, pow (system->reax_param.num_atom_types, 3) * sizeof (three_body_header),
-				1, "system:d_thbp");
-
-		cuda_malloc ((void **) &system->reax_param.d_hbp, pow (system->reax_param.num_atom_types, 3) * sizeof (hbond_parameters),
-				1, "system:d_hbp");
-
-		cuda_malloc ((void **) &system->reax_param.d_fbp, pow (system->reax_param.num_atom_types, 4) * sizeof (four_body_header),
-				1, "system:d_fbp");
-
-		cuda_malloc ((void **) &system->reax_param.d_gp.l, system->reax_param.gp.n_global * sizeof (real), 1, "system:d_gp.l");
-
-		system->reax_param.d_gp.n_global = 0;
-		system->reax_param.d_gp.vdw_type = 0;
-
-		return SUCCESS;
-	}
-
-	int dev_realloc_system (reax_system *system, int local_cap, int total_cap, char *msg)
-	{
-		//free the existing storage for atoms
-		cuda_free (system->d_my_atoms, "system:d_my_atoms");
-
-		cuda_malloc ((void **) &system->d_my_atoms, sizeof (reax_atom) * total_cap, 
-				1, "system:d_my_atoms");
-		return FAILURE;
-	}
-
-
-	int dev_alloc_simulation_data(simulation_data *data)
-	{
-		cuda_malloc ((void **) &(data->d_simulation_data), sizeof (simulation_data), 1, "simulation_data");
-		return SUCCESS;
-	}
-
-	int dev_alloc_workspace (reax_system *system, control_params *control, 
-			storage *workspace, int local_cap, int total_cap, 
-			char *msg)
-	{
-		int i, total_real, total_rvec, local_int, local_real, local_rvec;
-
-		workspace->allocated = 1;
-		total_real = total_cap * sizeof(real);
-		total_rvec = total_cap * sizeof(rvec);
-		local_int = local_cap * sizeof(int);
-		local_real = local_cap * sizeof(real);
-		local_rvec = local_cap * sizeof(rvec);
-
-		/* communication storage */  
-		/*
-		   workspace->tmp_dbl = NULL;
-		   workspace->tmp_rvec = NULL;
-		   workspace->tmp_rvec2 = NULL;
-		 */
-
-		//fprintf (stderr, "Deltap and TOTAL BOND ORDER size --> %d \n", total_cap );
-
-		/* bond order related storage  */
-		cuda_malloc ((void **) &workspace->within_bond_box, total_cap * sizeof (int), 1, "skin");
-		cuda_malloc ((void **) &workspace->total_bond_order, total_real, 1, "total_bo");
-		cuda_malloc ((void **) &workspace->Deltap, total_real, 1, "Deltap");
-		cuda_malloc ((void **) &workspace->Deltap_boc, total_real, 1, "Deltap_boc");
-		cuda_malloc ((void **) &workspace->dDeltap_self, total_rvec, 1, "dDeltap_self");
-		cuda_malloc ((void **) &workspace->Delta, total_real, 1, "Delta" );
-		cuda_malloc ((void **) &workspace->Delta_lp, total_real, 1, "Delta_lp" );
-		cuda_malloc ((void **) &workspace->Delta_lp_temp, total_real, 1, "Delta_lp_temp" );
-		cuda_malloc ((void **) &workspace->dDelta_lp, total_real, 1, "Delta_lp_temp" );
-		cuda_malloc ((void **) &workspace->dDelta_lp_temp, total_real, 1, "dDelta_lp_temp" );
-		cuda_malloc ((void **) &workspace->Delta_e, total_real, 1, "Delta_e" );
-		cuda_malloc ((void **) &workspace->Delta_boc, total_real, 1, "Delta_boc");
-		cuda_malloc ((void **) &workspace->nlp, total_real, 1, "nlp");
-		cuda_malloc ((void **) &workspace->nlp_temp, total_real, 1, "nlp_temp");
-		cuda_malloc ((void **) &workspace->Clp, total_real, 1, "Clp");
-		cuda_malloc ((void **) &workspace->vlpex, total_real, 1, "vlpex");
-		cuda_malloc ((void **) &workspace->bond_mark, total_real, 1, "bond_mark");
-		cuda_malloc ((void **) &workspace->done_after, total_real, 1, "done_after");
-
-
-		/* QEq storage */
-		cuda_malloc ((void **) &workspace->Hdia_inv, total_cap * sizeof (real), 1, "Hdia_inv");
-		cuda_malloc ((void **) &workspace->b_s, total_cap * sizeof (real), 1, "b_s");
-		cuda_malloc ((void **) &workspace->b_t, total_cap * sizeof (real), 1, "b_t");
-		cuda_malloc ((void **) &workspace->b_prc, total_cap * sizeof (real), 1, "b_prc");
-		cuda_malloc ((void **) &workspace->b_prm, total_cap * sizeof (real), 1, "b_prm");
-		cuda_malloc ((void **) &workspace->s, total_cap * sizeof (real), 1, "s");
-		cuda_malloc ((void **) &workspace->t, total_cap * sizeof (real), 1, "t");
-		cuda_malloc ((void **) &workspace->droptol, total_cap * sizeof (real), 1, "droptol");
-		cuda_malloc ((void **) &workspace->b, total_cap * sizeof (rvec2), 1, "b");
-		cuda_malloc ((void **) &workspace->x, total_cap * sizeof (rvec2), 1, "x");
-
-		/* GMRES storage */
-		cuda_malloc ((void **) &workspace->y, (RESTART+1)*sizeof (real), 1, "y");
-		cuda_malloc ((void **) &workspace->z, (RESTART+1)*sizeof (real), 1, "z");
-		cuda_malloc ((void **) &workspace->g, (RESTART+1)*sizeof (real), 1, "g");
-		cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "h");
-		cuda_malloc ((void **) &workspace->hs, (RESTART+1)*sizeof (real), 1, "hs");
-		cuda_malloc ((void **) &workspace->hc, (RESTART+1)*sizeof (real), 1, "hc");
-		cuda_malloc ((void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "v");
-
-		/* CG storage */
-		cuda_malloc ((void **) &workspace->r, total_cap * sizeof (real), 1,  "r");
-		cuda_malloc ((void **) &workspace->d, total_cap * sizeof (real), 1, "d");
-		cuda_malloc ((void **) &workspace->q, total_cap * sizeof (real), 1, "q");
-		cuda_malloc ((void **) &workspace->p, total_cap * sizeof (real), 1, "p");
-		cuda_malloc ((void **) &workspace->r2, total_cap * sizeof (rvec2), 1, "r2");
-		cuda_malloc ((void **) &workspace->d2, total_cap * sizeof (rvec2), 1, "d2");
-		cuda_malloc ((void **) &workspace->q2, total_cap * sizeof (rvec2), 1, "q2");
-		cuda_malloc ((void **) &workspace->p2, total_cap * sizeof (rvec2), 1, "p2");
-
-		/* integrator storage */
-		cuda_malloc ((void **) &workspace->v_const, local_rvec, 1, "v_const");
-
-		/* storage for analysis */
-		if( control->molecular_analysis || control->diffusion_coef ) {
-			cuda_malloc ((void **) &workspace->mark, local_cap * sizeof (int), 1, "mark");
-			cuda_malloc ((void **) &workspace->old_mark, local_cap * sizeof (int), 1, "old_mark");
-		}
-		else
-			workspace->mark = workspace->old_mark = NULL;
-
-		if( control->diffusion_coef )
-			cuda_malloc ((void **) &workspace->x_old, local_cap * sizeof (rvec), 1, "x_old");
-		else
-			workspace->x_old = NULL;
-
-		/* force related storage */
-		cuda_malloc ((void **) &workspace->f, total_cap * sizeof (rvec), 1, "f");
-		cuda_malloc ((void **) &workspace->CdDelta, total_cap * sizeof (rvec), 1, "CdDelta");
-
-		/* Taper params */
-		cuda_malloc ((void **) &workspace->Tap, 8 * sizeof (real), 1, "Tap");
-
-		return SUCCESS;
-	}
-
-	int dev_dealloc_workspace (reax_system *system, control_params *control, 
-			storage *workspace, int local_cap, int total_cap, 
-			char *msg)
-	{
-		/* communication storage */  
-		/*
-		   workspace->tmp_dbl = NULL;
-		   workspace->tmp_rvec = NULL;
-		   workspace->tmp_rvec2 = NULL;
-		 */
-
-		/* bond order related storage  */
-		cuda_free (workspace->within_bond_box, "skin");
-		cuda_free (workspace->total_bond_order, "total_bo");
-		cuda_free (workspace->Deltap, "Deltap");
-		cuda_free (workspace->Deltap_boc, "Deltap_boc");
-		cuda_free (workspace->dDeltap_self, "dDeltap_self");
-		cuda_free (workspace->Delta, "Delta" );
-		cuda_free (workspace->Delta_lp, "Delta_lp" );
-		cuda_free (workspace->Delta_lp_temp, "Delta_lp_temp" );
-		cuda_free (workspace->dDelta_lp, "Delta_lp_temp" );
-		cuda_free (workspace->dDelta_lp_temp, "dDelta_lp_temp" );
-		cuda_free (workspace->Delta_e, "Delta_e" );
-		cuda_free (workspace->Delta_boc, "Delta_boc");
-		cuda_free (workspace->nlp, "nlp");
-		cuda_free (workspace->nlp_temp, "nlp_temp");
-		cuda_free (workspace->Clp, "Clp");
-		cuda_free (workspace->vlpex, "vlpex");
-		cuda_free (workspace->bond_mark, "bond_mark");
-		cuda_free (workspace->done_after, "done_after");
-
-		/* QEq storage */
-		cuda_free (workspace->Hdia_inv, "Hdia_inv");
-		cuda_free (workspace->b_s, "b_s");
-		cuda_free (workspace->b_t, "b_t");
-		cuda_free (workspace->b_prc, "b_prc");
-		cuda_free (workspace->b_prm, "b_prm");
-		cuda_free (workspace->s, "s");
-		cuda_free (workspace->t, "t");
-		cuda_free (workspace->droptol, "droptol");
-		cuda_free (workspace->b, "b");
-		cuda_free (workspace->x, "x");
-
-		/* GMRES storage */
-		cuda_free (workspace->y, "y");
-		cuda_free (workspace->z, "z");
-		cuda_free (workspace->g, "g");
-		cuda_free (workspace->h, "h");
-		cuda_free (workspace->hs, "hs");
-		cuda_free (workspace->hc, "hc");
-		cuda_free (workspace->v, "v");
-
-		/* CG storage */
-		cuda_free (workspace->r, "r");
-		cuda_free (workspace->d, "d");
-		cuda_free (workspace->q, "q");
-		cuda_free (workspace->p, "p");
-		cuda_free (workspace->r2, "r2");
-		cuda_free (workspace->d2, "d2");
-		cuda_free (workspace->q2, "q2");
-		cuda_free (workspace->p2, "p2");
-
-		/* integrator storage */
-		cuda_free (workspace->v_const, "v_const");
-
-		/* storage for analysis */
-		if( control->molecular_analysis || control->diffusion_coef ) {
-			cuda_free (workspace->mark, "mark");
-			cuda_free (workspace->old_mark, "old_mark");
-		}
-		else
-			workspace->mark = workspace->old_mark = NULL;
-
-		if( control->diffusion_coef )
-			cuda_free (workspace->x_old, "x_old");
-		else
-			workspace->x_old = NULL;
-
-		/* force related storage */
-		cuda_free (workspace->f, "f");
-		cuda_free (workspace->CdDelta, "CdDelta");
-
-		/* Taper params */
-		cuda_free (workspace->Tap, "Tap");
-
-		return FAILURE;
-	}
-
-
-
-
-	int dev_alloc_matrix (sparse_matrix *H, int cap, int m)
-	{
-		//sparse_matrix *H;
-		//H = *pH;
-
-		H->cap = cap;
-		H->m = m;
-		cuda_malloc ((void **) &H->start, sizeof (int) * cap, 1, "matrix_start");
-		cuda_malloc ((void **) &H->end, sizeof (int) * cap, 1, "matrix_end");
-		cuda_malloc ((void **) &H->entries, sizeof (sparse_matrix_entry) * m, 1, "matrix_entries");
-
-		return SUCCESS;
-	}
-
-	int dev_dealloc_matrix (sparse_matrix *H)
-	{
-		cuda_free (H->start, "matrix_start");
-		cuda_free (H->end, "matrix_end");
-		cuda_free (H->entries, "matrix_entries");
-
-		return SUCCESS;
-	}
+    int dev_alloc_control (control_params *control)
+    {
+        cuda_malloc ((void **)&control->d_control_params, sizeof (control_params), 1, "control_params");
+        copy_host_device (control, control->d_control_params, sizeof (control_params), cudaMemcpyHostToDevice, "control_params");
+    }
+
+    CUDA_GLOBAL void Init_Nbrs(ivec *nbrs, int N)
+    {
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        if (index >= N) return;
+
+        nbrs[index][0] = -1; 
+        nbrs[index][1] = -1; 
+        nbrs[index][2] = -1; 
+    }
+
+
+    int dev_alloc_grid (reax_system *system)
+    {
+        int total;
+        grid_cell local_cell;
+        grid *host = &system->my_grid;
+        grid *device = &system->d_my_grid;
+        ivec *nbrs_x = (ivec *) scratch;
+
+        total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+        ivec_Copy (device->ncells, host->ncells);
+        rvec_Copy (device->cell_len, host->cell_len);
+        rvec_Copy (device->inv_len, host->inv_len);
+
+        ivec_Copy (device->bond_span, host->bond_span );
+        ivec_Copy (device->nonb_span, host->nonb_span );
+        ivec_Copy (device->vlist_span, host->vlist_span );
+
+        ivec_Copy (device->native_cells, host->native_cells );
+        ivec_Copy (device->native_str, host->native_str );
+        ivec_Copy (device->native_end, host->native_end );
+
+        device->ghost_cut = host->ghost_cut;
+        ivec_Copy (device->ghost_span, host->ghost_span );
+        ivec_Copy (device->ghost_nonb_span, host->ghost_nonb_span );
+        ivec_Copy (device->ghost_hbond_span, host->ghost_hbond_span );
+        ivec_Copy (device->ghost_bond_span, host->ghost_bond_span );
+
+        cuda_malloc ((void **) &device->str, sizeof (int) * total, 1, "grid:str");
+        cuda_malloc ((void **) &device->end, sizeof (int) * total, 1, "grid:end");
+        cuda_malloc ((void **) &device->cutoff, sizeof (real) * total, 1, "grid:cutoff");
+        cuda_malloc ((void **) &device->nbrs_x, sizeof (ivec) * total * host->max_nbrs, 1, "grid:nbrs_x");
+        cuda_malloc ((void **) &device->nbrs_cp, sizeof (rvec) * total * host->max_nbrs, 1, "grid:nbrs_cp");
+        cuda_malloc ((void **) &device->rel_box, sizeof (ivec) * total, 1, "grid:rel_box");
+
+        /*
+           int block_size = 512;
+           int blocks = (host->max_nbrs) / block_size + ((host->max_nbrs) % block_size == 0 ? 0 : 1); 
+
+           Init_Nbrs <<<blocks, block_size>>>
+           (nbrs_x, host->max_nbrs );
+           cudaThreadSynchronize (); 
+           cudaCheckError ();
+
+           cuda_malloc ((void **)& device->cells, 
+           sizeof (grid_cell) * total, 
+           1, "grid:cells");
+           fprintf (stderr, " Device cells address --> %ld \n", device->cells );
+           cuda_malloc ((void **) &device->order, sizeof (ivec) * (host->total + 1), 1, "grid:order");
+
+           local_cell.top = local_cell.mark = local_cell.str = local_cell.end = 0;
+           fprintf (stderr, "Total cells to be allocated -- > %d \n", total );
+           for (int i = 0; i < total; i++) {
+        //fprintf (stderr, "Address of the local atom -> %ld  \n", &local_cell);
+
+        cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * host->max_atoms, 
+        1, "alloc:grid:cells:atoms");
+        //fprintf (stderr, "Allocated address of the atoms --> %ld  (%d)\n", local_cell.atoms, host->max_atoms );
+
+        cuda_malloc ((void **) &local_cell.nbrs_x, sizeof (ivec) * host->max_nbrs, 
+        1, "alloc:grid:cells:nbrs_x" );
+        copy_device (local_cell.nbrs_x, nbrs_x, host->max_nbrs * sizeof (ivec), "grid:nbrs_x");    
+        //fprintf (stderr, "Allocated address of the nbrs_x--> %ld \n", local_cell.nbrs_x);
+
+        cuda_malloc ((void **) &local_cell.nbrs_cp, sizeof (rvec) * host->max_nbrs, 
+        1, "alloc:grid:cells:nbrs_cp" );
+        //fprintf (stderr, "Allocated address of the nbrs_cp--> %ld \n", local_cell.nbrs_cp);
+
+        //cuda_malloc ((void **) &local_cell.nbrs, sizeof (grid_cell *) * host->max_nbrs , 
+        //                1, "alloc:grid:cells:nbrs" );
+        //fprintf (stderr, "Allocated address of the nbrs--> %ld \n", local_cell.nbrs);
+
+        copy_host_device (&local_cell, &device->cells[i], sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-alloc");
+        }
+         */
+
+        return SUCCESS;
+    }
+
+    int dev_dealloc_grid_cell_atoms (reax_system *system)
+    {
+        int total;
+        grid_cell local_cell;
+        grid *host = &system->my_grid;
+        grid *device = &system->d_my_grid;
+
+        total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+
+
+        for (int i = 0; i < total; i++) {
+            copy_host_device (&local_cell, &device->cells[i], 
+                    sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc");
+            cuda_free (local_cell.atoms, "grid_cell:atoms" );
+        }
+    }
+
+    int dev_alloc_grid_cell_atoms (reax_system *system, int cap)
+    {
+        int total;
+        grid_cell local_cell;
+        grid *host = &system->my_grid;
+        grid *device = &system->d_my_grid;
+
+        total = host->ncells[0] * host->ncells[1] * host->ncells[2];
+
+        for (int i = 0; i < total; i++) {
+            copy_host_device (&local_cell, &device->cells[i], 
+                    sizeof (grid_cell), cudaMemcpyDeviceToHost, "grid:cell-dealloc");
+            cuda_malloc ((void **) &local_cell.atoms, sizeof (int) * cap, 
+                    1, "realloc:grid:cells:atoms");
+            copy_host_device (&local_cell, &device->cells[i], 
+                    sizeof (grid_cell), cudaMemcpyHostToDevice, "grid:cell-realloc");
+        }
+    }
+
+
+    int dev_alloc_system (reax_system *system)
+    {
+        cuda_malloc ( (void **) &system->d_my_atoms, system->total_cap * sizeof (reax_atom), 1, "system:d_my_atoms");  
+        //fprintf (stderr, "p:%d - allocated atoms : %d (%ld, %ld) \n", system->my_rank, system->total_cap, 
+        //                                                                                    system->my_atoms, system->d_my_atoms);
+
+        //simulation boxes
+        cuda_malloc ( (void **) &system->d_big_box, sizeof (simulation_box), 1, "system:d_big_box");
+        cuda_malloc ( (void **) &system->d_my_box, sizeof (simulation_box), 1, "system:d_my_box");
+        cuda_malloc ( (void **) &system->d_my_ext_box, sizeof (simulation_box), 1, "d_my_ext_box");
+
+        //interaction parameters
+        cuda_malloc ((void **) &system->reax_param.d_sbp, system->reax_param.num_atom_types * sizeof (single_body_parameters),
+                1, "system:d_sbp");
+
+        cuda_malloc ((void **) &system->reax_param.d_tbp, pow (system->reax_param.num_atom_types, 2) * sizeof (two_body_parameters), 
+                1, "system:d_tbp");
+
+        cuda_malloc ((void **) &system->reax_param.d_thbp, pow (system->reax_param.num_atom_types, 3) * sizeof (three_body_header),
+                1, "system:d_thbp");
+
+        cuda_malloc ((void **) &system->reax_param.d_hbp, pow (system->reax_param.num_atom_types, 3) * sizeof (hbond_parameters),
+                1, "system:d_hbp");
+
+        cuda_malloc ((void **) &system->reax_param.d_fbp, pow (system->reax_param.num_atom_types, 4) * sizeof (four_body_header),
+                1, "system:d_fbp");
+
+        cuda_malloc ((void **) &system->reax_param.d_gp.l, system->reax_param.gp.n_global * sizeof (real), 1, "system:d_gp.l");
+
+        system->reax_param.d_gp.n_global = 0;
+        system->reax_param.d_gp.vdw_type = 0;
+
+        return SUCCESS;
+    }
+
+    int dev_realloc_system (reax_system *system, int local_cap, int total_cap, char *msg)
+    {
+        //free the existing storage for atoms
+        cuda_free (system->d_my_atoms, "system:d_my_atoms");
+
+        cuda_malloc ((void **) &system->d_my_atoms, sizeof (reax_atom) * total_cap, 
+                1, "system:d_my_atoms");
+        return FAILURE;
+    }
+
+
+    int dev_alloc_simulation_data(simulation_data *data)
+    {
+        cuda_malloc ((void **) &(data->d_simulation_data), sizeof (simulation_data), 1, "simulation_data");
+        return SUCCESS;
+    }
+
+    int dev_alloc_workspace (reax_system *system, control_params *control, 
+            storage *workspace, int local_cap, int total_cap, 
+            char *msg)
+    {
+        int i, total_real, total_rvec, local_int, local_real, local_rvec;
+
+        workspace->allocated = 1;
+        total_real = total_cap * sizeof(real);
+        total_rvec = total_cap * sizeof(rvec);
+        local_int = local_cap * sizeof(int);
+        local_real = local_cap * sizeof(real);
+        local_rvec = local_cap * sizeof(rvec);
+
+        /* communication storage */  
+        /*
+           workspace->tmp_dbl = NULL;
+           workspace->tmp_rvec = NULL;
+           workspace->tmp_rvec2 = NULL;
+         */
+
+        //fprintf (stderr, "Deltap and TOTAL BOND ORDER size --> %d \n", total_cap );
+
+        /* bond order related storage  */
+        cuda_malloc ((void **) &workspace->within_bond_box, total_cap * sizeof (int), 1, "skin");
+        cuda_malloc ((void **) &workspace->total_bond_order, total_real, 1, "total_bo");
+        cuda_malloc ((void **) &workspace->Deltap, total_real, 1, "Deltap");
+        cuda_malloc ((void **) &workspace->Deltap_boc, total_real, 1, "Deltap_boc");
+        cuda_malloc ((void **) &workspace->dDeltap_self, total_rvec, 1, "dDeltap_self");
+        cuda_malloc ((void **) &workspace->Delta, total_real, 1, "Delta" );
+        cuda_malloc ((void **) &workspace->Delta_lp, total_real, 1, "Delta_lp" );
+        cuda_malloc ((void **) &workspace->Delta_lp_temp, total_real, 1, "Delta_lp_temp" );
+        cuda_malloc ((void **) &workspace->dDelta_lp, total_real, 1, "Delta_lp_temp" );
+        cuda_malloc ((void **) &workspace->dDelta_lp_temp, total_real, 1, "dDelta_lp_temp" );
+        cuda_malloc ((void **) &workspace->Delta_e, total_real, 1, "Delta_e" );
+        cuda_malloc ((void **) &workspace->Delta_boc, total_real, 1, "Delta_boc");
+        cuda_malloc ((void **) &workspace->nlp, total_real, 1, "nlp");
+        cuda_malloc ((void **) &workspace->nlp_temp, total_real, 1, "nlp_temp");
+        cuda_malloc ((void **) &workspace->Clp, total_real, 1, "Clp");
+        cuda_malloc ((void **) &workspace->vlpex, total_real, 1, "vlpex");
+        cuda_malloc ((void **) &workspace->bond_mark, total_real, 1, "bond_mark");
+        cuda_malloc ((void **) &workspace->done_after, total_real, 1, "done_after");
+
+
+        /* QEq storage */
+        cuda_malloc ((void **) &workspace->Hdia_inv, total_cap * sizeof (real), 1, "Hdia_inv");
+        cuda_malloc ((void **) &workspace->b_s, total_cap * sizeof (real), 1, "b_s");
+        cuda_malloc ((void **) &workspace->b_t, total_cap * sizeof (real), 1, "b_t");
+        cuda_malloc ((void **) &workspace->b_prc, total_cap * sizeof (real), 1, "b_prc");
+        cuda_malloc ((void **) &workspace->b_prm, total_cap * sizeof (real), 1, "b_prm");
+        cuda_malloc ((void **) &workspace->s, total_cap * sizeof (real), 1, "s");
+        cuda_malloc ((void **) &workspace->t, total_cap * sizeof (real), 1, "t");
+        cuda_malloc ((void **) &workspace->droptol, total_cap * sizeof (real), 1, "droptol");
+        cuda_malloc ((void **) &workspace->b, total_cap * sizeof (rvec2), 1, "b");
+        cuda_malloc ((void **) &workspace->x, total_cap * sizeof (rvec2), 1, "x");
+
+        /* GMRES storage */
+        cuda_malloc ((void **) &workspace->y, (RESTART+1)*sizeof (real), 1, "y");
+        cuda_malloc ((void **) &workspace->z, (RESTART+1)*sizeof (real), 1, "z");
+        cuda_malloc ((void **) &workspace->g, (RESTART+1)*sizeof (real), 1, "g");
+        cuda_malloc ((void **) &workspace->h, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "h");
+        cuda_malloc ((void **) &workspace->hs, (RESTART+1)*sizeof (real), 1, "hs");
+        cuda_malloc ((void **) &workspace->hc, (RESTART+1)*sizeof (real), 1, "hc");
+        cuda_malloc ((void **) &workspace->v, (RESTART+1)*(RESTART+1)*sizeof (real), 1, "v");
+
+        /* CG storage */
+        cuda_malloc ((void **) &workspace->r, total_cap * sizeof (real), 1,  "r");
+        cuda_malloc ((void **) &workspace->d, total_cap * sizeof (real), 1, "d");
+        cuda_malloc ((void **) &workspace->q, total_cap * sizeof (real), 1, "q");
+        cuda_malloc ((void **) &workspace->p, total_cap * sizeof (real), 1, "p");
+        cuda_malloc ((void **) &workspace->r2, total_cap * sizeof (rvec2), 1, "r2");
+        cuda_malloc ((void **) &workspace->d2, total_cap * sizeof (rvec2), 1, "d2");
+        cuda_malloc ((void **) &workspace->q2, total_cap * sizeof (rvec2), 1, "q2");
+        cuda_malloc ((void **) &workspace->p2, total_cap * sizeof (rvec2), 1, "p2");
+
+        /* integrator storage */
+        cuda_malloc ((void **) &workspace->v_const, local_rvec, 1, "v_const");
+
+        /* storage for analysis */
+        if( control->molecular_analysis || control->diffusion_coef ) {
+            cuda_malloc ((void **) &workspace->mark, local_cap * sizeof (int), 1, "mark");
+            cuda_malloc ((void **) &workspace->old_mark, local_cap * sizeof (int), 1, "old_mark");
+        }
+        else
+            workspace->mark = workspace->old_mark = NULL;
+
+        if( control->diffusion_coef )
+            cuda_malloc ((void **) &workspace->x_old, local_cap * sizeof (rvec), 1, "x_old");
+        else
+            workspace->x_old = NULL;
+
+        /* force related storage */
+        cuda_malloc ((void **) &workspace->f, total_cap * sizeof (rvec), 1, "f");
+        cuda_malloc ((void **) &workspace->CdDelta, total_cap * sizeof (rvec), 1, "CdDelta");
+
+        /* Taper params */
+        cuda_malloc ((void **) &workspace->Tap, 8 * sizeof (real), 1, "Tap");
+
+        return SUCCESS;
+    }
+
+    int dev_dealloc_workspace (reax_system *system, control_params *control, 
+            storage *workspace, int local_cap, int total_cap, 
+            char *msg)
+    {
+        /* communication storage */  
+        /*
+           workspace->tmp_dbl = NULL;
+           workspace->tmp_rvec = NULL;
+           workspace->tmp_rvec2 = NULL;
+         */
+
+        /* bond order related storage  */
+        cuda_free (workspace->within_bond_box, "skin");
+        cuda_free (workspace->total_bond_order, "total_bo");
+        cuda_free (workspace->Deltap, "Deltap");
+        cuda_free (workspace->Deltap_boc, "Deltap_boc");
+        cuda_free (workspace->dDeltap_self, "dDeltap_self");
+        cuda_free (workspace->Delta, "Delta" );
+        cuda_free (workspace->Delta_lp, "Delta_lp" );
+        cuda_free (workspace->Delta_lp_temp, "Delta_lp_temp" );
+        cuda_free (workspace->dDelta_lp, "Delta_lp_temp" );
+        cuda_free (workspace->dDelta_lp_temp, "dDelta_lp_temp" );
+        cuda_free (workspace->Delta_e, "Delta_e" );
+        cuda_free (workspace->Delta_boc, "Delta_boc");
+        cuda_free (workspace->nlp, "nlp");
+        cuda_free (workspace->nlp_temp, "nlp_temp");
+        cuda_free (workspace->Clp, "Clp");
+        cuda_free (workspace->vlpex, "vlpex");
+        cuda_free (workspace->bond_mark, "bond_mark");
+        cuda_free (workspace->done_after, "done_after");
+
+        /* QEq storage */
+        cuda_free (workspace->Hdia_inv, "Hdia_inv");
+        cuda_free (workspace->b_s, "b_s");
+        cuda_free (workspace->b_t, "b_t");
+        cuda_free (workspace->b_prc, "b_prc");
+        cuda_free (workspace->b_prm, "b_prm");
+        cuda_free (workspace->s, "s");
+        cuda_free (workspace->t, "t");
+        cuda_free (workspace->droptol, "droptol");
+        cuda_free (workspace->b, "b");
+        cuda_free (workspace->x, "x");
+
+        /* GMRES storage */
+        cuda_free (workspace->y, "y");
+        cuda_free (workspace->z, "z");
+        cuda_free (workspace->g, "g");
+        cuda_free (workspace->h, "h");
+        cuda_free (workspace->hs, "hs");
+        cuda_free (workspace->hc, "hc");
+        cuda_free (workspace->v, "v");
+
+        /* CG storage */
+        cuda_free (workspace->r, "r");
+        cuda_free (workspace->d, "d");
+        cuda_free (workspace->q, "q");
+        cuda_free (workspace->p, "p");
+        cuda_free (workspace->r2, "r2");
+        cuda_free (workspace->d2, "d2");
+        cuda_free (workspace->q2, "q2");
+        cuda_free (workspace->p2, "p2");
+
+        /* integrator storage */
+        cuda_free (workspace->v_const, "v_const");
+
+        /* storage for analysis */
+        if( control->molecular_analysis || control->diffusion_coef ) {
+            cuda_free (workspace->mark, "mark");
+            cuda_free (workspace->old_mark, "old_mark");
+        }
+        else
+            workspace->mark = workspace->old_mark = NULL;
+
+        if( control->diffusion_coef )
+            cuda_free (workspace->x_old, "x_old");
+        else
+            workspace->x_old = NULL;
+
+        /* force related storage */
+        cuda_free (workspace->f, "f");
+        cuda_free (workspace->CdDelta, "CdDelta");
+
+        /* Taper params */
+        cuda_free (workspace->Tap, "Tap");
+
+        return FAILURE;
+    }
+
+
+
+
+    int dev_alloc_matrix (sparse_matrix *H, int cap, int m)
+    {
+        //sparse_matrix *H;
+        //H = *pH;
+
+        H->cap = cap;
+        H->m = m;
+        cuda_malloc ((void **) &H->start, sizeof (int) * cap, 1, "matrix_start");
+        cuda_malloc ((void **) &H->end, sizeof (int) * cap, 1, "matrix_end");
+        cuda_malloc ((void **) &H->entries, sizeof (sparse_matrix_entry) * m, 1, "matrix_entries");
+
+        return SUCCESS;
+    }
+
+    int dev_dealloc_matrix (sparse_matrix *H)
+    {
+        cuda_free (H->start, "matrix_start");
+        cuda_free (H->end, "matrix_end");
+        cuda_free (H->entries, "matrix_entries");
+
+        return SUCCESS;
+    }
 
 
 }
diff --git a/PG-PuReMD/src/dev_list.cu b/PG-PuReMD/src/dev_list.cu
index 35e74d4a..7453fc8e 100644
--- a/PG-PuReMD/src/dev_list.cu
+++ b/PG-PuReMD/src/dev_list.cu
@@ -33,80 +33,80 @@
 extern "C" {
 
 
-	/************* allocate list space ******************/
-	int Dev_Make_List(int n, int num_intrs, int type, reax_list *l)
-	{
-		l->allocated = 1;
+    /************* allocate list space ******************/
+    int Dev_Make_List(int n, int num_intrs, int type, reax_list *l)
+    {
+        l->allocated = 1;
 
-		l->n = n;
-		l->num_intrs = num_intrs;
+        l->n = n;
+        l->num_intrs = num_intrs;
 
-		cuda_malloc ((void **) &l->index, n * sizeof (int), 1, "list:index");
-		cuda_malloc ((void **) &l->end_index, n * sizeof (int), 1, "list:end_index");
+        cuda_malloc ((void **) &l->index, n * sizeof (int), 1, "list:index");
+        cuda_malloc ((void **) &l->end_index, n * sizeof (int), 1, "list:end_index");
 
-		l->type = type;
+        l->type = type;
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type );
+        fprintf( stderr, "list: n=%d num_intrs=%d type=%d\n", n, num_intrs, type );
 #endif
 
-		switch(l->type) {
-
-			case TYP_FAR_NEIGHBOR:
-				cuda_malloc ((void **) &l->select.far_nbr_list, 
-						l->num_intrs * sizeof (far_neighbor_data), 1, "list:far_nbrs");
-				break;
-
-			case TYP_THREE_BODY:
-				cuda_malloc ((void **) &l->select.three_body_list,
-						l->num_intrs * sizeof (three_body_interaction_data), 1, 
-						"list:three_bodies" );
-				break;
-
-			case TYP_HBOND:
-				cuda_malloc ((void **) &l->select.hbond_list, 
-						l->num_intrs * sizeof(hbond_data), 1, "list:hbonds" );
-				break;			
-
-			case TYP_BOND:
-				cuda_malloc ((void **) &l->select.bond_list,
-						l->num_intrs * sizeof(bond_data), 1, "list:bonds" );
-				break;
-
-			default:
-				fprintf( stderr, "ERROR: no %d list type defined!\n", l->type );
-				MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
-		}
-
-		return SUCCESS;
-	}
-
-
-	void Dev_Delete_List( reax_list *l)
-	{
-		if( l->allocated == 0 )
-			return;
-		l->allocated = 0;
-
-		cuda_free ( l->index, "index");
-		cuda_free ( l->end_index, "end_index" );
-
-		switch (l->type) {
-			case TYP_HBOND:
-				cuda_free( l->select.hbond_list, "list:hbonds" );
-				break;
-			case TYP_FAR_NEIGHBOR:
-				cuda_free( l->select.far_nbr_list, "list:far_nbrs" );
-				break;
-			case TYP_BOND:
-				cuda_free( l->select.bond_list, "list:bonds" );
-				break;
-			case TYP_THREE_BODY:
-				cuda_free( l->select.three_body_list, "list:three_bodies" );
-				break;
-			default:
-				fprintf (stderr, "ERROR no %d list type defined !\n", l->type);
-				MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
-		}
-	}
+        switch(l->type) {
+
+            case TYP_FAR_NEIGHBOR:
+                cuda_malloc ((void **) &l->select.far_nbr_list, 
+                        l->num_intrs * sizeof (far_neighbor_data), 1, "list:far_nbrs");
+                break;
+
+            case TYP_THREE_BODY:
+                cuda_malloc ((void **) &l->select.three_body_list,
+                        l->num_intrs * sizeof (three_body_interaction_data), 1, 
+                        "list:three_bodies" );
+                break;
+
+            case TYP_HBOND:
+                cuda_malloc ((void **) &l->select.hbond_list, 
+                        l->num_intrs * sizeof(hbond_data), 1, "list:hbonds" );
+                break;            
+
+            case TYP_BOND:
+                cuda_malloc ((void **) &l->select.bond_list,
+                        l->num_intrs * sizeof(bond_data), 1, "list:bonds" );
+                break;
+
+            default:
+                fprintf( stderr, "ERROR: no %d list type defined!\n", l->type );
+                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+        }
+
+        return SUCCESS;
+    }
+
+
+    void Dev_Delete_List( reax_list *l)
+    {
+        if( l->allocated == 0 )
+            return;
+        l->allocated = 0;
+
+        cuda_free ( l->index, "index");
+        cuda_free ( l->end_index, "end_index" );
+
+        switch (l->type) {
+            case TYP_HBOND:
+                cuda_free( l->select.hbond_list, "list:hbonds" );
+                break;
+            case TYP_FAR_NEIGHBOR:
+                cuda_free( l->select.far_nbr_list, "list:far_nbrs" );
+                break;
+            case TYP_BOND:
+                cuda_free( l->select.bond_list, "list:bonds" );
+                break;
+            case TYP_THREE_BODY:
+                cuda_free( l->select.three_body_list, "list:three_bodies" );
+                break;
+            default:
+                fprintf (stderr, "ERROR no %d list type defined !\n", l->type);
+                MPI_Abort( MPI_COMM_WORLD, INVALID_INPUT );
+        }
+    }
 
 }
diff --git a/PG-PuReMD/src/dev_system_props.cu b/PG-PuReMD/src/dev_system_props.cu
index 53bc68d3..fdb3a567 100644
--- a/PG-PuReMD/src/dev_system_props.cu
+++ b/PG-PuReMD/src/dev_system_props.cu
@@ -10,307 +10,307 @@
 #include "cuda_shuffle.h"
 
 CUDA_GLOBAL void k_compute_total_mass (single_body_parameters *sbp, reax_atom *my_atoms, 
-		real *block_results, int n)
+        real *block_results, int n)
 {
 #if defined(__SM_35__)
 
-	extern __shared__ real my_sbp[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real	sdata = 0;
+    extern __shared__ real my_sbp[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real    sdata = 0;
 
-	if (i < n)
-		sdata = sbp [ my_atoms [i].type ].mass;
-	__syncthreads ();
+    if (i < n)
+        sdata = sbp [ my_atoms [i].type ].mass;
+    __syncthreads ();
 
-	for(int z = 16; z >=1; z/=2)
-		sdata += shfl ( sdata, z);
+    for(int z = 16; z >=1; z/=2)
+        sdata += shfl ( sdata, z);
 
-	if (threadIdx.x % 32 == 0)
-		my_sbp[threadIdx.x >> 5] = sdata;
+    if (threadIdx.x % 32 == 0)
+        my_sbp[threadIdx.x >> 5] = sdata;
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset)
-			my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset)
+            my_sbp[threadIdx.x] += my_sbp[threadIdx.x + offset];
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-		block_results[blockIdx.x] = my_sbp[0];
+    if(threadIdx.x == 0)
+        block_results[blockIdx.x] = my_sbp[0];
 
 
 #else
 
-	extern __shared__ real sdata [];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real	x = 0;
+    extern __shared__ real sdata [];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real    x = 0;
 
-	if (i < n)
-		x = sbp [ my_atoms [i].type ].mass;
+    if (i < n)
+        x = sbp [ my_atoms [i].type ].mass;
 
-	sdata[ threadIdx.x ] = x;
-	__syncthreads ();
+    sdata[ threadIdx.x ] = x;
+    __syncthreads ();
 
-	for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset)
-			sdata [threadIdx.x] += sdata [threadIdx.x + offset];
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset)
+            sdata [threadIdx.x] += sdata [threadIdx.x + offset];
 
-		__syncthreads ();
-	}
+        __syncthreads ();
+    }
 
-	if (threadIdx.x == 0)
-		block_results[ blockIdx.x] = sdata [0];
+    if (threadIdx.x == 0)
+        block_results[ blockIdx.x] = sdata [0];
 
 #endif
 }
 
 extern "C" void dev_compute_total_mass (reax_system *system, real *local_val)
 {
-	real *block_mass = (real *) scratch;
-	cuda_memset (block_mass, 0, sizeof (real) * (1 + BLOCKS_POW_2), "total_mass:tmp");
+    real *block_mass = (real *) scratch;
+    cuda_memset (block_mass, 0, sizeof (real) * (1 + BLOCKS_POW_2), "total_mass:tmp");
 
-	k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-		(system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    k_compute_total_mass <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, block_mass, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-		(block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
+        (block_mass, block_mass + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof (real), 
-			cudaMemcpyDeviceToHost, "total_mass:tmp");
+    copy_host_device (local_val, block_mass + BLOCKS_POW_2, sizeof (real), 
+            cudaMemcpyDeviceToHost, "total_mass:tmp");
 }
 
 CUDA_GLOBAL void k_compute_kinetic_energy (single_body_parameters *sbp, reax_atom *my_atoms, 
-		real *block_results, int n)
+        real *block_results, int n)
 {
 
 #if defined(__SM_35__)
 
-	extern __shared__ real my_sbpdot[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real	sdata = 0;
-	rvec p;
+    extern __shared__ real my_sbpdot[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real    sdata = 0;
+    rvec p;
 
-	if (i < n) {
-		sdata = sbp [ my_atoms [i].type ].mass;
-		rvec_Scale( p, sdata, my_atoms[ i ].v );
-		sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-	}
+    if (i < n) {
+        sdata = sbp [ my_atoms [i].type ].mass;
+        rvec_Scale( p, sdata, my_atoms[ i ].v );
+        sdata = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
+    }
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int z = 16; z >=1; z/=2)
-		sdata += shfl ( sdata, z);
+    for(int z = 16; z >=1; z/=2)
+        sdata += shfl ( sdata, z);
 
-	if (threadIdx.x % 32 == 0)
-		my_sbpdot[threadIdx.x >> 5] = sdata;
+    if (threadIdx.x % 32 == 0)
+        my_sbpdot[threadIdx.x >> 5] = sdata;
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset)
-			my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
+    for (int offset = blockDim.x >> 6; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset)
+            my_sbpdot[threadIdx.x] += my_sbpdot[threadIdx.x + offset];
 
-		__syncthreads ();
-	}
+        __syncthreads ();
+    }
 
-	if (threadIdx.x == 0)
-		block_results[ blockIdx.x] = my_sbpdot[0];
+    if (threadIdx.x == 0)
+        block_results[ blockIdx.x] = my_sbpdot[0];
 
 #else
 
 
-	extern __shared__ real sdata [];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real	m = 0;
-	rvec p;
+    extern __shared__ real sdata [];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real    m = 0;
+    rvec p;
 
-	if (i < n) {
-		m = sbp [ my_atoms [i].type ].mass;
-		rvec_Scale( p, m, my_atoms[ i ].v );
-		m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
-	}
+    if (i < n) {
+        m = sbp [ my_atoms [i].type ].mass;
+        rvec_Scale( p, m, my_atoms[ i ].v );
+        m = 0.5 * rvec_Dot( p, my_atoms[ i ].v );
+    }
 
-	sdata[ threadIdx.x ] = m;
-	__syncthreads ();
+    sdata[ threadIdx.x ] = m;
+    __syncthreads ();
 
-	for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset)
-			sdata [threadIdx.x] += sdata [threadIdx.x + offset];
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset)
+            sdata [threadIdx.x] += sdata [threadIdx.x + offset];
 
-		__syncthreads ();
-	}
+        __syncthreads ();
+    }
 
-	if (threadIdx.x == 0)
-		block_results[ blockIdx.x] = sdata [0];
+    if (threadIdx.x == 0)
+        block_results[ blockIdx.x] = sdata [0];
 
 #endif
 }
 
 extern "C" void dev_compute_kinetic_energy (reax_system *system, simulation_data *data, real *local_val)
 {
-	real *block_energy = (real *) scratch;
-	cuda_memset (block_energy, 0, sizeof (real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp");
-
-	k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
-		(system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
-		(block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device (local_val, block_energy + BLOCKS_POW_2,
-			//copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
-			sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp");
-			//copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
-			//		sizeof (real), "kinetic_energy");
-			}
-
-			extern "C" void dev_compute_momentum (reax_system *system, rvec xcm, 
-				rvec vcm, rvec amcm)
-			{
-			rvec *l_xcm, *l_vcm, *l_amcm;
-			rvec *r_scratch = (rvec *)scratch;
+    real *block_energy = (real *) scratch;
+    cuda_memset (block_energy, 0, sizeof (real) * (BLOCKS_POW_2 + 1), "kinetic_energy:tmp");
+
+    k_compute_kinetic_energy <<<BLOCKS, BLOCK_SIZE, sizeof (real) * BLOCK_SIZE >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, block_energy, system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    k_reduction <<<1, BLOCKS_POW_2, sizeof (real) * BLOCKS_POW_2 >>>
+        (block_energy, block_energy + BLOCKS_POW_2, BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (local_val, block_energy + BLOCKS_POW_2,
+            //copy_host_device (local_val, &((simulation_data *)data->d_simulation_data)->my_en.e_kin, 
+            sizeof (real), cudaMemcpyDeviceToHost, "kinetic_energy:tmp");
+            //copy_device (block_energy + BLOCKS_POW_2, &((simulation_data *)data->d_simulation_data)->my_en.e_kin,
+            //        sizeof (real), "kinetic_energy");
+            }
+
+            extern "C" void dev_compute_momentum (reax_system *system, rvec xcm, 
+                rvec vcm, rvec amcm)
+            {
+            rvec *l_xcm, *l_vcm, *l_amcm;
+            rvec *r_scratch = (rvec *)scratch;
 
 #if defined( __SM_35__)
-			// xcm
-			cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-			l_xcm = r_scratch;
-
-			center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
-			(system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
-				(l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-			copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm");
-
-			// vcm
-			cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-			l_vcm = r_scratch;
-
-			center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
-				(system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
-				(l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-			copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm");
-
-			// amcm
-			cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-			l_amcm = r_scratch;
-
-			center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
-				(system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
-				(l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-			copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:amcm");
+            // xcm
+            cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+            l_xcm = r_scratch;
+
+            center_of_mass_blocks_xcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
+            (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, system->n );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
+                (l_xcm, l_xcm + BLOCKS_POW_2, BLOCKS_POW_2);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:xcm");
+
+            // vcm
+            cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+            l_vcm = r_scratch;
+
+            center_of_mass_blocks_vcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
+                (system->reax_param.d_sbp, system->d_my_atoms, l_vcm, system->n );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
+                (l_vcm, l_vcm + BLOCKS_POW_2, BLOCKS_POW_2);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm");
+
+            // amcm
+            cuda_memset( scratch, 0,  sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+            l_amcm = r_scratch;
+
+            center_of_mass_blocks_amcm <<<BLOCKS_POW_2,BLOCK_SIZE,(sizeof (rvec) * BLOCK_SIZE) >>>
+                (system->reax_param.d_sbp, system->d_my_atoms, l_amcm, system->n );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            k_reduction_rvec <<<1, BLOCKS_POW_2, (sizeof (rvec) * BLOCKS_POW_2) >>>
+                (l_amcm, l_amcm + BLOCKS_POW_2, BLOCKS_POW_2);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:amcm");
 
 #else
-			cuda_memset ( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
-
-			l_xcm = r_scratch;
-			l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
-			l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
-
-			center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
-				(system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n);
-			cudaThreadSynchronize (); 
-			cudaCheckError (); 
-
-			center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
-				(l_xcm, l_vcm, l_amcm,
-				 l_xcm + BLOCKS_POW_2, 
-				 l_vcm + BLOCKS_POW_2, 
-				 l_amcm + BLOCKS_POW_2, 
-				 BLOCKS_POW_2);
-			cudaThreadSynchronize (); 
-			cudaCheckError ();
-
-			copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" );
-			copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
-			copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
+            cuda_memset ( scratch, 0, 3 * sizeof (rvec) * (BLOCKS_POW_2 + 1), "momentum:tmp");
+
+            l_xcm = r_scratch;
+            l_vcm = r_scratch + (BLOCKS_POW_2 + 1); 
+            l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1); 
+
+            center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (sizeof (rvec) * BLOCK_SIZE) >>> 
+                (system->reax_param.d_sbp, system->d_my_atoms, l_xcm, l_vcm, l_amcm, system->n);
+            cudaThreadSynchronize (); 
+            cudaCheckError (); 
+
+            center_of_mass <<<1, BLOCKS_POW_2, 3 * (sizeof (rvec) * BLOCKS_POW_2) >>> 
+                (l_xcm, l_vcm, l_amcm,
+                 l_xcm + BLOCKS_POW_2, 
+                 l_vcm + BLOCKS_POW_2, 
+                 l_amcm + BLOCKS_POW_2, 
+                 BLOCKS_POW_2);
+            cudaThreadSynchronize (); 
+            cudaCheckError ();
+
+            copy_host_device (xcm, l_xcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momemtum:xcm" );
+            copy_host_device (vcm, l_vcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost, "momentum:vcm" );
+            copy_host_device (amcm, l_amcm + BLOCKS_POW_2, sizeof (rvec), cudaMemcpyDeviceToHost,"momentum:amcm" );
 #endif
-			}
+            }
 
 extern "C" void dev_compute_inertial_tensor (reax_system *system, real *local_results, rvec my_xcm)
 {
 #if defined(__SM_35__)
-	real *partial_results = (real *) scratch;
-	cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
-
-	compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-		(system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-		 my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-		(system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-		 my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
-		(system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-		 my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-		(partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
+    real *partial_results = (real *) scratch;
+    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
+
+    compute_center_mass_xx_xy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    compute_center_mass_xz_yy <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    compute_center_mass_yz_zz <<<BLOCKS_POW_2, BLOCK_SIZE, 2 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
 
 #else
 
-	real *partial_results = (real *) scratch;
-	//real *local_results;
+    real *partial_results = (real *) scratch;
+    //real *local_results;
 
-	cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
-	//local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
+    cuda_memset (partial_results, 0, sizeof (real) * 6 * (BLOCKS_POW_2 + 1), "tensor:tmp");
+    //local_results = (real *) malloc (sizeof (real) * 6 *(BLOCKS_POW_2+ 1));
 
-	compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
-		(system->reax_param.d_sbp, system->d_my_atoms, partial_results,
-		 my_xcm[0], my_xcm[1], my_xcm[2], system->n);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (sizeof (real) * BLOCK_SIZE) >>>
+        (system->reax_param.d_sbp, system->d_my_atoms, partial_results,
+         my_xcm[0], my_xcm[1], my_xcm[2], system->n);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
-		(partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (sizeof (real) * BLOCKS_POW_2) >>>
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
-			sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
+    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, 
+            sizeof (real) * 6, cudaMemcpyDeviceToHost, "tensor:local_results");
 #endif
 }
 
 extern "C" void dev_sync_simulation_data (simulation_data *data)
 {
-	Output_Sync_Simulation_Data (data, (simulation_data *)data->d_simulation_data );
+    Output_Sync_Simulation_Data (data, (simulation_data *)data->d_simulation_data );
 }
 /*
    CUDA_GLOBAL void ker_kinetic_energy (reax_atom *my_atoms, 
diff --git a/PG-PuReMD/src/dual_matvec.cu b/PG-PuReMD/src/dual_matvec.cu
index d27fc361..a674118f 100644
--- a/PG-PuReMD/src/dual_matvec.cu
+++ b/PG-PuReMD/src/dual_matvec.cu
@@ -5,26 +5,26 @@
 //one thread per row
 CUDA_GLOBAL void k_dual_matvec(sparse_matrix H, rvec2 *vec, rvec2 *results, int rows)
 {
-	rvec2 results_row;
-	int col;
-	real val;
+    rvec2 results_row;
+    int col;
+    real val;
 
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= rows) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= rows) return;
 
-	results_row [0] = results_row[1] = 0;
+    results_row [0] = results_row[1] = 0;
 
-	for (int c = H.start[i]; c < H.end[i]; c++)
-	{
-		col = H.entries [c].j;
-		val = H.entries[c].val;
+    for (int c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
 
-		results_row[0] += val * vec [col][0];
-		results_row[1] += val * vec [col][1];
-	}
+        results_row[0] += val * vec [col][0];
+        results_row[1] += val * vec [col][1];
+    }
 
-	results [i][0] = results_row[0];
-	results [i][1] = results_row[1];
+    results [i][0] = results_row[0];
+    results [i][1] = results_row[1];
 }
 
 //32 thread warp per matrix row.
@@ -35,106 +35,106 @@ CUDA_GLOBAL void  k_dual_matvec_csr(sparse_matrix H, rvec2 *vec, rvec2 *results,
 {
 #if defined(__SM_35__)
 
-	rvec2 vals;
-	int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-	int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-	int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
+    rvec2 vals;
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & (MATVEC_KER_THREADS_PER_ROW - 1);
 
-	int row_start;
-	int row_end;
+    int row_start;
+    int row_end;
 
-	// one warp per row
-	int row = warp_id;
+    // one warp per row
+    int row = warp_id;
 
-	vals[0] = 0;
-	vals[1] = 0;
+    vals[0] = 0;
+    vals[1] = 0;
 
-	if (row < num_rows) {
-		row_start = H.start[row];
-		row_end = H.end[row];
+    if (row < num_rows) {
+        row_start = H.start[row];
+        row_end = H.end[row];
 
-		for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) {
-			vals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-			vals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-		}
-	}
+        for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW) {
+            vals[0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+            vals[1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+        }
+    }
 
-	for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2){
-		vals[0] += shfl( vals[0], s);
-		vals[1] += shfl( vals[1], s);
-	}
+    for (int s = MATVEC_KER_THREADS_PER_ROW >> 1; s >= 1; s /= 2){
+        vals[0] += shfl( vals[0], s);
+        vals[1] += shfl( vals[1], s);
+    }
 
-	if (lane == 0 && row < num_rows){
-		results[row][0] = vals[0];
-		results[row][1] = vals[1];
-	}
+    if (lane == 0 && row < num_rows){
+        results[row][0] = vals[0];
+        results[row][1] = vals[1];
+    }
 
 #else
 
 
-	extern __shared__ rvec2 vals [];
-	int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-	int warp_id = thread_id / 32;
-	int lane = thread_id & (32 - 1);
-
-	int row_start;
-	int row_end;
-
-	// one warp per row
-	//int row = warp_id;
-	int row = warp_id;
-	//if (row < num_rows)
-	{
-		vals[threadIdx.x][0] = 0;
-		vals[threadIdx.x][1] = 0;
-
-		if (row < num_rows) {
-			row_start = H.start[row];
-			row_end = H.end[row];
-
-			// compute running sum per thread
-			for(int jj = row_start + lane; jj < row_end; jj += 32) {
-				vals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
-				vals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
-			}
-		}
-
-		__syncthreads ();
-
-		// parallel reduction in shared memory
-		//SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-		if (lane < 16) {
-			vals[threadIdx.x][0] += vals[threadIdx.x + 16][0]; 
-			vals[threadIdx.x][1] += vals[threadIdx.x + 16][1]; 
-		}
-		__syncthreads();
-		if (lane < 8) {
-			vals[threadIdx.x][0] += vals[threadIdx.x + 8][0]; 
-			vals[threadIdx.x][1] += vals[threadIdx.x + 8][1]; 
-		}
-		__syncthreads ();
-		if (lane < 4) {
-			vals[threadIdx.x][0] += vals[threadIdx.x + 4][0]; 
-			vals[threadIdx.x][1] += vals[threadIdx.x + 4][1]; 
-		}
-		__syncthreads ();
-		if (lane < 2) {
-			vals[threadIdx.x][0] += vals[threadIdx.x + 2][0]; 
-			vals[threadIdx.x][1] += vals[threadIdx.x + 2][1]; 
-		}
-		__syncthreads ();
-		if (lane < 1) {
-			vals[threadIdx.x][0] += vals[threadIdx.x + 1][0]; 
-			vals[threadIdx.x][1] += vals[threadIdx.x + 1][1]; 
-		}
-		__syncthreads ();
-
-		// first thread writes the result
-		if (lane == 0 && row < num_rows) {
-			results[row][0] = vals[threadIdx.x][0];
-			results[row][1] = vals[threadIdx.x][1];
-		}
-	}
+    extern __shared__ rvec2 vals [];
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / 32;
+    int lane = thread_id & (32 - 1);
+
+    int row_start;
+    int row_end;
+
+    // one warp per row
+    //int row = warp_id;
+    int row = warp_id;
+    //if (row < num_rows)
+    {
+        vals[threadIdx.x][0] = 0;
+        vals[threadIdx.x][1] = 0;
+
+        if (row < num_rows) {
+            row_start = H.start[row];
+            row_end = H.end[row];
+
+            // compute running sum per thread
+            for(int jj = row_start + lane; jj < row_end; jj += 32) {
+                vals[threadIdx.x][0] += H.entries[jj].val * vec [ H.entries[jj].j ][0];
+                vals[threadIdx.x][1] += H.entries[jj].val * vec [ H.entries[jj].j ][1];
+            }
+        }
+
+        __syncthreads ();
+
+        // parallel reduction in shared memory
+        //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+        if (lane < 16) {
+            vals[threadIdx.x][0] += vals[threadIdx.x + 16][0]; 
+            vals[threadIdx.x][1] += vals[threadIdx.x + 16][1]; 
+        }
+        __syncthreads();
+        if (lane < 8) {
+            vals[threadIdx.x][0] += vals[threadIdx.x + 8][0]; 
+            vals[threadIdx.x][1] += vals[threadIdx.x + 8][1]; 
+        }
+        __syncthreads ();
+        if (lane < 4) {
+            vals[threadIdx.x][0] += vals[threadIdx.x + 4][0]; 
+            vals[threadIdx.x][1] += vals[threadIdx.x + 4][1]; 
+        }
+        __syncthreads ();
+        if (lane < 2) {
+            vals[threadIdx.x][0] += vals[threadIdx.x + 2][0]; 
+            vals[threadIdx.x][1] += vals[threadIdx.x + 2][1]; 
+        }
+        __syncthreads ();
+        if (lane < 1) {
+            vals[threadIdx.x][0] += vals[threadIdx.x + 1][0]; 
+            vals[threadIdx.x][1] += vals[threadIdx.x + 1][1]; 
+        }
+        __syncthreads ();
+
+        // first thread writes the result
+        if (lane == 0 && row < num_rows) {
+            results[row][0] = vals[threadIdx.x][0];
+            results[row][1] = vals[threadIdx.x][1];
+        }
+    }
 
 #endif
 }
diff --git a/PG-PuReMD/src/matvec.cu b/PG-PuReMD/src/matvec.cu
index 960b1dad..dcde4165 100644
--- a/PG-PuReMD/src/matvec.cu
+++ b/PG-PuReMD/src/matvec.cu
@@ -6,22 +6,22 @@
 //one thread per row
 CUDA_GLOBAL void k_matvec (sparse_matrix H, real *vec, real *results, int rows)
 {
-	real results_row = 0;
-	int col;
-	real val;
+    real results_row = 0;
+    int col;
+    real val;
 
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= rows) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= rows) return;
 
-	for (int c = H.start[i]; c < H.end[i]; c++)
-	{
-		col = H.entries [c].j;
-		val = H.entries[c].val;
+    for (int c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
 
-		results_row += val * vec [col];
-	}
+        results_row += val * vec [col];
+    }
 
-	results [i] = results_row;
+    results [i] = results_row;
 }
 
 //32 thread warp per matrix row.
@@ -31,61 +31,61 @@ CUDA_GLOBAL void k_matvec (sparse_matrix H, real *vec, real *results, int rows)
 CUDA_GLOBAL void k_matvec_csr(sparse_matrix H, real *vec, real *results, int num_rows)
 {
 #if defined(__SM_35__)
-	real vals;
+    real vals;
 #else
-	extern __shared__ real vals [];
+    extern __shared__ real vals [];
 #endif
-	int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-	int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
-	int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / MATVEC_KER_THREADS_PER_ROW;
+    int lane = thread_id & ( MATVEC_KER_THREADS_PER_ROW - 1);
 
-	int row_start;
-	int row_end;
+    int row_start;
+    int row_end;
 
-	// one warp per row
-	//int row = warp_id;
-	int row = warp_id;
-	//if (row < num_rows)
-	{
+    // one warp per row
+    //int row = warp_id;
+    int row = warp_id;
+    //if (row < num_rows)
+    {
 #if defined(__SM_35__)
-		vals = 0;
+        vals = 0;
 #else
-		vals[threadIdx.x] = 0;
+        vals[threadIdx.x] = 0;
 #endif
 
-		if (row < num_rows) {
-			row_start = H.start[row];
-			row_end = H.end[row];
+        if (row < num_rows) {
+            row_start = H.start[row];
+            row_end = H.end[row];
 
-			// compute running sum per thread
-			for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
+            // compute running sum per thread
+            for(int jj = row_start + lane; jj < row_end; jj += MATVEC_KER_THREADS_PER_ROW)
 #if defined(__SM_35__)
-				vals += H.entries[jj].val * vec [ H.entries[jj].j ];
-		}
+                vals += H.entries[jj].val * vec [ H.entries[jj].j ];
+        }
 #else
-		vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ];
-	}
-	__syncthreads ();
+        vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ];
+    }
+    __syncthreads ();
 #endif
 
-	// parallel reduction in shared memory
-	//SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+    // parallel reduction in shared memory
+    //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
 #if defined(__SM_35__)
-	for (int x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
-		vals += shfl( vals, x );
+    for (int x = MATVEC_KER_THREADS_PER_ROW >> 1; x >= 1; x/=2)
+        vals += shfl( vals, x );
 
-	if (lane == 0 && row < num_rows)
-		results[row] = vals;
+    if (lane == 0 && row < num_rows)
+        results[row] = vals;
 #else
-	if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads();
-	if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads ();
-	if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads ();
-	if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads ();
-	if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads ();
+    if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads();
+    if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads ();
+    if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads ();
+    if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads ();
+    if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads ();
 
-	// first thread writes the result
-	if (lane == 0 && row < num_rows)
-		results[row] = vals[threadIdx.x];
+    // first thread writes the result
+    if (lane == 0 && row < num_rows)
+        results[row] = vals[threadIdx.x];
 #endif
 }
 }
diff --git a/PG-PuReMD/src/reduction.cu b/PG-PuReMD/src/reduction.cu
index 770e4301..370e491b 100644
--- a/PG-PuReMD/src/reduction.cu
+++ b/PG-PuReMD/src/reduction.cu
@@ -7,62 +7,62 @@
 CUDA_GLOBAL void k_reduction(const real *input, real *per_block_results, const size_t n)
 {
 #if defined(__SM_35__)
-	extern __shared__ real my_results[];
-	real sdata;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
+    extern __shared__ real my_results[];
+    real sdata;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
 
-	if(i < n)
-		x = input[i];
+    if(i < n)
+        x = input[i];
 
-	sdata = x;
-	__syncthreads();
+    sdata = x;
+    __syncthreads();
 
-	for(int z = 16; z >=1; z/=2)
-		sdata+= shfl ( sdata, z);
+    for(int z = 16; z >=1; z/=2)
+        sdata+= shfl ( sdata, z);
 
-	if (threadIdx.x % 32 == 0)
-		my_results[threadIdx.x >> 5] = sdata;
+    if (threadIdx.x % 32 == 0)
+        my_results[threadIdx.x >> 5] = sdata;
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset)
-			my_results[threadIdx.x] += my_results[threadIdx.x + offset];
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset)
+            my_results[threadIdx.x] += my_results[threadIdx.x + offset];
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-		per_block_results[blockIdx.x] = my_results[0];
+    if(threadIdx.x == 0)
+        per_block_results[blockIdx.x] = my_results[0];
 
 #else
 
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = input[i];
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sdata[0];
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = input[i];
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 #endif
 }
 
@@ -71,70 +71,70 @@ CUDA_GLOBAL void k_reduction_rvec (rvec *input, rvec *results, size_t n)
 #if defined(__SM_35__)
 
 
-	extern __shared__ rvec my_rvec[];
-	rvec sdata;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec_MakeZero( sdata );
+    extern __shared__ rvec my_rvec[];
+    rvec sdata;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec_MakeZero( sdata );
 
-	if(i < n)
-		rvec_Copy (sdata, input[i]);
+    if(i < n)
+        rvec_Copy (sdata, input[i]);
 
-	__syncthreads();
+    __syncthreads();
 
-	for(int z = 16; z >=1; z/=2){
-		sdata[0] += shfl ( sdata[0], z);
-		sdata[1] += shfl ( sdata[1], z);
-		sdata[2] += shfl ( sdata[2], z);
-	}
+    for(int z = 16; z >=1; z/=2){
+        sdata[0] += shfl ( sdata[0], z);
+        sdata[1] += shfl ( sdata[1], z);
+        sdata[2] += shfl ( sdata[2], z);
+    }
 
-	if (threadIdx.x % 32 == 0)
-		rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata );
+    if (threadIdx.x % 32 == 0)
+        rvec_Copy( my_rvec[threadIdx.x >> 5] , sdata );
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset)
-			rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] );
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset)
+            rvec_Add( my_rvec[threadIdx.x], my_rvec[threadIdx.x + offset] );
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-		rvec_Add (results[blockIdx.x], my_rvec[0]);
+    if(threadIdx.x == 0)
+        rvec_Add (results[blockIdx.x], my_rvec[0]);
 
 
 #else
 
 
-	extern __shared__ rvec svec_data[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec x;
+    extern __shared__ rvec svec_data[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec x;
 
-	rvec_MakeZero (x);
+    rvec_MakeZero (x);
 
-	if(i < n)
-	{
-		rvec_Copy (x, input[i]);
-	}
+    if(i < n)
+    {
+        rvec_Copy (x, input[i]);
+    }
 
-	rvec_Copy (svec_data[threadIdx.x], x);
-	__syncthreads();
+    rvec_Copy (svec_data[threadIdx.x], x);
+    __syncthreads();
 
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]);
-		}
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]);
+        }
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-	{
-		//rvec_Copy (results[blockIdx.x], svec_data[0]);
-		rvec_Add (results[blockIdx.x], svec_data[0]);
-	}
+    if(threadIdx.x == 0)
+    {
+        //rvec_Copy (results[blockIdx.x], svec_data[0]);
+        rvec_Add (results[blockIdx.x], svec_data[0]);
+    }
 #endif
 
 
@@ -144,81 +144,81 @@ CUDA_GLOBAL void k_reduction_rvec2 (rvec2 *input, rvec2 *results, size_t n)
 {
 #if defined(__SM_35__)
 
-	extern __shared__ rvec2 my_rvec2[];
-	rvec2 sdata;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    extern __shared__ rvec2 my_rvec2[];
+    rvec2 sdata;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	sdata[0] = 0.0;
-	sdata[1] = 0.0;
+    sdata[0] = 0.0;
+    sdata[1] = 0.0;
 
-	if(i < n){
-		sdata[0] = input[i][0];
-		sdata[1] = input[i][1];
-	}
+    if(i < n){
+        sdata[0] = input[i][0];
+        sdata[1] = input[i][1];
+    }
 
-	__syncthreads();
+    __syncthreads();
 
-	for(int z = 16; z >=1; z/=2){
-		sdata[0] += shfl ( sdata[0], z);
-		sdata[1] += shfl ( sdata[1], z);
-	}
+    for(int z = 16; z >=1; z/=2){
+        sdata[0] += shfl ( sdata[0], z);
+        sdata[1] += shfl ( sdata[1], z);
+    }
 
-	if (threadIdx.x % 32 == 0){
-		my_rvec2[threadIdx.x >> 5][0] = sdata[0];
-		my_rvec2[threadIdx.x >> 5][1] = sdata[1];
-	}
+    if (threadIdx.x % 32 == 0){
+        my_rvec2[threadIdx.x >> 5][0] = sdata[0];
+        my_rvec2[threadIdx.x >> 5][1] = sdata[1];
+    }
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset){
-			my_rvec2[threadIdx.x][0] += my_rvec2[threadIdx.x + offset][0];
-			my_rvec2[threadIdx.x][1] += my_rvec2[threadIdx.x + offset][1];
-		}
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset){
+            my_rvec2[threadIdx.x][0] += my_rvec2[threadIdx.x + offset][0];
+            my_rvec2[threadIdx.x][1] += my_rvec2[threadIdx.x + offset][1];
+        }
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0){
-		results[blockIdx.x][0] = my_rvec2[0][0];
-		results[blockIdx.x][1] = my_rvec2[0][1];
-	}
+    if(threadIdx.x == 0){
+        results[blockIdx.x][0] = my_rvec2[0][0];
+        results[blockIdx.x][1] = my_rvec2[0][1];
+    }
 
 #else
-	extern __shared__ rvec2 svec2_data[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec2 x;
-
-	x[0] = 0.0;
-	x[1] = 0.0;
-
-	if(i < n)
-	{
-		x[0] += input[i][0];
-		x[1] += input[i][1];
-	}
-
-	svec2_data [threadIdx.x][0] = x[0];
-	svec2_data [threadIdx.x][1] = x[1];
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			svec2_data [threadIdx.x][0] += svec2_data [threadIdx.x + offset][0];
-			svec2_data [threadIdx.x][1] += svec2_data [threadIdx.x + offset][1];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		//rvec_Copy (results[blockIdx.x], svec_data[0]);
-		results [blockIdx.x][0] += svec2_data [0][0];
-		results [blockIdx.x][1] += svec2_data [0][1];
-	}
+    extern __shared__ rvec2 svec2_data[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec2 x;
+
+    x[0] = 0.0;
+    x[1] = 0.0;
+
+    if(i < n)
+    {
+        x[0] += input[i][0];
+        x[1] += input[i][1];
+    }
+
+    svec2_data [threadIdx.x][0] = x[0];
+    svec2_data [threadIdx.x][1] = x[1];
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            svec2_data [threadIdx.x][0] += svec2_data [threadIdx.x + offset][0];
+            svec2_data [threadIdx.x][1] += svec2_data [threadIdx.x + offset][1];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        //rvec_Copy (results[blockIdx.x], svec_data[0]);
+        results [blockIdx.x][0] += svec2_data [0][0];
+        results [blockIdx.x][1] += svec2_data [0][1];
+    }
 #endif
 }
 
@@ -226,61 +226,61 @@ CUDA_GLOBAL void k_dot (const real *a, const real *b, real *per_block_results, c
 {
 #if defined(__SM_35__)
 
-	extern __shared__ real my_dot[];
-	real sdot;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    extern __shared__ real my_dot[];
+    real sdot;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	sdot = 0.0;
-	if(i < n)
-		sdot = a[i] * b[i];
+    sdot = 0.0;
+    if(i < n)
+        sdot = a[i] * b[i];
 
-	__syncthreads();
+    __syncthreads();
 
-	for(int z = 16; z >=1; z/=2)
-		sdot += shfl ( sdot, z);
+    for(int z = 16; z >=1; z/=2)
+        sdot += shfl ( sdot, z);
 
-	if (threadIdx.x % 32 == 0)
-		my_dot[threadIdx.x >> 5] = sdot;
+    if (threadIdx.x % 32 == 0)
+        my_dot[threadIdx.x >> 5] = sdot;
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset)
-			my_dot[threadIdx.x] += my_dot[threadIdx.x + offset];
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset)
+            my_dot[threadIdx.x] += my_dot[threadIdx.x + offset];
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-		per_block_results[blockIdx.x] = my_dot[0];
+    if(threadIdx.x == 0)
+        per_block_results[blockIdx.x] = my_dot[0];
 
 #else
 
-	extern __shared__ real sdot[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
+    extern __shared__ real sdot[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
 
-	if(i < n)
-	{
-		x = a[i] * b[i];
-	}
-	sdot[threadIdx.x] = x;
-	__syncthreads();
+    if(i < n)
+    {
+        x = a[i] * b[i];
+    }
+    sdot[threadIdx.x] = x;
+    __syncthreads();
 
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdot[threadIdx.x] += sdot[threadIdx.x + offset];
-		}
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdot[threadIdx.x] += sdot[threadIdx.x + offset];
+        }
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sdot[0];
-	}
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sdot[0];
+    }
 
 #endif
 
@@ -290,56 +290,56 @@ CUDA_GLOBAL void k_norm (const real *input, real *per_block_results, const size_
 {
 #if defined(__SM_35__)
 
-	extern __shared__ real my_norm[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real snorm = 0.0;
+    extern __shared__ real my_norm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real snorm = 0.0;
 
-	if(i < n)
-		snorm = SQR (input[i]);
+    if(i < n)
+        snorm = SQR (input[i]);
 
-	__syncthreads();
+    __syncthreads();
 
-	for(int z = 16; z >=1; z/=2)
-		snorm += shfl ( snorm, z);
+    for(int z = 16; z >=1; z/=2)
+        snorm += shfl ( snorm, z);
 
-	if (threadIdx.x % 32 == 0)
-		my_norm[threadIdx.x >> 5] = snorm;
+    if (threadIdx.x % 32 == 0)
+        my_norm[threadIdx.x >> 5] = snorm;
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset)
-			my_norm[threadIdx.x] += my_norm[threadIdx.x + offset];
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset)
+            my_norm[threadIdx.x] += my_norm[threadIdx.x + offset];
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-		per_block_results[blockIdx.x] = my_norm[0];
+    if(threadIdx.x == 0)
+        per_block_results[blockIdx.x] = my_norm[0];
 
 #else
-	extern __shared__ real snorm[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
+    extern __shared__ real snorm[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
 
-	if(i < n)
-		x = SQR (input[i]);
+    if(i < n)
+        x = SQR (input[i]);
 
-	snorm[threadIdx.x] = x;
-	__syncthreads();
+    snorm[threadIdx.x] = x;
+    __syncthreads();
 
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			snorm[threadIdx.x] += snorm[threadIdx.x + offset];
-		}
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            snorm[threadIdx.x] += snorm[threadIdx.x + offset];
+        }
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0)
-		per_block_results[blockIdx.x] = snorm[0];
+    if(threadIdx.x == 0)
+        per_block_results[blockIdx.x] = snorm[0];
 
 
 #endif
@@ -351,84 +351,84 @@ CUDA_GLOBAL void k_norm_rvec2 (const rvec2 *input, rvec2 *per_block_results, con
 {
 #if defined(__SM_35__)
 
-	extern __shared__ rvec2 my_norm2[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec2 snorm2;
-	snorm2[0] = snorm2[1] = 0;
-
-	if(i < n) {
-		if (pass == INITIAL) {	
-			snorm2[0] = SQR (input[i][0]);
-			snorm2[1] = SQR (input[i][1]);
-		} else {
-			snorm2[0] = input[i][0];
-			snorm2[1] = input[i][1];
-		}
-	}
-	__syncthreads();
-
-	for(int z = 16; z >=1; z/=2){
-		snorm2[0] += shfl ( snorm2[0], z);
-		snorm2[1] += shfl ( snorm2[1], z);
-	}
-
-	if (threadIdx.x % 32 == 0){
-		my_norm2[threadIdx.x >> 5][0] = snorm2[0];
-		my_norm2[threadIdx.x >> 5][1] = snorm2[1];
-	}
-
-	__syncthreads ();
-
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset){
-			my_norm2[threadIdx.x][0] += my_norm2[threadIdx.x + offset][0];
-			my_norm2[threadIdx.x][1] += my_norm2[threadIdx.x + offset][1];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0) {
-		per_block_results[blockIdx.x][0] = my_norm2[0][0];
-		per_block_results[blockIdx.x][1] = my_norm2[0][1];
-	}
+    extern __shared__ rvec2 my_norm2[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec2 snorm2;
+    snorm2[0] = snorm2[1] = 0;
+
+    if(i < n) {
+        if (pass == INITIAL) {    
+            snorm2[0] = SQR (input[i][0]);
+            snorm2[1] = SQR (input[i][1]);
+        } else {
+            snorm2[0] = input[i][0];
+            snorm2[1] = input[i][1];
+        }
+    }
+    __syncthreads();
+
+    for(int z = 16; z >=1; z/=2){
+        snorm2[0] += shfl ( snorm2[0], z);
+        snorm2[1] += shfl ( snorm2[1], z);
+    }
+
+    if (threadIdx.x % 32 == 0){
+        my_norm2[threadIdx.x >> 5][0] = snorm2[0];
+        my_norm2[threadIdx.x >> 5][1] = snorm2[1];
+    }
+
+    __syncthreads ();
+
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset){
+            my_norm2[threadIdx.x][0] += my_norm2[threadIdx.x + offset][0];
+            my_norm2[threadIdx.x][1] += my_norm2[threadIdx.x + offset][1];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0) {
+        per_block_results[blockIdx.x][0] = my_norm2[0][0];
+        per_block_results[blockIdx.x][1] = my_norm2[0][1];
+    }
 
 #else
 
-	extern __shared__ rvec2 snorm2[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec2 x;
-	x[0] = x[1] = 0;
-
-	if(i < n) {
-		if (pass == INITIAL) {	
-			x[0] = SQR (input[i][0]);
-			x[1] = SQR (input[i][1]);
-		} else {
-			x[0] = input[i][0];
-			x[1] = input[i][1];
-		}
-	}
-
-	snorm2[threadIdx.x][0] = x[0];
-	snorm2[threadIdx.x][1] = x[1];
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			snorm2[threadIdx.x][0] += snorm2[threadIdx.x + offset][0];
-			snorm2[threadIdx.x][1] += snorm2[threadIdx.x + offset][1];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0) {
-		per_block_results[blockIdx.x][0] = snorm2[0][0];
-		per_block_results[blockIdx.x][1] = snorm2[0][1];
-	}
+    extern __shared__ rvec2 snorm2[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec2 x;
+    x[0] = x[1] = 0;
+
+    if(i < n) {
+        if (pass == INITIAL) {    
+            x[0] = SQR (input[i][0]);
+            x[1] = SQR (input[i][1]);
+        } else {
+            x[0] = input[i][0];
+            x[1] = input[i][1];
+        }
+    }
+
+    snorm2[threadIdx.x][0] = x[0];
+    snorm2[threadIdx.x][1] = x[1];
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            snorm2[threadIdx.x][0] += snorm2[threadIdx.x + offset][0];
+            snorm2[threadIdx.x][1] += snorm2[threadIdx.x + offset][1];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0) {
+        per_block_results[blockIdx.x][0] = snorm2[0][0];
+        per_block_results[blockIdx.x][1] = snorm2[0][1];
+    }
 #endif
 }
 
@@ -436,76 +436,76 @@ CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t
 {
 #if defined(__SM_35__)
 
-	extern __shared__ rvec2 my_dot2[];
-	rvec2 sdot2;
+    extern __shared__ rvec2 my_dot2[];
+    rvec2 sdot2;
 
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	sdot2[0] = sdot2[1] = 0;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    sdot2[0] = sdot2[1] = 0;
 
-	if(i < n) {
-		sdot2[0] = a[i][0] * b[i][0];
-		sdot2[1] = a[i][1] * b[i][1];
-	}
+    if(i < n) {
+        sdot2[0] = a[i][0] * b[i][0];
+        sdot2[1] = a[i][1] * b[i][1];
+    }
 
-	__syncthreads();
+    __syncthreads();
 
-	for(int z = 16; z >=1; z/=2){
-		sdot2[0] += shfl ( sdot2[0], z);
-		sdot2[1] += shfl ( sdot2[1], z);
-	}
+    for(int z = 16; z >=1; z/=2){
+        sdot2[0] += shfl ( sdot2[0], z);
+        sdot2[1] += shfl ( sdot2[1], z);
+    }
 
-	if (threadIdx.x % 32 == 0){
-		my_dot2[threadIdx.x >> 5][0] = sdot2[0];
-		my_dot2[threadIdx.x >> 5][1] = sdot2[1];
-	}
+    if (threadIdx.x % 32 == 0){
+        my_dot2[threadIdx.x >> 5][0] = sdot2[0];
+        my_dot2[threadIdx.x >> 5][1] = sdot2[1];
+    }
 
-	__syncthreads ();
+    __syncthreads ();
 
-	for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
-		if(threadIdx.x < offset){
-			my_dot2[threadIdx.x][0] += my_dot2[threadIdx.x + offset][0];
-			my_dot2[threadIdx.x][1] += my_dot2[threadIdx.x + offset][1];
-		}
+    for(int offset = blockDim.x >> 6; offset > 0; offset >>= 1) {
+        if(threadIdx.x < offset){
+            my_dot2[threadIdx.x][0] += my_dot2[threadIdx.x + offset][0];
+            my_dot2[threadIdx.x][1] += my_dot2[threadIdx.x + offset][1];
+        }
 
-		__syncthreads();
-	}
+        __syncthreads();
+    }
 
-	if(threadIdx.x == 0) {
-		res[blockIdx.x][0] = my_dot2[0][0];
-		res[blockIdx.x][1] = my_dot2[0][1];
-	}
+    if(threadIdx.x == 0) {
+        res[blockIdx.x][0] = my_dot2[0][0];
+        res[blockIdx.x][1] = my_dot2[0][1];
+    }
 
 
 #else
-	extern __shared__ rvec2 sdot2[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec2 x;
-	x[0] = x[1] = 0;
-
-	if(i < n) {
-		x[0] = a[i][0] * b[i][0];
-		x[1] = a[i][1] * b[i][1];
-	}
-
-	sdot2[threadIdx.x][0] = x[0];
-	sdot2[threadIdx.x][1] = x[1];
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdot2[threadIdx.x][0] += sdot2[threadIdx.x + offset][0];
-			sdot2[threadIdx.x][1] += sdot2[threadIdx.x + offset][1];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0) {
-		res[blockIdx.x][0] = sdot2[0][0];
-		res[blockIdx.x][1] = sdot2[0][1];
-	}
+    extern __shared__ rvec2 sdot2[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec2 x;
+    x[0] = x[1] = 0;
+
+    if(i < n) {
+        x[0] = a[i][0] * b[i][0];
+        x[1] = a[i][1] * b[i][1];
+    }
+
+    sdot2[threadIdx.x][0] = x[0];
+    sdot2[threadIdx.x][1] = x[1];
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdot2[threadIdx.x][0] += sdot2[threadIdx.x + offset][0];
+            sdot2[threadIdx.x][1] += sdot2[threadIdx.x + offset][1];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0) {
+        res[blockIdx.x][0] = sdot2[0][0];
+        res[blockIdx.x][1] = sdot2[0][1];
+    }
 #endif
 }
 
@@ -515,37 +515,37 @@ CUDA_GLOBAL void k_dot_rvec2 (const rvec2 *a, rvec2 *b, rvec2 *res, const size_t
 
 CUDA_GLOBAL void k_vector_sum( real* dest, real c, real* v, real d, real* y, int k )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= k) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= k) return;
 
-	dest[i] = c * v[i] + d * y[i];
+    dest[i] = c * v[i] + d * y[i];
 }
 
 
 CUDA_GLOBAL void k_vector_mul( real* dest, real* v, real* y, int k )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= k) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= k) return;
 
-	dest[i] = v[i] * y[i];
+    dest[i] = v[i] * y[i];
 }
 
 CUDA_GLOBAL void k_rvec2_mul( rvec2* dest, rvec2* v, rvec2* y, int k )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= k) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= k) return;
 
-	dest[i][0] = v[i][0] * y[i][0];
-	dest[i][1] = v[i][1] * y[i][1];
+    dest[i][0] = v[i][0] * y[i][0];
+    dest[i][1] = v[i][1] * y[i][1];
 }
 
 CUDA_GLOBAL void k_rvec2_pbetad (rvec2 *dest, rvec2 *a, 
-		real beta0, real beta1, 
-		rvec2 *b, int n)
+        real beta0, real beta1, 
+        rvec2 *b, int n)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n) return;
 
-	dest[i][0] = a[i][0] + beta0 * b[i][0];
-	dest[i][1] = a[i][1] + beta1 * b[i][1];
+    dest[i][0] = a[i][0] + beta0 * b[i][0];
+    dest[i][1] = a[i][1] + beta1 * b[i][1];
 }
diff --git a/PG-PuReMD/src/validation.cu b/PG-PuReMD/src/validation.cu
index f3b74862..2bffc36f 100644
--- a/PG-PuReMD/src/validation.cu
+++ b/PG-PuReMD/src/validation.cu
@@ -9,1454 +9,1454 @@
 
 bool check_zero (real p1, real p2)
 {
-	if (abs (p1 - p2) >= GPU_TOLERANCE)
-		return true;
-	else 
-		return false;
+    if (abs (p1 - p2) >= GPU_TOLERANCE)
+        return true;
+    else 
+        return false;
 }
 
 bool check_zero (rvec p1, rvec p2)
 {
 
-	if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
-			((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) ||
-			((abs (p1[2] - p2[2])) >= GPU_TOLERANCE ))
-		return true;
-	else return false;
+    if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
+            ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) ||
+            ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE ))
+        return true;
+    else return false;
 }
 
 bool check_zero_rvec2 (rvec2 p1, rvec2 p2)
 {
 
-	if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
-			((abs (p1[1] - p2[1])) >= GPU_TOLERANCE ))
-		return true;
-	else return false;
+    if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
+            ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE ))
+        return true;
+    else return false;
 }
 
 bool check_same (ivec p1, ivec p2)
 {
-	if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) )
-		return true;
-	else 
-		return false;
+    if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) )
+        return true;
+    else 
+        return false;
 }
 
 void print_bond_data (bond_order_data *s)
 {
-	/*   
-	     fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", 
-	     s->BO, 
-	     s->BO_s, 
-	     s->BO_pi,
-	     s->BO_pi2 );
-	 */
-	fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
-	fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
-	fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
+    /*   
+         fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", 
+         s->BO, 
+         s->BO_s, 
+         s->BO_pi,
+         s->BO_pi2 );
+     */
+    fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
+    fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
+    fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
 }
 
 
 int validate_neighbors (reax_system *system, reax_list **lists)
 {
-	reax_list *far_nbrs = *lists + FAR_NBRS;
-	reax_list *d_nbrs = *dev_lists + FAR_NBRS;
-	far_neighbor_data gpu, cpu;
-	int index, count, jicount;
-	int hostcount, dijcount, djicount;
-	int i;
-
-	int *end = (int *)malloc (sizeof (int) * system->N);
-	int *start = (int *) malloc (sizeof (int) * system->N );
-
-	copy_host_device (start, d_nbrs->index, 
-			sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:index");
-	copy_host_device (end, d_nbrs->end_index, 
-			sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:end_index");
-
-	far_neighbor_data *data = (far_neighbor_data *) 
-		malloc (sizeof (far_neighbor_data)* d_nbrs->num_intrs);
-	copy_host_device (data, d_nbrs->select.far_nbr_list, 
-			sizeof (far_neighbor_data) * d_nbrs->num_intrs, cudaMemcpyDeviceToHost, "far_nbr_list");
-
-	hostcount = dijcount = djicount = 0;
-
-	for (i= 0; i < system->N-1; i++){
-		if (end [i] > start [i+1])
-		{
-			fprintf (stderr, " Far Neighbors index over write  @ index %d (%d, %d) and (%d %d)\n", 
-					i, start[i], end[i], start[i+1], end[i+1]);
-			return FAILURE;
-		}
-		hostcount += end[i] - start[i];
-	}
-	hostcount += end[i] - start[i];
-	fprintf (stderr, "Total Neighbors count: %d \n", hostcount);
-	hostcount = 0;
-
-	return 0;
-
-	/*
-	   for (int i = 0; i < 2; i++) {
-	   for (int j = start[i]; j < end[i]; j++){
-	   gpu = data[j];
-	   fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr,
-	   data[j].d,
-	   data[j].rel_box[0],
-	   data[j].rel_box[1],
-	   data[j].rel_box[2],
-	   data[j].dvec[0],
-	   data[j].dvec[1],
-	   data[j].dvec[2], 
-	   j
-	   );
-	   }
-	   }
-
-	   return SUCCESS;
-	 */
-
-	for (int i = 0; i < system->N; i++){
-		index = Start_Index (i, far_nbrs);
-
-		for (int j = start[i]; j < end[i]; j++){
-
-
-			if (i > data[j].nbr) {
-
-				int src = data[j].nbr;
-				int dest = i;
-				int x;
-
-
-				for (x = start[src]; x < end[src]; x++) {
-					if (data[x].nbr != dest) continue;
-
-					gpu = data[x];
-					cpu = data[j];
-
-					if (  (gpu.d != cpu.d) ||
-							(cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-							(cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-						fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr,
-								data[j].d,
-								data[j].rel_box[0],
-								data[j].rel_box[1],
-								data[j].rel_box[2],
-								data[j].dvec[0],
-								data[j].dvec[1],
-								data[j].dvec[2], 
-								j
-							);
-						fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) - %d \n", data[j].nbr, data[x].nbr,
-								data[x].d,
-								data[x].rel_box[0],
-								data[x].rel_box[1],
-								data[x].rel_box[2],
-								data[x].dvec[0],
-								data[x].dvec[1],
-								data[x].dvec[2], 
-								x
-							);
-						jicount++;
-
-						fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n");
-						exit (-1);
-					}
-					djicount ++;
-					break;
-				}
-
-				if (x >= end[src]) {
-					fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
-					exit (-1);
-				}
-				continue;
-			}
-
-			gpu = data[j];
-			cpu = far_nbrs->select.far_nbr_list[index];
-			if (  check_zero (gpu.d, cpu.d) ||
-					(gpu.nbr != cpu.nbr) ||
-					check_zero (cpu.dvec, gpu.dvec) ||
-					!check_same (cpu.rel_box, gpu.rel_box)) {
-
-				fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j );
-				fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index);
-				fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
-				fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
-				fprintf (stdout, "d %f ,  %f \n", cpu.d, data[j].d);
-				fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n",
-						cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
-						gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
-
-				fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n",
-						cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
-						gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
-
-				fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host  **** \n");
-				return FAILURE;
-				count ++;
-			}
-			index ++;
-			hostcount ++;
-			dijcount ++;
-		}
-
-		if (index != End_Index (i, far_nbrs))
-		{
-			fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", 
-					i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs), start[i], end[i]);
-			return FAILURE;
-		}
-	}
-
-	fprintf (stderr, "FAR Neighbors match between device and host host:%d, device:%d dji: %d \n", 
-			hostcount, dijcount, djicount);
-	free (start);
-	free (end);
-	free (data);
-	return SUCCESS;
+    reax_list *far_nbrs = *lists + FAR_NBRS;
+    reax_list *d_nbrs = *dev_lists + FAR_NBRS;
+    far_neighbor_data gpu, cpu;
+    int index, count, jicount;
+    int hostcount, dijcount, djicount;
+    int i;
+
+    int *end = (int *)malloc (sizeof (int) * system->N);
+    int *start = (int *) malloc (sizeof (int) * system->N );
+
+    copy_host_device (start, d_nbrs->index, 
+            sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:index");
+    copy_host_device (end, d_nbrs->end_index, 
+            sizeof (int) * system->N, cudaMemcpyDeviceToHost, "far_nbrs:end_index");
+
+    far_neighbor_data *data = (far_neighbor_data *) 
+        malloc (sizeof (far_neighbor_data)* d_nbrs->num_intrs);
+    copy_host_device (data, d_nbrs->select.far_nbr_list, 
+            sizeof (far_neighbor_data) * d_nbrs->num_intrs, cudaMemcpyDeviceToHost, "far_nbr_list");
+
+    hostcount = dijcount = djicount = 0;
+
+    for (i= 0; i < system->N-1; i++){
+        if (end [i] > start [i+1])
+        {
+            fprintf (stderr, " Far Neighbors index over write  @ index %d (%d, %d) and (%d %d)\n", 
+                    i, start[i], end[i], start[i+1], end[i+1]);
+            return FAILURE;
+        }
+        hostcount += end[i] - start[i];
+    }
+    hostcount += end[i] - start[i];
+    fprintf (stderr, "Total Neighbors count: %d \n", hostcount);
+    hostcount = 0;
+
+    return 0;
+
+    /*
+       for (int i = 0; i < 2; i++) {
+       for (int j = start[i]; j < end[i]; j++){
+       gpu = data[j];
+       fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr,
+       data[j].d,
+       data[j].rel_box[0],
+       data[j].rel_box[1],
+       data[j].rel_box[2],
+       data[j].dvec[0],
+       data[j].dvec[1],
+       data[j].dvec[2], 
+       j
+       );
+       }
+       }
+
+       return SUCCESS;
+     */
+
+    for (int i = 0; i < system->N; i++){
+        index = Start_Index (i, far_nbrs);
+
+        for (int j = start[i]; j < end[i]; j++){
+
+
+            if (i > data[j].nbr) {
+
+                int src = data[j].nbr;
+                int dest = i;
+                int x;
+
+
+                for (x = start[src]; x < end[src]; x++) {
+                    if (data[x].nbr != dest) continue;
+
+                    gpu = data[x];
+                    cpu = data[j];
+
+                    if (  (gpu.d != cpu.d) ||
+                            (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+                            (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+                        fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) - %d \n", i, data[j].nbr,
+                                data[j].d,
+                                data[j].rel_box[0],
+                                data[j].rel_box[1],
+                                data[j].rel_box[2],
+                                data[j].dvec[0],
+                                data[j].dvec[1],
+                                data[j].dvec[2], 
+                                j
+                            );
+                        fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) - %d \n", data[j].nbr, data[x].nbr,
+                                data[x].d,
+                                data[x].rel_box[0],
+                                data[x].rel_box[1],
+                                data[x].rel_box[2],
+                                data[x].dvec[0],
+                                data[x].dvec[1],
+                                data[x].dvec[2], 
+                                x
+                            );
+                        jicount++;
+
+                        fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n");
+                        exit (-1);
+                    }
+                    djicount ++;
+                    break;
+                }
+
+                if (x >= end[src]) {
+                    fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
+                    exit (-1);
+                }
+                continue;
+            }
+
+            gpu = data[j];
+            cpu = far_nbrs->select.far_nbr_list[index];
+            if (  check_zero (gpu.d, cpu.d) ||
+                    (gpu.nbr != cpu.nbr) ||
+                    check_zero (cpu.dvec, gpu.dvec) ||
+                    !check_same (cpu.rel_box, gpu.rel_box)) {
+
+                fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j );
+                fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index);
+                fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
+                fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
+                fprintf (stdout, "d %f ,  %f \n", cpu.d, data[j].d);
+                fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n",
+                        cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
+                        gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
+
+                fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n",
+                        cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
+                        gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
+
+                fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host  **** \n");
+                return FAILURE;
+                count ++;
+            }
+            index ++;
+            hostcount ++;
+            dijcount ++;
+        }
+
+        if (index != End_Index (i, far_nbrs))
+        {
+            fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", 
+                    i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs), start[i], end[i]);
+            return FAILURE;
+        }
+    }
+
+    fprintf (stderr, "FAR Neighbors match between device and host host:%d, device:%d dji: %d \n", 
+            hostcount, dijcount, djicount);
+    free (start);
+    free (end);
+    free (data);
+    return SUCCESS;
 }
 
 
 int validate_sym_dbond_indices (reax_system *system, storage *workspace, reax_list **lists)
 {
-	int start, end, index, count, miscount;
-	int hostcount, devicecount, h, d;
-	int *d_start, *d_end;
-	bond_data *d_bond_data;
-	reax_list *d_bonds = *dev_lists + BONDS;
-	reax_list *bonds = *lists + BONDS;
-
-	d_end = (int *)malloc (sizeof (int) * system->N);
-	d_start = (int *) malloc (sizeof (int) * system->N );
-	d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs);
-	//fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
-
-	copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index");
-	copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index");
-	copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, cudaMemcpyDeviceToHost, "bond_data");
-
-	count = 0; 
-	miscount = 0; 
-	hostcount = 0;
-	devicecount = 0;
-
-	for (int i = 0; i < system->N; i++) {
-		h= End_Index (i, bonds) - Start_Index (i, bonds);
-		d= d_end[i] - d_start[i];
-		//if (h != d) 
-		//	fprintf (stderr, "Count does not match atom:%d, host:%d, device:%d \n", 
-		//					i, h, d);
-		hostcount += h;
-		devicecount += d;
-	}
-	fprintf (stderr, "Bonds count: host: %d device: %d \n", hostcount, devicecount);
-
-	for (int i = 0; i < system->N; i++) {
-
-		for (int j = d_start[i]; j < d_end[i]; j++) {
-			bond_data *src, *tgt;
-			src = &d_bond_data[j];
-
-			tgt = &d_bond_data[ src->sym_index ];   
-
-			if ((src->dbond_index == tgt->dbond_index) )
-				count ++;
-			else
-				miscount ++;
-		}    
-	}
-	fprintf (stderr, "Sym and dbond indexes done count(device) --> %d  (%d)\n", count, miscount);
-
-	count = 0; 
-	miscount = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) {
-			bond_data *src, *tgt;
-			src = &bonds->select.bond_list [j]; 
-
-			tgt = &bonds->select.bond_list [ src->sym_index ]; 
-
-			if ((src->dbond_index == tgt->dbond_index) )
-				count ++;
-			else
-				miscount ++;
-		}    
-	}
-	fprintf (stderr, "Sym and dbond indexes done count (host) --> %d  (%d)\n", count, miscount);
-
-	free (d_end);
-	free (d_start);
-	free (d_bond_data);
-
-	return SUCCESS;
+    int start, end, index, count, miscount; // report-only check of sym_index/dbond_index pairing in the device and host bond lists; start, end and index are never used
+    int hostcount, devicecount, h, d; // h, d: bond count of one atom on the host / on the device
+    int *d_start, *d_end;
+    bond_data *d_bond_data;
+    reax_list *d_bonds = *dev_lists + BONDS; // dev_lists is not a parameter; it is declared outside this function
+    reax_list *bonds = *lists + BONDS;
+
+    d_end = (int *)malloc (sizeof (int) * system->N); // malloc results are used without a NULL check
+    d_start = (int *) malloc (sizeof (int) * system->N );
+    d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs);
+    //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
+
+    copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index"); // fetch the device list (direction: cudaMemcpyDeviceToHost) into the host buffers
+    copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "index"); // note: same "index" label as the copy above although this is end_index
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, cudaMemcpyDeviceToHost, "bond_data");
+
+    count = 0; 
+    miscount = 0; 
+    hostcount = 0;
+    devicecount = 0;
+
+    for (int i = 0; i < system->N; i++) { // pass 1: total bond counts on both sides, printed but not compared
+        h= End_Index (i, bonds) - Start_Index (i, bonds);
+        d= d_end[i] - d_start[i];
+        //if (h != d) 
+        //    fprintf (stderr, "Count does not match atom:%d, host:%d, device:%d \n", 
+        //                    i, h, d);
+        hostcount += h;
+        devicecount += d;
+    }
+    fprintf (stderr, "Bonds count: host: %d device: %d \n", hostcount, devicecount);
+
+    for (int i = 0; i < system->N; i++) { // pass 2: device list
+
+        for (int j = d_start[i]; j < d_end[i]; j++) {
+            bond_data *src, *tgt;
+            src = &d_bond_data[j];
+
+            tgt = &d_bond_data[ src->sym_index ];   // sym_index is used as an index into the copied list without a range check
+
+            if ((src->dbond_index == tgt->dbond_index) ) // a pair counts as matching when both entries carry the same dbond_index
+                count ++;
+            else
+                miscount ++;
+        }    
+    }
+    fprintf (stderr, "Sym and dbond indexes done count(device) --> %d  (%d)\n", count, miscount);
+
+    count = 0; 
+    miscount = 0; 
+    for (int i = 0; i < system->N; i++) { // pass 3: the same pairing check on the host list
+
+        for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) {
+            bond_data *src, *tgt;
+            src = &bonds->select.bond_list [j]; 
+
+            tgt = &bonds->select.bond_list [ src->sym_index ]; 
+
+            if ((src->dbond_index == tgt->dbond_index) )
+                count ++;
+            else
+                miscount ++;
+        }    
+    }
+    fprintf (stderr, "Sym and dbond indexes done count (host) --> %d  (%d)\n", count, miscount);
+
+    free (d_end);
+    free (d_start);
+    free (d_bond_data);
+
+    return SUCCESS; // always SUCCESS: mismatches are only printed to stderr, never returned
 }
 
 
 int validate_sparse_matrix( reax_system *system, storage *workspace )
 {
-	sparse_matrix test;
-	int index, count, total;
-	test.start = (int *) malloc (sizeof (int) * (system->N));
-	test.end = (int *) malloc (sizeof (int) * (system->N));
-
-	test.entries = (sparse_matrix_entry *) malloc 
-		(sizeof (sparse_matrix_entry) * (dev_workspace->H.m));
-
-	memset (test.entries, 0xFF, 
-			sizeof (sparse_matrix_entry) * dev_workspace->H.m);
-	copy_host_device ( test.entries, dev_workspace->H.entries, 
-			sizeof (sparse_matrix_entry)* dev_workspace->H.m, 
-			cudaMemcpyDeviceToHost, "sparse_matrix_entries");
-	copy_host_device ( test.start, dev_workspace->H.start, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "start");
-	copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "end");
-
-	for (int i = 0 ; i < system->N; i++) {
-		if ((test.end[i] >= dev_workspace->H.m)) {
-			fprintf (stderr, " exceeding number of entries for atom: %d \n", i);
-			exit (-1);
-		}
-
-		if (( i < (system->N-1)) && (test.end[i] >= test.start[i+1]))
-		{
-			fprintf (stderr, " Index exceeding for atom : %d \n", i );
-			fprintf (stderr, "end(i): %d \n", test.end[i]);
-			fprintf (stderr, "start(i+1): %d \n", test.start[i+1]);
-			exit (-1);
-		}
-	}
-	fprintf (stderr, "Sparse Matrix Boundary Check PASSED !!!\n");
-
-	//TODO
-	//TODO
-	//TODO
-	return SUCCESS;
-
-	count = 0;
-	for (int i = 0 ; i < system->N; i++) 
-		count += test.end[i] - test.start[i];
-	fprintf (stderr, " Total number of entries : %d \n", count);
-
-	fprintf (stderr, " ALlocated memeory for entries : %d\n", dev_workspace->H.m);
-
-	////////////////////////////
-	//for (int i = workspace->H.start[0]; i < workspace->H.end[0]; i++) {
-	//	fprintf (stderr, "Row: 0, col: %d val: %f \n", workspace->H.entries[i].j, workspace->H.entries[i].val );
-	//}
-	//////////////////////////////
-
-	count = 0;
-	total = 0;
-	for (int i = 0; i < system->n; i++) {
-		for (int j = workspace->H.start[i]; j < workspace->H.end[i]; j++) {
-			sparse_matrix_entry *src = &workspace->H.entries[j];
-
-			for (int k = test.start[i]; k < test.end[i]; k++) {
-				sparse_matrix_entry *tgt = &test.entries [k];
-				if (src->j == tgt->j){
-					if ( check_zero (src->val, tgt->val)) {
-						index = test.start [i];
-						/*
-						   fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", 
-						   test.start[i-1], test.end[i-1], 
-						   workspace->H.start[i-1], workspace->H.start[i]);
-						 */
-						fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", 
-								i, k, test.start[i], test.end[i], 
-								workspace->H.start[i], workspace->H.end[i]);
-						for (int x = workspace->H.start[i]; x < workspace->H.end[i]; x ++)
-						{
-							src = &workspace->H.entries[x];
-							tgt = &test.entries [index];
-							fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index);
-							index ++;
-						}
-						fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n");
-						exit (-1);
-						count++;
-					} else 
-					{
-						total ++;
-						if (i == tgt->j)  continue;
-						//if (tgt->j >= system->n) continue;
-
-						//success case here. check for row - k and column i;
-						for (int x = test.start[tgt->j]; x < test.end[tgt->j]; x++){
-							sparse_matrix_entry *rtgt = &test.entries [x];
-							if (i == rtgt->j) {
-								if (check_zero (tgt->val, rtgt->val)) {
-									fprintf (stderr, "symmetric entry not matching for (%d, %d) \n", i, tgt->j);
-									fprintf (stderr, "row: %d col: %d val: %f \n", i, tgt->j, tgt->val);
-									fprintf (stderr, "row: %d col: %d val: %f \n", tgt->j, rtgt->j, rtgt->val);
-									exit (-1);
-								} else {
-									total ++;
-									break;
-								}
-							}
-						}
-					}
-				}
-			}
-		}
-	}
-
-	fprintf (stderr, "Sparse Matrix mismatch total: %d, miscount %d  \n", total, count);
-	free (test.start);
-	free (test.end);
-	free (test.entries);
-	return SUCCESS;
+    sparse_matrix test; // host-side copy of dev_workspace->H (dev_workspace is declared outside this function)
+    int index, count, total;
+    test.start = (int *) malloc (sizeof (int) * (system->N)); // malloc results are used without a NULL check
+    test.end = (int *) malloc (sizeof (int) * (system->N));
+
+    test.entries = (sparse_matrix_entry *) malloc 
+        (sizeof (sparse_matrix_entry) * (dev_workspace->H.m));
+
+    memset (test.entries, 0xFF, 
+            sizeof (sparse_matrix_entry) * dev_workspace->H.m); // every byte set to 0xFF before the copy; presumably to make untouched entries recognizable - confirm
+    copy_host_device ( test.entries, dev_workspace->H.entries, 
+            sizeof (sparse_matrix_entry)* dev_workspace->H.m, 
+            cudaMemcpyDeviceToHost, "sparse_matrix_entries");
+    copy_host_device ( test.start, dev_workspace->H.start, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "start");
+    copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (system->N), cudaMemcpyDeviceToHost, "end");
+
+    for (int i = 0 ; i < system->N; i++) { // boundary check: exits the process on the first violation
+        if ((test.end[i] >= dev_workspace->H.m)) { // row end must stay below the H.m allocated entries
+            fprintf (stderr, " exceeding number of entries for atom: %d \n", i);
+            exit (-1);
+        }
+
+        if (( i < (system->N-1)) && (test.end[i] >= test.start[i+1])) // row end must stay below the start of the next row
+        {
+            fprintf (stderr, " Index exceeding for atom : %d \n", i );
+            fprintf (stderr, "end(i): %d \n", test.end[i]);
+            fprintf (stderr, "start(i+1): %d \n", test.start[i+1]);
+            exit (-1);
+        }
+    }
+    fprintf (stderr, "Sparse Matrix Boundary Check PASSED !!!\n");
+
+    free (test.start);   // release the host copies before the early return below,
+    free (test.end);     // otherwise every call leaks all three buffers
+    free (test.entries);
+    return SUCCESS; //TODO: the entry-by-entry comparison below is disabled; everything after this line is unreachable
+
+    count = 0;
+    for (int i = 0 ; i < system->N; i++) 
+        count += test.end[i] - test.start[i];
+    fprintf (stderr, " Total number of entries : %d \n", count);
+
+    fprintf (stderr, " ALlocated memeory for entries : %d\n", dev_workspace->H.m);
+
+    ////////////////////////////
+    //for (int i = workspace->H.start[0]; i < workspace->H.end[0]; i++) {
+    //    fprintf (stderr, "Row: 0, col: %d val: %f \n", workspace->H.entries[i].j, workspace->H.entries[i].val );
+    //}
+    //////////////////////////////
+
+    count = 0;
+    total = 0;
+    for (int i = 0; i < system->n; i++) { // for each host entry of row i, look for the device entry with the same column j
+        for (int j = workspace->H.start[i]; j < workspace->H.end[i]; j++) {
+            sparse_matrix_entry *src = &workspace->H.entries[j];
+
+            for (int k = test.start[i]; k < test.end[i]; k++) {
+                sparse_matrix_entry *tgt = &test.entries [k];
+                if (src->j == tgt->j){
+                    if ( check_zero (src->val, tgt->val)) { // check_zero is treated as "values differ" here: this branch dumps the row and exits
+                        index = test.start [i];
+                        /*
+                           fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", 
+                           test.start[i-1], test.end[i-1], 
+                           workspace->H.start[i-1], workspace->H.start[i]);
+                         */
+                        fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", 
+                                i, k, test.start[i], test.end[i], 
+                                workspace->H.start[i], workspace->H.end[i]);
+                        for (int x = workspace->H.start[i]; x < workspace->H.end[i]; x ++)
+                        {
+                            src = &workspace->H.entries[x];
+                            tgt = &test.entries [index];
+                            fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index);
+                            index ++;
+                        }
+                        fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n");
+                        exit (-1);
+                        count++;
+                    } else 
+                    {
+                        total ++;
+                        if (i == tgt->j)  continue;
+                        //if (tgt->j >= system->n) continue;
+
+                        //success case here. check for row - k and column i;
+                        for (int x = test.start[tgt->j]; x < test.end[tgt->j]; x++){
+                            sparse_matrix_entry *rtgt = &test.entries [x];
+                            if (i == rtgt->j) {
+                                if (check_zero (tgt->val, rtgt->val)) {
+                                    fprintf (stderr, "symmetric entry not matching for (%d, %d) \n", i, tgt->j);
+                                    fprintf (stderr, "row: %d col: %d val: %f \n", i, tgt->j, tgt->val);
+                                    fprintf (stderr, "row: %d col: %d val: %f \n", tgt->j, rtgt->j, rtgt->val);
+                                    exit (-1);
+                                } else {
+                                    total ++;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    fprintf (stderr, "Sparse Matrix mismatch total: %d, miscount %d  \n", total, count);
+    free (test.start);
+    free (test.end);
+    free (test.entries);
+    return SUCCESS;
 }
 
 bool print_hbonds (int *d_start, int *d_end, int i, hbond_data *data)
 {
-	hbond_data src, tgt; 
+    /* Debug helper: prints entries d_start[i] .. d_end[i]-1 of data for atom i to stderr; always returns true. */
 
-	fprintf (stderr, " start %d end %d count ---> %d \n", d_start[i], d_end[i], d_end[i] - d_start[i]);   
+    fprintf (stderr, " start %d end %d count ---> %d \n", d_start[i], d_end[i], d_end[i] - d_start[i]);   
 
-	for (int j = d_start[i]; j < d_end[i]; j++) 
-		fprintf (stderr, "Atom : %d , Hbond Info . nbr: %d scl: %d index:%d\n", i, data[j].nbr, data[j].scl);
-	fprintf (stderr, " ========================================= \n");
+    for (int j = d_start[i]; j < d_end[i]; j++) fprintf (stderr, "Atom : %d , Hbond Info . nbr: %d scl: %d index:%d\n", i, data[j].nbr, data[j].scl, j); /* j supplies the fourth %d, which previously had no argument */
+    fprintf (stderr, " ========================================= \n");
+    return true; /* the function is declared bool; flowing off the end without a value is undefined behaviour in C++ */
 }
 
 
 int validate_hbonds (reax_system *system, storage *workspace, reax_list **lists)
 {
-	int count, nbr, sym_count, dev_count;
-	int *d_start, *d_end, index, d_index;
-	hbond_data *data, src, tgt;
-	reax_list *d_hbonds = *dev_lists + HBONDS;
-	reax_list *hbonds = *lists + HBONDS;
-
-	d_end = (int *)malloc (sizeof (int)* d_hbonds->n);
-	d_start = (int *) malloc (sizeof (int) * d_hbonds->n );
-	fprintf (stderr, "Total index values: %d \n", d_hbonds->n);
-
-	copy_host_device (d_start, d_hbonds->index, sizeof (int)* d_hbonds->n, cudaMemcpyDeviceToHost, "start");
-	copy_host_device (d_end, d_hbonds->end_index, sizeof (int) * d_hbonds->n, cudaMemcpyDeviceToHost, "end");
-
-	//fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds);
-	data = (hbond_data *) malloc (sizeof (hbond_data) * d_hbonds->num_intrs);
-	copy_host_device (data, d_hbonds->select.hbond_list, sizeof (hbond_data) * d_hbonds->num_intrs, 
-			cudaMemcpyDeviceToHost, "hbond_data");
-
-	count = 0;
-	dev_count = 0;
-	sym_count = 0;
-	for (int i = 0; i < system->n; i++) {
-
-		if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == 1 )
-		{
-			count += End_Index (i, hbonds) - Start_Index (i, hbonds);
-			dev_count += d_end [i] - d_start[i];
-
-			if ((d_end[ i] - d_start[i])    !=
-					(End_Index (i, hbonds) - Start_Index (i, hbonds))) {
-				fprintf (stderr, "%d %d - d(%d  %d) c(%d %d) \n",i, i,
-						d_start[i], d_end[ i],
-						Start_Index (i, hbonds),
-						End_Index (i, hbonds) );
-				print_hbonds (d_start, d_end, i, data);
-				print_hbonds (hbonds->index, hbonds->end_index, i, hbonds->select.hbond_list);
-				exit (-1);
-			}
-		}
-		else {
-			sym_count += d_end[ i] - d_start[i];
-		}
-	}
-	fprintf (stderr, "hbonds count match between host: %d and device: %d (%d) \n", count,dev_count, sym_count);
-	sym_count = 0;
-
-	for (int i = system->n; i < system->N; i++) {
-		//if (system->reax_param.sbp[ system->my_atoms[i].type].p_hbond == 2)
-		{
-			sym_count += d_end[i] - d_start[i];
-		}
-	}
-	fprintf (stderr, "Sym count outside 'n' : %d \n", sym_count );
-	//print_hbonds (d_start, d_end, 0, data);
-
-
-	count = 0;
-	for (int i = 0; i < system->n; i++) {
-
-		d_index = i; 
-		/*
-		   fprintf (stderr, " Count cpu %d gpu %d \n", 
-		   End_Index (workspace->hbond_index[i], hbonds) - index, 
-		   d_end[d_index] - d_start[d_index]);
-		 */
-
-		if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond != 1 )
-		{
-			/*
-			   int x;
-			   for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
-			   {
-			   tgt = data [j];
-			   nbr = tgt.nbr;
-			   for (x = d_start[nbr]; x < d_end[nbr]; x++) 
-			   {
-			   src = data [x];
-			   if (src.nbr == i) {
-			   break;
-			   }
-			   }
-			   if (x >= d_end[nbr]) {
-			   fprintf (stderr, "HBONDS is NOT SYMMETRIC \n");
-			   fprintf (stderr, "Atom: %d, nbr: %d (%d)\n", i, nbr);
-			   fprintf (stderr, "Atom: %d, start: %d end: %d \n", nbr, d_start[nbr], d_end[nbr]);
-			   for (x = d_start[nbr]; x < d_end[nbr]; x++) 
-			   {
-			   src = data [x];
-			   fprintf (stderr, "Atom: %d, nbr: %d \n", nbr, src.nbr);
-			   }
-
-			   exit (1);
-			   }
-			   }
-			 */
-
-			for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
-			{
-				tgt = data[j];
-				nbr = tgt.sym_index;
-
-				if (nbr >= d_hbonds->num_intrs || nbr < 0){
-					fprintf (stderr, "Index out of range for atom: %d sym_index:%d Hbond index: %d, nbr: %d\n", i, nbr, j, data[j].nbr);
-					fprintf (stderr, "atom type: %d \n", system->reax_param.sbp[ system->my_atoms [ data[j].nbr ].type].p_hbond);
-					exit (1);
-				}
-
-				if (data[nbr].sym_index != j) {
-					fprintf (stderr, "Sym Index for hydrogen bonds does not match \n");
-					exit (1);
-				}
-			}
-			continue;
-		}
-
-		for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
-		{
-			tgt = data[j];
-
-			int k = 0;
-			for (k = Start_Index (i, hbonds);
-					k < End_Index (i, hbonds); k++) {
-				src = hbonds->select.hbond_list[k];
-
-				if ((src.nbr == tgt.nbr) && (src.scl == tgt.scl)) {
-					/*
-					   fprintf (stderr, "Mismatch  at atom %d index %d (%d %d) -- (%d %d) \n", i, k,
-					   src.nbr, src.scl, 
-					   tgt.nbr, tgt.scl);
-					 */
-					count ++;
-					break;
-				}
-			}
-
-			/*
-			   if (  ((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && 
-			   (k >= End_Index (workspace->hbond_index[i], hbonds))) {
-			   fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index );
-			   exit (-1);
-			   }
-			 */
-
-			if ( k >= (End_Index (i, hbonds) )){
-				fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j);
-				fprintf (stderr, " ==========Host============ \n");
-				print_hbonds (hbonds->index, hbonds->end_index, 
-						i, hbonds->select.hbond_list);
-				fprintf (stderr, " ==========Device============ \n");
-				print_hbonds (d_start, d_end, 
-						i, data);
-				exit (-1);
-			}
-		}
-
-		if ((End_Index (i, hbonds)- Start_Index(i, hbonds)) != (d_end[i] - d_start[i])){
-			fprintf (stderr, "End index does not match between device and host \n");
-			fprintf (stderr, " Atom: %d Host: %d %d \n", i, Start_Index (i, hbonds), End_Index (i, hbonds));
-			fprintf (stderr, " Device: %d %d \n", d_start[i], d_end[i]);
-			exit (-1);
-		}
-	}
-
-	fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count);
-
-	free (d_start);
-	free (d_end);
-	free (data);
-	return SUCCESS;
+    int count, nbr, sym_count, dev_count; // compares the device hbond list (copied to the host) with the host list; any mismatch prints diagnostics and exits the process
+    int *d_start, *d_end, index, d_index; // index is never used; d_index is always set equal to the loop atom i
+    hbond_data *data, src, tgt;
+    reax_list *d_hbonds = *dev_lists + HBONDS; // dev_lists is not a parameter; it is declared outside this function
+    reax_list *hbonds = *lists + HBONDS;
+
+    d_end = (int *)malloc (sizeof (int)* d_hbonds->n); // NOTE(review): sized by d_hbonds->n but indexed up to system->N - 1 further down - confirm d_hbonds->n >= system->N
+    d_start = (int *) malloc (sizeof (int) * d_hbonds->n );
+    fprintf (stderr, "Total index values: %d \n", d_hbonds->n);
+
+    copy_host_device (d_start, d_hbonds->index, sizeof (int)* d_hbonds->n, cudaMemcpyDeviceToHost, "start");
+    copy_host_device (d_end, d_hbonds->end_index, sizeof (int) * d_hbonds->n, cudaMemcpyDeviceToHost, "end");
+
+    //fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds);
+    data = (hbond_data *) malloc (sizeof (hbond_data) * d_hbonds->num_intrs);
+    copy_host_device (data, d_hbonds->select.hbond_list, sizeof (hbond_data) * d_hbonds->num_intrs, 
+            cudaMemcpyDeviceToHost, "hbond_data");
+
+    count = 0;
+    dev_count = 0;
+    sym_count = 0;
+    for (int i = 0; i < system->n; i++) { // pass 1: per-atom entry counts for the first n atoms
+
+        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond == 1 )
+        {
+            count += End_Index (i, hbonds) - Start_Index (i, hbonds);
+            dev_count += d_end [i] - d_start[i];
+
+            if ((d_end[ i] - d_start[i])    !=
+                    (End_Index (i, hbonds) - Start_Index (i, hbonds))) { // host and device counts must agree for these atoms
+                fprintf (stderr, "%d %d - d(%d  %d) c(%d %d) \n",i, i,
+                        d_start[i], d_end[ i],
+                        Start_Index (i, hbonds),
+                        End_Index (i, hbonds) );
+                print_hbonds (d_start, d_end, i, data);
+                print_hbonds (hbonds->index, hbonds->end_index, i, hbonds->select.hbond_list);
+                exit (-1);
+            }
+        }
+        else {
+            sym_count += d_end[ i] - d_start[i]; // atoms with p_hbond != 1: device entries are only tallied, not compared
+        }
+    }
+    fprintf (stderr, "hbonds count match between host: %d and device: %d (%d) \n", count,dev_count, sym_count);
+    sym_count = 0;
+
+    for (int i = system->n; i < system->N; i++) { // pass 2: device entries of atoms n .. N-1, reported only
+        //if (system->reax_param.sbp[ system->my_atoms[i].type].p_hbond == 2)
+        {
+            sym_count += d_end[i] - d_start[i];
+        }
+    }
+    fprintf (stderr, "Sym count outside 'n' : %d \n", sym_count );
+    //print_hbonds (d_start, d_end, 0, data);
+
+
+    count = 0;
+    for (int i = 0; i < system->n; i++) { // pass 3: entry-level checks for the first n atoms
+
+        d_index = i; 
+        /*
+           fprintf (stderr, " Count cpu %d gpu %d \n", 
+           End_Index (workspace->hbond_index[i], hbonds) - index, 
+           d_end[d_index] - d_start[d_index]);
+         */
+
+        if ( system->reax_param.sbp[ system->my_atoms[i].type ].p_hbond != 1 ) // these atoms get the sym_index consistency check only, then skip to the next atom
+        {
+            /*
+               int x;
+               for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
+               {
+               tgt = data [j];
+               nbr = tgt.nbr;
+               for (x = d_start[nbr]; x < d_end[nbr]; x++) 
+               {
+               src = data [x];
+               if (src.nbr == i) {
+               break;
+               }
+               }
+               if (x >= d_end[nbr]) {
+               fprintf (stderr, "HBONDS is NOT SYMMETRIC \n");
+               fprintf (stderr, "Atom: %d, nbr: %d (%d)\n", i, nbr);
+               fprintf (stderr, "Atom: %d, start: %d end: %d \n", nbr, d_start[nbr], d_end[nbr]);
+               for (x = d_start[nbr]; x < d_end[nbr]; x++) 
+               {
+               src = data [x];
+               fprintf (stderr, "Atom: %d, nbr: %d \n", nbr, src.nbr);
+               }
+
+               exit (1);
+               }
+               }
+             */
+
+            for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
+            {
+                tgt = data[j];
+                nbr = tgt.sym_index; // despite its name, nbr holds a list index (sym_index) in this loop
+
+                if (nbr >= d_hbonds->num_intrs || nbr < 0){ // range check before data[nbr] is read below
+                    fprintf (stderr, "Index out of range for atom: %d sym_index:%d Hbond index: %d, nbr: %d\n", i, nbr, j, data[j].nbr);
+                    fprintf (stderr, "atom type: %d \n", system->reax_param.sbp[ system->my_atoms [ data[j].nbr ].type].p_hbond);
+                    exit (1);
+                }
+
+                if (data[nbr].sym_index != j) { // the entry at sym_index must point back to j
+                    fprintf (stderr, "Sym Index for hydrogen bonds does not match \n");
+                    exit (1);
+                }
+            }
+            continue;
+        }
+
+        for (int j = d_start[d_index]; j < d_end[d_index]; j++ ) // p_hbond == 1: every device entry needs a host entry with the same nbr and scl
+        {
+            tgt = data[j];
+
+            int k = 0;
+            for (k = Start_Index (i, hbonds);
+                    k < End_Index (i, hbonds); k++) {
+                src = hbonds->select.hbond_list[k];
+
+                if ((src.nbr == tgt.nbr) && (src.scl == tgt.scl)) {
+                    /*
+                       fprintf (stderr, "Mismatch  at atom %d index %d (%d %d) -- (%d %d) \n", i, k,
+                       src.nbr, src.scl, 
+                       tgt.nbr, tgt.scl);
+                     */
+                    count ++;
+                    break;
+                }
+            }
+
+            /*
+               if (  ((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && 
+               (k >= End_Index (workspace->hbond_index[i], hbonds))) {
+               fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index );
+               exit (-1);
+               }
+             */
+
+            if ( k >= (End_Index (i, hbonds) )){ // the search above ran off the end of the host range: no matching host entry
+                fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j);
+                fprintf (stderr, " ==========Host============ \n");
+                print_hbonds (hbonds->index, hbonds->end_index, 
+                        i, hbonds->select.hbond_list);
+                fprintf (stderr, " ==========Device============ \n");
+                print_hbonds (d_start, d_end, 
+                        i, data);
+                exit (-1);
+            }
+        }
+
+        if ((End_Index (i, hbonds)- Start_Index(i, hbonds)) != (d_end[i] - d_start[i])){ // same count comparison as in pass 1, repeated per atom
+            fprintf (stderr, "End index does not match between device and host \n");
+            fprintf (stderr, " Atom: %d Host: %d %d \n", i, Start_Index (i, hbonds), End_Index (i, hbonds));
+            fprintf (stderr, " Device: %d %d \n", d_start[i], d_end[i]);
+            exit (-1);
+        }
+    }
+
+    fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count);
+
+    free (d_start);
+    free (d_end);
+    free (data);
+    return SUCCESS; // reached only when every check passed; all failure paths call exit
 }
 
 int validate_bonds (reax_system *system, storage *workspace, reax_list **lists)
 {
-	int start, end, index, count, miscount;
-	int *d_start, *d_end;
-	bond_data *d_bond_data;
-	reax_list *d_bonds = *dev_lists + BONDS;
-	reax_list *bonds = *lists + BONDS;
-
-	d_end = (int *)malloc (sizeof (int) * system->N);
-	d_start = (int *) malloc (sizeof (int) * system->N );
-	d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs);
-	//fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
-
-	copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "start");
-	copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "end");
-	copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, 
-			cudaMemcpyDeviceToHost, "bond_data");
-
-	count = 0;
-	for (int i = 0; i < system->N; i++) {
-		start = Start_Index (i, bonds);
-		end = End_Index (i, bonds);
-
-		count += end - start;
-		if ((end-start) != (d_end[i]-d_start[i])){
-			fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n",
-					i, start, end, d_start[i], d_end[i]);
-			exit (-1);
-		}
-
-	}
-	fprintf (stderr, "BOND LIST COUNT match on device and host  count %d \n", count);
-
-	for (int i = 0; i < system->N-1; i++) {
-		if ( d_end[i] >= d_start[i+1] ){
-			fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i);
-			exit (-1);
-		}
-	}
-	//fprintf (stderr, " BOND LIST Overwrite *PASSED* \n");
-	count = 0;
-	miscount = 0;
-	for (int i = 0; i < system->N; i++) {
-
-		for (int j = d_start[i]; j < d_end[i]; j++) {
-			bond_data *src, *tgt;
-			src = &d_bond_data[j];
-			bond_data *src_sym = & d_bond_data[ src->sym_index ];
-
-			//Previously this was commented out. Thats why it was working.
-			//if (i >= src->nbr) continue;
-
-			int k = 0;
-			for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) {
-				tgt = & (bonds->select.bond_list[k]);
-				bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]);
-
-				if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) &&
-						!check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) {
-
-					bond_order_data *s, *t;
-					s = &(src->bo_data);
-					t = &(tgt->bo_data);
-
-					if (  !check_zero (s->BO,t->BO) &&
-							!check_zero (s->BO_s,t->BO_s) &&
-							!check_zero(s->BO_pi,t->BO_pi)  &&
-							!check_zero (s->BO_pi2,t->BO_pi2) &&
-							!check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) &&
-							!check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) &&
-							!check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) &&
-							!check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) &&
-							!check_zero (s->dln_BOp_s, t->dln_BOp_s ) &&
-							!check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) &&
-							!check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) &&
-							!check_zero (s->dBOp, t->dBOp )) {
-						count ++;
-
-						//Check the sym index and dbond index here for double checking
-						// bond_ij on both device and hosts are matched now. 
-						/*
-						   bond_order_data *ss, *ts;
-						   ss = & (src_sym->bo_data );
-						   ts = & (tgt_sym->bo_data );
-
-						   if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) ||
-						   check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box)
-						   || check_zero (ss->Cdbo, ts->Cdbo)){
-
-						   fprintf (stderr, " Sym Index information does not match for atom %d \n", i);
-						   fprintf (stderr, " atom --> %d \n", i);
-						   fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr );
-						   fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d );
-						   fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr );
-						   fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n",
-						   src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2],
-						   tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] );
-						   fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n",
-						   src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2],
-						   tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] );
-
-						   fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo );
-						   exit (-1);
-						   }
-						 */
-
-						break;
-					}
-					fprintf (stderr, " d --> %f %f \n", src->d, tgt->d );
-					fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n",
-							src->dvec[0], src->dvec[1], src->dvec[2],
-							tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] );
-					fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n",
-							src->rel_box[0], src->rel_box[1], src->rel_box[2],
-							tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] );
-
-					fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i,
-							src->nbr, tgt->nbr,
-							s->BO, t->BO,
-							s->BO_s, t->BO_s,
-							s->BO_pi, t->BO_pi,
-							s->BO_pi2, t->BO_pi2
-						);
-					fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2],
-							t->dBOp[0], t->dBOp[1], t->dBOp[2] );
-
-					fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo );
-					fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi );
-					fprintf (stderr, " Cdbopi2 (%e %e) \n", s->Cdbopi2,t->Cdbopi2 );
-					fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo );
-					fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi);
-					fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2);
-					fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n",
-							s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2],
-							t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] );
-					fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n",
-							s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2],
-							t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] );
-					fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n",
-							s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2],
-							t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] );
-
-					//miscount ++;
-					//break;
-					exit (-1);
-				}
-			}
-
-			if (k >= End_Index (i, bonds)) {
-				miscount ++;
-				fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j);
-				exit (-1);
-			}
-		}
-	}
-
-	fprintf (stderr, " BONDS matched count %d miscount %d (%d) \n", count, miscount, (count+miscount));
-	free (d_start);
-	free (d_end);
-	free (d_bond_data);
-	return SUCCESS;
+    int start, end, index, count, miscount; // NOTE(review): 'index' is never used in this function
+    int *d_start, *d_end; // host buffers that receive the device list's index / end_index arrays
+    bond_data *d_bond_data; // host buffer that receives the device list's bond entries
+    reax_list *d_bonds = *dev_lists + BONDS; // dev_lists is not a parameter; it is declared outside this function
+    reax_list *bonds = *lists + BONDS; // host-side BONDS list; the 'workspace' parameter is not referenced below
+    // Purpose: cross-check the device BONDS list against the host BONDS list; exits the process on the first mismatch, otherwise returns SUCCESS.
+    d_end = (int *)malloc (sizeof (int) * system->N); // NOTE(review): none of the three malloc results is checked for NULL
+    d_start = (int *) malloc (sizeof (int) * system->N );
+    d_bond_data = (bond_data *) malloc (sizeof (bond_data) * d_bonds->num_intrs);
+    //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
+    // Copy the device list's index, end_index and bond entries into the host buffers allocated above.
+    copy_host_device (d_start, d_bonds->index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "start");
+    copy_host_device (d_end, d_bonds->end_index, sizeof (int) * system->N, cudaMemcpyDeviceToHost, "end");
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) * d_bonds->num_intrs, 
+            cudaMemcpyDeviceToHost, "bond_data");
+    // Pass 1: every atom must own the same number of bond entries on the host and on the device.
+    count = 0;
+    for (int i = 0; i < system->N; i++) {
+        start = Start_Index (i, bonds);
+        end = End_Index (i, bonds);
+
+        count += end - start; // running total of host-side entries, reported after the loop
+        if ((end-start) != (d_end[i]-d_start[i])){
+            fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n",
+                    i, start, end, d_start[i], d_end[i]);
+            exit (-1);
+        }
+
+    }
+    fprintf (stderr, "BOND LIST COUNT match on device and host  count %d \n", count);
+    // Pass 2: fail if an atom's device range ends at or after the start of the next atom's device range.
+    for (int i = 0; i < system->N-1; i++) {
+        if ( d_end[i] >= d_start[i+1] ){
+            fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i);
+            exit (-1);
+        }
+    }
+    //fprintf (stderr, " BOND LIST Overwrite *PASSED* \n");
+    count = 0; // Pass 3: count device entries that find a fully matching host entry of the same atom
+    miscount = 0;
+    for (int i = 0; i < system->N; i++) {
+
+        for (int j = d_start[i]; j < d_end[i]; j++) {
+            bond_data *src, *tgt; // src: device entry (from the host copy); tgt: host entry being compared
+            src = &d_bond_data[j];
+            bond_data *src_sym = & d_bond_data[ src->sym_index ]; // only referenced by the commented-out check below
+
+            // Previously this was commented out. That's why it was working.
+            //if (i >= src->nbr) continue;
+
+            int k = 0; // declared outside the loop so the "no match" test after the loop can read it
+            for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) {
+                tgt = & (bonds->select.bond_list[k]);
+                bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]); // only referenced by the commented-out check below
+                // Candidate test: same nbr, and d / dvec / rel_box agree. NOTE(review): check_zero appears to return true when its arguments differ -- confirm against its definition.
+                if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) &&
+                        !check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) {
+
+                    bond_order_data *s, *t;
+                    s = &(src->bo_data);
+                    t = &(tgt->bo_data);
+                    // Every bond-order field must pass as well; otherwise both sides are dumped and the process exits.
+                    if (  !check_zero (s->BO,t->BO) &&
+                            !check_zero (s->BO_s,t->BO_s) &&
+                            !check_zero(s->BO_pi,t->BO_pi)  &&
+                            !check_zero (s->BO_pi2,t->BO_pi2) &&
+                            !check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) &&
+                            !check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) &&
+                            !check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) &&
+                            !check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) &&
+                            !check_zero (s->dln_BOp_s, t->dln_BOp_s ) &&
+                            !check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) &&
+                            !check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) &&
+                            !check_zero (s->dBOp, t->dBOp )) {
+                        count ++;
+
+                        // Check the sym index and dbond index here for double checking.
+                        // bond_ij on both device and host are matched now.
+                        /*
+                           bond_order_data *ss, *ts;
+                           ss = & (src_sym->bo_data );
+                           ts = & (tgt_sym->bo_data );
+
+                           if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) ||
+                           check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box)
+                           || check_zero (ss->Cdbo, ts->Cdbo)){
+
+                           fprintf (stderr, " Sym Index information does not match for atom %d \n", i);
+                           fprintf (stderr, " atom --> %d \n", i);
+                           fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr );
+                           fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d );
+                           fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr );
+                           fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n",
+                           src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2],
+                           tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] );
+                           fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n",
+                           src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2],
+                           tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] );
+
+                           fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo );
+                           exit (-1);
+                           }
+                         */
+
+                        break; // matched: stop scanning host entries for this device entry
+                    }
+                    fprintf (stderr, " d --> %f %f \n", src->d, tgt->d );
+                    fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n",
+                            src->dvec[0], src->dvec[1], src->dvec[2],
+                            tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] );
+                    fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n",
+                            src->rel_box[0], src->rel_box[1], src->rel_box[2],
+                            tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] );
+
+                    fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i,
+                            src->nbr, tgt->nbr,
+                            s->BO, t->BO,
+                            s->BO_s, t->BO_s,
+                            s->BO_pi, t->BO_pi,
+                            s->BO_pi2, t->BO_pi2
+                        );
+                    fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2],
+                            t->dBOp[0], t->dBOp[1], t->dBOp[2] );
+
+                    fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo );
+                    fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi );
+                    fprintf (stderr, " Cdbopi2 (%e %e) \n", s->Cdbopi2,t->Cdbopi2 );
+                    fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo );
+                    fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi);
+                    fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2);
+                    fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n",
+                            s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2],
+                            t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] );
+                    fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n",
+                            s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2],
+                            t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] );
+                    fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n",
+                            s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2],
+                            t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] );
+
+                    //miscount ++;
+                    //break;
+                    exit (-1);
+                }
+            }
+            // k reached End_Index without hitting the break above: no host entry matched this device entry.
+            if (k >= End_Index (i, bonds)) {
+                miscount ++; // NOTE(review): the exit below means the final summary can only ever print miscount 0
+                fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j);
+                exit (-1);
+            }
+        }
+    }
+
+    fprintf (stderr, " BONDS matched count %d miscount %d (%d) \n", count, miscount, (count+miscount));
+    free (d_start);
+    free (d_end);
+    free (d_bond_data);
+    return SUCCESS;
 }
 
 int validate_workspace (reax_system *system, storage *workspace)
 {
-	int miscount;
-	int count, tcount;
-
-	///////////////////////
-	//INIT FORCES
-	///////////////////////
-
-	// bond_mark
-	int *bond_mark = (int *)malloc (sizeof (int) * system->N);
-	copy_host_device (bond_mark, dev_workspace->bond_mark, sizeof (int) * system->N, 
-			cudaMemcpyDeviceToHost, "bond_mark");
-	miscount = 0;
-	for (int i = 0; i < system->N; i++) {
-		if (workspace->bond_mark [i] != bond_mark [i])  {
-			fprintf (stderr, "Bond_mark atom:%d -- %d:%d \n", i, bond_mark [i], workspace->bond_mark [i]);
-			miscount ++;
-		}
-	}
-	free (bond_mark);
-	fprintf (stderr, " Bond Mark : %d \n", miscount );
-
-	//total_bond_order
-	real *total_bond_order = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "total_bond_order");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){
-			fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->total_bond_order[i], total_bond_order[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (total_bond_order);
-	fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count);
-
-	//////////////////////////////
-	//BOND ORDERS 
-	//////////////////////////////
-
-	//deltap
-	real *deltap= (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (deltap, dev_workspace->Deltap, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "deltap");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Deltap[i], deltap[i])){
-			fprintf (stderr, "deltap does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Deltap[i], deltap[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (deltap);
-	fprintf (stderr, "Deltap mismatch count %d\n", count);
-
-	//deltap_boc
-	real *deltap_boc = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (deltap_boc, dev_workspace->Deltap_boc, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "deltap_boc");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Deltap_boc[i], deltap_boc[i])){
-			fprintf (stderr, "deltap_boc does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Deltap_boc[i], deltap_boc[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (deltap_boc);
-	fprintf (stderr, "Deltap_boc mismatch count %d\n", count);
-
-
-	rvec *dDeltap_self;
-	dDeltap_self = (rvec *) calloc (system->N, sizeof (rvec) );
-	copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * sizeof (rvec), cudaMemcpyDeviceToHost, "ddeltap_self");
-
-	count = 0; 
-	for (int i = 0; i < system->N; i++ )
-	{
-		if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i]))
-		{    
-			fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, 
-					workspace->dDeltap_self[i][0],
-					workspace->dDeltap_self[i][1],
-					workspace->dDeltap_self[i][2],
-					dDeltap_self[3*i+0],
-					dDeltap_self[3*i+1],
-					dDeltap_self[3*i+2] );
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (dDeltap_self);
-	fprintf (stderr, "dDeltap_self mismatch count %d\n", count);
-
-	//Delta
-	real *delta = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (delta, dev_workspace->Delta, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "Delta");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Delta[i], delta[i])){
-			fprintf (stderr, "delta does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Delta[i], delta[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (delta);
-	fprintf (stderr, "Delta mismatch count %d\n", count);
-
-	//Delta_e
-	real *deltae = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (deltae, dev_workspace->Delta_e, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "Deltae");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Delta_e[i], deltae[i])){
-			fprintf (stderr, "deltae does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Delta_e[i], deltae[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (deltae);
-	fprintf (stderr, "Delta_e mismatch count %d\n", count);
-
-	//vlpex
-	real *vlpex= (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (vlpex, dev_workspace->vlpex, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "vlpex");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->vlpex[i], vlpex[i])){
-			fprintf (stderr, "vlpex does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->vlpex[i], vlpex[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (vlpex);
-	fprintf (stderr, "vlpex mismatch count %d\n", count);
-
-	//nlp
-	real *nlp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (nlp, dev_workspace->nlp, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->nlp[i], nlp[i])){
-			fprintf (stderr, "nlp does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->nlp[i], nlp[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (nlp);
-	fprintf (stderr, "nlp mismatch count %d\n", count);
-
-	//delta_lp
-	real *Delta_lp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (Delta_lp , dev_workspace->Delta_lp , system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "Delta_lp ");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Delta_lp [i], Delta_lp [i])){
-			fprintf (stderr, "Delta_lp  does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Delta_lp [i], Delta_lp [i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (Delta_lp );
-	fprintf (stderr, "Delta_lp  mismatch count %d\n", count);
-
-	//Clp
-	real *Clp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (Clp, dev_workspace->Clp, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "Clp");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Clp[i], Clp[i])){
-			fprintf (stderr, "Clp does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Clp[i], Clp[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (Clp);
-	fprintf (stderr, "Clp mismatch count %d\n", count);
-
-	//dDelta_lp
-	real *dDelta_lp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (dDelta_lp, dev_workspace->dDelta_lp, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "dDelta_lp");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->dDelta_lp[i], dDelta_lp[i])){
-			fprintf (stderr, "dDelta_lp does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->dDelta_lp[i], dDelta_lp[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (dDelta_lp);
-	fprintf (stderr, "dDelta_lp mismatch count %d\n", count);
-
-	//nlp_temp
-	real *nlp_temp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (nlp_temp, dev_workspace->nlp_temp, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "nlp_temp");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->nlp_temp[i], nlp_temp[i])){
-			fprintf (stderr, "nlp_temp does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->nlp_temp[i], nlp_temp[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (nlp_temp);
-	fprintf (stderr, "nlp_temp mismatch count %d\n", count);
-
-	//Delta_lp_temp
-	real *Delta_lp_temp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (Delta_lp_temp, dev_workspace->Delta_lp_temp, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "Delta_lp_temp");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->Delta_lp_temp[i], Delta_lp_temp[i])){
-			fprintf (stderr, "Delta_lp_temp does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->Delta_lp_temp[i], Delta_lp_temp[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (Delta_lp_temp);
-	fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count);
-
-
-	//dDelta_lp_temp
-	real *dDelta_lp_temp = (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (dDelta_lp_temp, dev_workspace->dDelta_lp_temp, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "dDelta_lp_temp");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->dDelta_lp_temp[i], dDelta_lp_temp[i])){
-			fprintf (stderr, "dDelta_lp_temp does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->dDelta_lp_temp[i], dDelta_lp_temp[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (dDelta_lp_temp);
-	fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count);
-
-	//////////////////////////////
-	//BONDS
-	//////////////////////////////
-
-	//CdDelta
-	real *CdDelta= (real *) malloc ( system->N * sizeof (real));
-	copy_host_device (CdDelta, dev_workspace->CdDelta, system->N * sizeof (real), 
-			cudaMemcpyDeviceToHost, "CdDelta");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->CdDelta[i], CdDelta[i])){
-			fprintf (stderr, "CdDelta does not match for atom %d (%4.15e %4.15e)\n",
-					i, workspace->CdDelta[i], CdDelta[i]);
-			exit (-1);
-			count ++;
-		}    
-	}
-	free (CdDelta);
-	fprintf (stderr, "CdDelta mismatch count %d\n", count);
-
-
-	//////////////////////////////////
-	//ATOM ENERGY
-	//////////////////////////////////
-
-	//////////////////////////////////
-	//VALENCE ANGLES
-	//////////////////////////////////
-	rvec *f= (rvec *) malloc ( system->N * sizeof (rvec));
-	copy_host_device (f, dev_workspace->f, system->N * sizeof (rvec), 
-			cudaMemcpyDeviceToHost, "f");
-	count = 0; 
-	for (int i = 0; i < system->N; i++) {
-
-		if ( check_zero (workspace->f[i], f[i])){
-			fprintf (stderr, "f does not match for atom %d (%4.15e %4.15e, %4.15e) (%4.15e %4.15e, %4.15e)\n",
-					i, 
-					workspace->f[i][0], workspace->f[i][1], workspace->f[i][2], 
-					f[i][0], f[i][1], f[i][2]);
-			//exit (-1);
-			count ++;
-		}    
-	}
-	free (f);
-	fprintf (stderr, "f mismatch count %d\n", count);
-
-	/////////////////////////////////////////////////////
-	//QEq part
-	/////////////////////////////////////////////////////
-	compare_rvec2 (workspace->d2, dev_workspace->d2, system->N, "d2");
-
-	compare_rvec2 (workspace->q2, dev_workspace->q2, system->N, "q2");
-
-	compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
-
-	compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
-
-	return SUCCESS;
+    int miscount;
+    int count, tcount;
+
+    ///////////////////////
+    //INIT FORCES
+    ///////////////////////
+
+    // bond_mark
+    int *bond_mark = (int *)malloc (sizeof (int) * system->N);
+    copy_host_device (bond_mark, dev_workspace->bond_mark, sizeof (int) * system->N, 
+            cudaMemcpyDeviceToHost, "bond_mark");
+    miscount = 0;
+    for (int i = 0; i < system->N; i++) {
+        if (workspace->bond_mark [i] != bond_mark [i])  {
+            fprintf (stderr, "Bond_mark atom:%d -- %d:%d \n", i, bond_mark [i], workspace->bond_mark [i]);
+            miscount ++;
+        }
+    }
+    free (bond_mark);
+    fprintf (stderr, " Bond Mark : %d \n", miscount );
+
+    //total_bond_order
+    real *total_bond_order = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "total_bond_order");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){
+            fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->total_bond_order[i], total_bond_order[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (total_bond_order);
+    fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count);
+
+    //////////////////////////////
+    //BOND ORDERS 
+    //////////////////////////////
+
+    //deltap
+    real *deltap= (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (deltap, dev_workspace->Deltap, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "deltap");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Deltap[i], deltap[i])){
+            fprintf (stderr, "deltap does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Deltap[i], deltap[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (deltap);
+    fprintf (stderr, "Deltap mismatch count %d\n", count);
+
+    //deltap_boc
+    real *deltap_boc = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (deltap_boc, dev_workspace->Deltap_boc, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "deltap_boc");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Deltap_boc[i], deltap_boc[i])){
+            fprintf (stderr, "deltap_boc does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Deltap_boc[i], deltap_boc[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (deltap_boc);
+    fprintf (stderr, "Deltap_boc mismatch count %d\n", count);
+
+
+    rvec *dDeltap_self;
+    dDeltap_self = (rvec *) calloc (system->N, sizeof (rvec) );
+    copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * sizeof (rvec), cudaMemcpyDeviceToHost, "ddeltap_self");
+
+    count = 0; 
+    for (int i = 0; i < system->N; i++ )
+    {
+        if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i]))
+        {    
+            fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, 
+                    workspace->dDeltap_self[i][0],
+                    workspace->dDeltap_self[i][1],
+                    workspace->dDeltap_self[i][2],
+                    dDeltap_self[3*i+0],
+                    dDeltap_self[3*i+1],
+                    dDeltap_self[3*i+2] );
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (dDeltap_self);
+    fprintf (stderr, "dDeltap_self mismatch count %d\n", count);
+
+    //Delta
+    real *delta = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (delta, dev_workspace->Delta, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "Delta");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Delta[i], delta[i])){
+            fprintf (stderr, "delta does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Delta[i], delta[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (delta);
+    fprintf (stderr, "Delta mismatch count %d\n", count);
+
+    //Delta_e
+    real *deltae = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (deltae, dev_workspace->Delta_e, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "Deltae");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Delta_e[i], deltae[i])){
+            fprintf (stderr, "deltae does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Delta_e[i], deltae[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (deltae);
+    fprintf (stderr, "Delta_e mismatch count %d\n", count);
+
+    //vlpex
+    real *vlpex= (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (vlpex, dev_workspace->vlpex, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "vlpex");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->vlpex[i], vlpex[i])){
+            fprintf (stderr, "vlpex does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->vlpex[i], vlpex[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (vlpex);
+    fprintf (stderr, "vlpex mismatch count %d\n", count);
+
+    //nlp
+    real *nlp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (nlp, dev_workspace->nlp, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->nlp[i], nlp[i])){
+            fprintf (stderr, "nlp does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->nlp[i], nlp[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (nlp);
+    fprintf (stderr, "nlp mismatch count %d\n", count);
+
+    //delta_lp
+    real *Delta_lp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (Delta_lp , dev_workspace->Delta_lp , system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "Delta_lp ");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Delta_lp [i], Delta_lp [i])){
+            fprintf (stderr, "Delta_lp  does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Delta_lp [i], Delta_lp [i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (Delta_lp );
+    fprintf (stderr, "Delta_lp  mismatch count %d\n", count);
+
+    //Clp
+    real *Clp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (Clp, dev_workspace->Clp, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "Clp");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Clp[i], Clp[i])){
+            fprintf (stderr, "Clp does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Clp[i], Clp[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (Clp);
+    fprintf (stderr, "Clp mismatch count %d\n", count);
+
+    //dDelta_lp
+    real *dDelta_lp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (dDelta_lp, dev_workspace->dDelta_lp, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "dDelta_lp");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->dDelta_lp[i], dDelta_lp[i])){
+            fprintf (stderr, "dDelta_lp does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->dDelta_lp[i], dDelta_lp[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (dDelta_lp);
+    fprintf (stderr, "dDelta_lp mismatch count %d\n", count);
+
+    //nlp_temp
+    real *nlp_temp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (nlp_temp, dev_workspace->nlp_temp, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "nlp_temp");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->nlp_temp[i], nlp_temp[i])){
+            fprintf (stderr, "nlp_temp does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->nlp_temp[i], nlp_temp[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (nlp_temp);
+    fprintf (stderr, "nlp_temp mismatch count %d\n", count);
+
+    //Delta_lp_temp
+    real *Delta_lp_temp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (Delta_lp_temp, dev_workspace->Delta_lp_temp, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "Delta_lp_temp");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->Delta_lp_temp[i], Delta_lp_temp[i])){
+            fprintf (stderr, "Delta_lp_temp does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->Delta_lp_temp[i], Delta_lp_temp[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (Delta_lp_temp);
+    fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count);
+
+
+    //dDelta_lp_temp
+    real *dDelta_lp_temp = (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (dDelta_lp_temp, dev_workspace->dDelta_lp_temp, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "dDelta_lp_temp");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->dDelta_lp_temp[i], dDelta_lp_temp[i])){
+            fprintf (stderr, "dDelta_lp_temp does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->dDelta_lp_temp[i], dDelta_lp_temp[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (dDelta_lp_temp);
+    fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count);
+
+    //////////////////////////////
+    //BONDS
+    //////////////////////////////
+
+    //CdDelta
+    real *CdDelta= (real *) malloc ( system->N * sizeof (real));
+    copy_host_device (CdDelta, dev_workspace->CdDelta, system->N * sizeof (real), 
+            cudaMemcpyDeviceToHost, "CdDelta");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->CdDelta[i], CdDelta[i])){
+            fprintf (stderr, "CdDelta does not match for atom %d (%4.15e %4.15e)\n",
+                    i, workspace->CdDelta[i], CdDelta[i]);
+            exit (-1);
+            count ++;
+        }    
+    }
+    free (CdDelta);
+    fprintf (stderr, "CdDelta mismatch count %d\n", count);
+
+
+    //////////////////////////////////
+    //ATOM ENERGY
+    //////////////////////////////////
+
+    //////////////////////////////////
+    //VALENCE ANGLES
+    //////////////////////////////////
+    rvec *f= (rvec *) malloc ( system->N * sizeof (rvec));
+    copy_host_device (f, dev_workspace->f, system->N * sizeof (rvec), 
+            cudaMemcpyDeviceToHost, "f");
+    count = 0; 
+    for (int i = 0; i < system->N; i++) {
+
+        if ( check_zero (workspace->f[i], f[i])){
+            fprintf (stderr, "f does not match for atom %d (%4.15e %4.15e, %4.15e) (%4.15e %4.15e, %4.15e)\n",
+                    i, 
+                    workspace->f[i][0], workspace->f[i][1], workspace->f[i][2], 
+                    f[i][0], f[i][1], f[i][2]);
+            //exit (-1);
+            count ++;
+        }    
+    }
+    free (f);
+    fprintf (stderr, "f mismatch count %d\n", count);
+
+    /////////////////////////////////////////////////////
+    //QEq part
+    /////////////////////////////////////////////////////
+    compare_rvec2 (workspace->d2, dev_workspace->d2, system->N, "d2");
+
+    compare_rvec2 (workspace->q2, dev_workspace->q2, system->N, "q2");
+
+    compare_rvec2 (workspace->x, dev_workspace->x, system->N, "x");
+
+    compare_rvec2 (workspace->b, dev_workspace->b, system->N, "b");
+
+    return SUCCESS;
 }
 
 void compare_rvec2( rvec2 *host, rvec2 *device, int N, char *msg)
 {
-	int count = 0;
-	int miscount = 0;
-	rvec2 *tmp = (rvec2 *) host_scratch;
-	copy_host_device (tmp, device, sizeof (rvec2) * N, cudaMemcpyDeviceToHost, msg);
-
-	for (int i = 0; i < N; i++)
-	{
-		if (check_zero_rvec2 (host [i], tmp [i])) {
-			fprintf (stderr, " %s does not match at index: %d (%f %f) - (%f %f) \n", 
-					msg, i, host[i][0], host[i][1], tmp[i][0], tmp[i][1] );
-			// exit (-1);
-			miscount ++;
-		}
-		count ++;
-	}
-	fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount);
+    int count = 0;      // entries examined; this routine compares N rvec2 entries of host against device and reports on stderr
+    int miscount = 0;   // entries for which check_zero_rvec2() returned non-zero
+    rvec2 *tmp = (rvec2 *) host_scratch;   // staging buffer; assumes host_scratch (defined elsewhere) holds >= N * sizeof(rvec2) bytes -- TODO confirm, not checked here
+    copy_host_device (tmp, device, sizeof (rvec2) * N, cudaMemcpyDeviceToHost, msg);   // fetch the device array into tmp; msg is also handed to the copy helper as its label
+    // A non-zero result from check_zero_rvec2 is treated as a mismatch between host[i] and tmp[i].
+    for (int i = 0; i < N; i++)
+    {
+        if (check_zero_rvec2 (host [i], tmp [i])) {
+            fprintf (stderr, " %s does not match at index: %d (%f %f) - (%f %f) \n", 
+                    msg, i, host[i][0], host[i][1], tmp[i][0], tmp[i][1] );
+            // exit (-1);   (disabled, so the scan continues and every mismatch is counted)
+            miscount ++;
+        }
+        count ++;
+    }
+    fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount);   // printed unconditionally, even when miscount > 0; values are (examined - mismatched)
 }
 
 void compare_array( real *host, real *device, int N, char *msg)
 {
-	int count = 0;
-	int miscount = 0;
-	real *tmp = (real *) host_scratch;
-	copy_host_device (tmp, device, sizeof (real) * N, cudaMemcpyDeviceToHost, msg);
-
-	for (int i = 0; i < N; i++)
-	{
-		if (check_zero (host [i], tmp [i])) {
-			fprintf (stderr, " %s does not match at index: %d (%f) - (%f) \n", 
-					msg, i, host[i], tmp[i] );
-			// exit (-1);
-			miscount ++;
-		}
-		count ++;
-	}
-	fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount);
+    int count = 0;      // entries examined; this routine compares N real values of host against device and reports on stderr
+    int miscount = 0;   // entries for which check_zero() returned non-zero
+    real *tmp = (real *) host_scratch;   // staging buffer; assumes host_scratch (defined elsewhere) holds >= N * sizeof(real) bytes -- TODO confirm, not checked here
+    copy_host_device (tmp, device, sizeof (real) * N, cudaMemcpyDeviceToHost, msg);   // fetch the device array into tmp; msg is also handed to the copy helper as its label
+    // A non-zero result from check_zero is treated as a mismatch between host[i] and tmp[i].
+    for (int i = 0; i < N; i++)
+    {
+        if (check_zero (host [i], tmp [i])) {
+            fprintf (stderr, " %s does not match at index: %d (%f) - (%f) \n", 
+                    msg, i, host[i], tmp[i] );
+            // exit (-1);   (disabled, so the scan continues and every mismatch is counted)
+            miscount ++;
+        }
+        count ++;
+    }
+    fprintf (stderr, "%s match between host and device (%d - %d) \n", msg, count, miscount);   // printed unconditionally, even when miscount > 0; values are (examined - mismatched)
 }
 
 
 int validate_data (reax_system *system, simulation_data *host)
 {
-	simulation_data device;
-
-	copy_host_device (&device, host->d_simulation_data, sizeof (simulation_data), 
-			cudaMemcpyDeviceToHost, "simulation_data");
-
-	if (check_zero (host->my_en.e_bond, device.my_en.e_bond)){
-		fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->my_en.e_bond, device.my_en.e_bond);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_lp, device.my_en.e_lp)){
-		fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->my_en.e_lp, device.my_en.e_lp);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_ov, device.my_en.e_ov)){
-		fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->my_en.e_ov, device.my_en.e_ov);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_un, device.my_en.e_un)){
-		fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->my_en.e_un, device.my_en.e_un);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_tor, device.my_en.e_tor)) {
-		fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->my_en.e_tor, device.my_en.e_tor);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_con, device.my_en.e_con)) {
-		fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->my_en.e_con, device.my_en.e_con);
-		exit (-1);
-	}
-
-	fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb);
-	if (check_zero (host->my_en.e_hb, device.my_en.e_hb)) {
-		fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_ang, device.my_en.e_ang)) {
-		fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->my_en.e_ang, device.my_en.e_ang);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_pen, device.my_en.e_pen)) {
-		fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->my_en.e_pen, device.my_en.e_pen);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_coa, device.my_en.e_coa)) {
-		fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->my_en.e_coa, device.my_en.e_coa);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_vdW, device.my_en.e_vdW)) {
-		fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->my_en.e_vdW, device.my_en.e_vdW);
-		exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_pol, device.my_en.e_pol)) {
-		fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->my_en.e_pol, device.my_en.e_pol);
-		//exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_kin, device.my_en.e_kin)) {
-		fprintf (stderr, "E_Kin does not match (%4.10e %4.10e) \n", host->my_en.e_kin, device.my_en.e_kin);
-		//exit (-1);
-	}
-
-	if (check_zero (host->my_en.e_ele, device.my_en.e_ele)) {
-		fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->my_en.e_ele, device.my_en.e_ele);
-		//exit (-1);
-	}
-
-	fprintf (stderr, "Simulation Data match between host and device \n");
-	return SUCCESS;
+    simulation_data device;   // host-side copy of the struct behind host->d_simulation_data; the system parameter is not used in this function
+    // Purpose: compare the energy terms in host->my_en against the copy fetched from the device; returns SUCCESS if no fatal mismatch is found.
+    copy_host_device (&device, host->d_simulation_data, sizeof (simulation_data), 
+            cudaMemcpyDeviceToHost, "simulation_data");
+    // A non-zero result from check_zero() is treated as a mismatch. The terms from e_bond through e_vdW abort the process with exit (-1).
+    if (check_zero (host->my_en.e_bond, device.my_en.e_bond)){
+        fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->my_en.e_bond, device.my_en.e_bond);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_lp, device.my_en.e_lp)){
+        fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->my_en.e_lp, device.my_en.e_lp);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_ov, device.my_en.e_ov)){
+        fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->my_en.e_ov, device.my_en.e_ov);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_un, device.my_en.e_un)){
+        fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->my_en.e_un, device.my_en.e_un);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_tor, device.my_en.e_tor)) {
+        fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->my_en.e_tor, device.my_en.e_tor);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_con, device.my_en.e_con)) {
+        fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->my_en.e_con, device.my_en.e_con);
+        exit (-1);
+    }
+    // NOTE(review): the next fprintf runs unconditionally, so "E_Hb does not match" is printed even when the values agree -- looks like leftover debug output.
+    fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb);
+    if (check_zero (host->my_en.e_hb, device.my_en.e_hb)) {
+        fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->my_en.e_hb, device.my_en.e_hb);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_ang, device.my_en.e_ang)) {
+        fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->my_en.e_ang, device.my_en.e_ang);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_pen, device.my_en.e_pen)) {
+        fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->my_en.e_pen, device.my_en.e_pen);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_coa, device.my_en.e_coa)) {
+        fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->my_en.e_coa, device.my_en.e_coa);
+        exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_vdW, device.my_en.e_vdW)) {
+        fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->my_en.e_vdW, device.my_en.e_vdW);
+        exit (-1);
+    }
+    // e_pol, e_kin and e_ele are report-only: their exit (-1) calls are commented out, so execution continues after a mismatch.
+    if (check_zero (host->my_en.e_pol, device.my_en.e_pol)) {
+        fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->my_en.e_pol, device.my_en.e_pol);
+        //exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_kin, device.my_en.e_kin)) {
+        fprintf (stderr, "E_Kin does not match (%4.10e %4.10e) \n", host->my_en.e_kin, device.my_en.e_kin);
+        //exit (-1);
+    }
+
+    if (check_zero (host->my_en.e_ele, device.my_en.e_ele)) {
+        fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->my_en.e_ele, device.my_en.e_ele);
+        //exit (-1);
+    }
+    // Reached only when no fatal term differed; note the message is printed even if a report-only term above mismatched.
+    fprintf (stderr, "Simulation Data match between host and device \n");
+    return SUCCESS;
 }
 
 int validate_grid (reax_system *system)
 {
-	int  x,i, j, k,l, itr; //, tmp, tested;
-	int itr_nbr,itr_11, miscount;
-	ivec src, dest;
-	grid *g;
-	grid_cell *gci, *gcj, *gcj_nbr;
-	int found = 0;
-
-	int *tmp = (int *) host_scratch;
-	int total;
-
-	g = &( system->my_grid );
-	miscount = 0;
-
-	total = g->ncells[0] * g->ncells[1] * g->ncells[2];
-
-	copy_host_device (tmp, system->d_my_grid.str, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:str");
-	copy_host_device (tmp + total, system->d_my_grid.end, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:end");
-
-	real *cutoff = (real *) (tmp + 2 * total);
-	copy_host_device (cutoff, system->d_my_grid.cutoff, sizeof (real) * total, cudaMemcpyDeviceToHost, "grid:cutoff");
-
-	for( i = 0; i < g->ncells[0]; i++ )
-		for( j = 0; j < g->ncells[1]; j++ )
-			for( k = 0; k < g->ncells[2]; k++ ) 
-			{
-				if ((g->str [index_grid_3d (i, j, k, g)] != tmp [index_grid_3d (i, j, k, g)]) || 
-						(g->end [index_grid_3d (i, j, k, g)] != tmp[total + index_grid_3d (i, j, k, g)]) ||
-						(cutoff [index_grid_3d (i, j, k, g)] != g->cutoff [index_grid_3d (i, j, k, g)]))
-				{
-					fprintf (stderr, "we have a problem here \n");
-					exit (0);
-				}
-				/*
-				   fprintf (stderr, " %d %d %d - str: %d end: %d  (%d %d) ( %f %f)\n", 
-				   i, j, k, g->str [index_grid_3d (i, j, k, g)], g->end [index_grid_3d (i, j, k, g)], 
-				   tmp [index_grid_3d (i, j, k, g)], tmp[total + index_grid_3d (i, j, k, g)], 
-				   cutoff [index_grid_3d (i, j, k, g)], g->cutoff [index_grid_3d (i, j, k, g)]);
-				 */
-			}
-
-	rvec *tmpvec = (rvec *) host_scratch;
-	copy_host_device (tmpvec, system->d_my_grid.nbrs_cp, sizeof (rvec) * total * g->max_nbrs, 
-			cudaMemcpyDeviceToHost, "grid:nbrs_cp");
-
-	ivec *tivec = (ivec *) (((rvec *)host_scratch) + total * g->max_nbrs);
-	copy_host_device (tivec, system->d_my_grid.nbrs_x, sizeof (ivec) * total * g->max_nbrs, 
-			cudaMemcpyDeviceToHost, "grid:nbrs_x");
-
-
-	for( i = 0; i < g->ncells[0]; i++ )
-		for( j = 0; j < g->ncells[1]; j++ )
-			for( k = 0; k < g->ncells[2]; k++ ) 
-				for (l = 0; l < g->max_nbrs; l++) {
-
-					if (( g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] != tmpvec[index_grid_nbrs(i, j, k, l, g)][0]) ||
-							(g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] != tmpvec[index_grid_nbrs(i, j, k, l, g)][1]) || 
-							(g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] != tmpvec[index_grid_nbrs(i, j, k, l, g)][2]) || 
-							(g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0] != tivec[index_grid_nbrs(i, j, k, l, g)][0]) ||
-							(g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1] != tivec[index_grid_nbrs(i, j, k, l, g)][1]) || 
-							(g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2] != tivec[index_grid_nbrs(i, j, k, l, g)][2] )) 
-					{
-						fprintf (stderr, "we have a big problem here \n");
-						exit (0);
-					}
-
-					if ((g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] > NEG_INF) &&
-							(g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] > NEG_INF) &&
-							(g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] > NEG_INF) )
-						;/*
-						    fprintf (stderr, "%d %d %d %d ---- %d %d %d - %d %d %d \n", 
-					//fprintf (stderr, "%d %d %d %d ---- (%3.2f %3.2f %3.2f) - (%3.2f %3.2f %3.2f) \n", 
-					i, j, k, l, 
-					g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0], 
-					g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1], 
-					g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2], 
-					tivec[index_grid_nbrs(i, j, k, l, g)][0], 
-					tivec[index_grid_nbrs(i, j, k, l, g)][1], 
-					tivec[index_grid_nbrs(i, j, k, l, g)][2]
-					);
-						  */
-				}
-
-	return 0;
-
-	//  for( i = 0; i < g->ncells[0]; i++ )
-	//    for( j = 0; j < g->ncells[1]; j++ )
-	//      for( k = 0; k < g->ncells[2]; k++ ) 
-	//		{ 
-	//         gci = &(g->cells[ index_grid_3d (i, j, k, g) ]); 
-	//			//for (x = 0; x < g->max_nbrs; x++)
-	//			//	fprintf (stderr, "(%d, %d, %d) - (%d, %d, %d) \n", 
-	//			//							i, j, k, 
-	//			//							gci->nbrs_x[x][0],
-	//			//							gci->nbrs_x[x][1],
-	//			//							gci->nbrs_x[x][2] );
-	//			//exit (0);
-	//
-	//           itr = 0;
-	//           while( (gcj=gci->nbrs[itr]) != NULL ) 
-	//           {
-	//			  		//iterate through the neighbors of gcj and find (i, j, k)
-	//					itr_nbr = 0;
-	//					found = 0;
-	//					while ( (gcj_nbr=gcj->nbrs [itr_nbr]) != NULL )
-	//					{
-	//						ivec_Copy (dest, gcj_nbr->nbrs_x[itr_nbr] );
-	//
-	//						if ( (i == dest[0]) && (j == dest[1]) && (k == dest[2]))
-	//						{
-	//							found = 1;
-	//							break;
-	//						}
-	//						itr_nbr ++;
-	//					}
-	//
-	//					if (found == 0) {
-	//						fprintf (stderr, "we have a problem here: (%d, %d, %d): (%d, %d, %d) type: (%d, %d) \n",
-	//											i, j, k, 
-	//											gci->nbrs_x[itr][0],
-	//											gci->nbrs_x[itr][1],
-	//											gci->nbrs_x[itr][2],
-	//											gci->type, 
-	//											gcj->type);
-	//						itr_11 = 0;
-	//						while ( (gcj_nbr=gcj->nbrs [itr_11]) != NULL )
-	//						{
-	//							ivec_Copy (dest, gcj_nbr->nbrs_x[itr_11] );
-	//							fprintf (stderr, "%d, %d, %d \n", dest[0], dest[1], dest[2]);
-	//							itr_11 ++;
-	//						}
-	//						exit (0);
-	//						miscount ++;
-	//					}
-	//
-	//					itr ++;
-	//			  }
-	//		}
-	//
-	//		fprintf (stderr, " cell miscount: %d \n", miscount);
+    int  x,i, j, k,l, itr; //, tmp, tested;
+    int itr_nbr,itr_11, miscount;
+    ivec src, dest;
+    grid *g;
+    grid_cell *gci, *gcj, *gcj_nbr;
+    int found = 0;
+    // Purpose: compare the host grid (system->my_grid) with the device grid (system->d_my_grid). Several locals above are referenced only by the commented-out code after the return.
+    int *tmp = (int *) host_scratch;   // staging buffer for all device-to-host copies below; assumes host_scratch (defined elsewhere) is large enough -- TODO confirm, not checked here
+    int total;
+
+    g = &( system->my_grid );
+    miscount = 0;
+    // total is the product of the three per-dimension cell counts.
+    total = g->ncells[0] * g->ncells[1] * g->ncells[2];
+    // Scratch layout for pass 1: total ints of str, then total ints of end, then total reals of cutoff.
+    copy_host_device (tmp, system->d_my_grid.str, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:str");
+    copy_host_device (tmp + total, system->d_my_grid.end, sizeof(int) * total, cudaMemcpyDeviceToHost, "grid:end");
+
+    real *cutoff = (real *) (tmp + 2 * total);   // cutoff values are staged right after the two int arrays
+    copy_host_device (cutoff, system->d_my_grid.cutoff, sizeof (real) * total, cudaMemcpyDeviceToHost, "grid:cutoff");
+    // Pass 1: per-cell str, end and cutoff are compared with != (exact equality, no tolerance). NOTE(review): a mismatch terminates via exit (0), i.e. with status 0.
+    for( i = 0; i < g->ncells[0]; i++ )
+        for( j = 0; j < g->ncells[1]; j++ )
+            for( k = 0; k < g->ncells[2]; k++ ) 
+            {
+                if ((g->str [index_grid_3d (i, j, k, g)] != tmp [index_grid_3d (i, j, k, g)]) || 
+                        (g->end [index_grid_3d (i, j, k, g)] != tmp[total + index_grid_3d (i, j, k, g)]) ||
+                        (cutoff [index_grid_3d (i, j, k, g)] != g->cutoff [index_grid_3d (i, j, k, g)]))
+                {
+                    fprintf (stderr, "we have a problem here \n");
+                    exit (0);
+                }
+                /*
+                   fprintf (stderr, " %d %d %d - str: %d end: %d  (%d %d) ( %f %f)\n", 
+                   i, j, k, g->str [index_grid_3d (i, j, k, g)], g->end [index_grid_3d (i, j, k, g)], 
+                   tmp [index_grid_3d (i, j, k, g)], tmp[total + index_grid_3d (i, j, k, g)], 
+                   cutoff [index_grid_3d (i, j, k, g)], g->cutoff [index_grid_3d (i, j, k, g)]);
+                 */
+            }
+    // Pass 2 reuses the scratch area from its start, overwriting the pass-1 data: nbrs_cp entries first, nbrs_x entries directly after them.
+    rvec *tmpvec = (rvec *) host_scratch;
+    copy_host_device (tmpvec, system->d_my_grid.nbrs_cp, sizeof (rvec) * total * g->max_nbrs, 
+            cudaMemcpyDeviceToHost, "grid:nbrs_cp");
+
+    ivec *tivec = (ivec *) (((rvec *)host_scratch) + total * g->max_nbrs);   // begins right after the total * max_nbrs rvec entries
+    copy_host_device (tivec, system->d_my_grid.nbrs_x, sizeof (ivec) * total * g->max_nbrs, 
+            cudaMemcpyDeviceToHost, "grid:nbrs_x");
+
+    // For every cell and every neighbor slot l, all three components of nbrs_cp and nbrs_x must be exactly equal on host and device.
+    for( i = 0; i < g->ncells[0]; i++ )
+        for( j = 0; j < g->ncells[1]; j++ )
+            for( k = 0; k < g->ncells[2]; k++ ) 
+                for (l = 0; l < g->max_nbrs; l++) {
+
+                    if (( g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] != tmpvec[index_grid_nbrs(i, j, k, l, g)][0]) ||
+                            (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] != tmpvec[index_grid_nbrs(i, j, k, l, g)][1]) || 
+                            (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] != tmpvec[index_grid_nbrs(i, j, k, l, g)][2]) || 
+                            (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0] != tivec[index_grid_nbrs(i, j, k, l, g)][0]) ||
+                            (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1] != tivec[index_grid_nbrs(i, j, k, l, g)][1]) || 
+                            (g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2] != tivec[index_grid_nbrs(i, j, k, l, g)][2] )) 
+                    {
+                        fprintf (stderr, "we have a big problem here \n");
+                        exit (0);
+                    }
+                    // The following if has an empty body (a lone ';') because its diagnostic fprintf is commented out, so it has no effect.
+                    if ((g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][0] > NEG_INF) &&
+                            (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][1] > NEG_INF) &&
+                            (g->nbrs_cp[index_grid_nbrs(i, j, k, l, g)][2] > NEG_INF) )
+                        ;/*
+                            fprintf (stderr, "%d %d %d %d ---- %d %d %d - %d %d %d \n", 
+                    //fprintf (stderr, "%d %d %d %d ---- (%3.2f %3.2f %3.2f) - (%3.2f %3.2f %3.2f) \n", 
+                    i, j, k, l, 
+                    g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][0], 
+                    g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][1], 
+                    g->nbrs_x[index_grid_nbrs(i, j, k, l, g)][2], 
+                    tivec[index_grid_nbrs(i, j, k, l, g)][0], 
+                    tivec[index_grid_nbrs(i, j, k, l, g)][1], 
+                    tivec[index_grid_nbrs(i, j, k, l, g)][2]
+                    );
+                          */
+                }
+    // Reaching this point means both passes found no difference; the function returns the literal 0.
+    return 0;
+    // Everything below is commented-out code: a disabled check that walked each cell's neighbors and searched their neighbor lists for (i, j, k).
+    //  for( i = 0; i < g->ncells[0]; i++ )
+    //    for( j = 0; j < g->ncells[1]; j++ )
+    //      for( k = 0; k < g->ncells[2]; k++ ) 
+    //        { 
+    //         gci = &(g->cells[ index_grid_3d (i, j, k, g) ]); 
+    //            //for (x = 0; x < g->max_nbrs; x++)
+    //            //    fprintf (stderr, "(%d, %d, %d) - (%d, %d, %d) \n", 
+    //            //                            i, j, k, 
+    //            //                            gci->nbrs_x[x][0],
+    //            //                            gci->nbrs_x[x][1],
+    //            //                            gci->nbrs_x[x][2] );
+    //            //exit (0);
+    //
+    //           itr = 0;
+    //           while( (gcj=gci->nbrs[itr]) != NULL ) 
+    //           {
+    //                    //iterate through the neighbors of gcj and find (i, j, k)
+    //                    itr_nbr = 0;
+    //                    found = 0;
+    //                    while ( (gcj_nbr=gcj->nbrs [itr_nbr]) != NULL )
+    //                    {
+    //                        ivec_Copy (dest, gcj_nbr->nbrs_x[itr_nbr] );
+    //
+    //                        if ( (i == dest[0]) && (j == dest[1]) && (k == dest[2]))
+    //                        {
+    //                            found = 1;
+    //                            break;
+    //                        }
+    //                        itr_nbr ++;
+    //                    }
+    //
+    //                    if (found == 0) {
+    //                        fprintf (stderr, "we have a problem here: (%d, %d, %d): (%d, %d, %d) type: (%d, %d) \n",
+    //                                            i, j, k, 
+    //                                            gci->nbrs_x[itr][0],
+    //                                            gci->nbrs_x[itr][1],
+    //                                            gci->nbrs_x[itr][2],
+    //                                            gci->type, 
+    //                                            gcj->type);
+    //                        itr_11 = 0;
+    //                        while ( (gcj_nbr=gcj->nbrs [itr_11]) != NULL )
+    //                        {
+    //                            ivec_Copy (dest, gcj_nbr->nbrs_x[itr_11] );
+    //                            fprintf (stderr, "%d, %d, %d \n", dest[0], dest[1], dest[2]);
+    //                            itr_11 ++;
+    //                        }
+    //                        exit (0);
+    //                        miscount ++;
+    //                    }
+    //
+    //                    itr ++;
+    //              }
+    //        }
+    //
+    //        fprintf (stderr, " cell miscount: %d \n", miscount);
 }
 
 int validate_three_bodies (reax_system *system, storage *workspace, reax_list **lists)
 {
-	reax_list *three = *lists + THREE_BODIES;
-	reax_list *bonds = *lists + BONDS;
-
-	reax_list *d_three = *dev_lists + THREE_BODIES;
-	reax_list *d_bonds = *dev_lists + BONDS;
-	bond_data *d_bond_data;
-	real *test;
-
-	three_body_interaction_data *data = (three_body_interaction_data *)
-		malloc ( sizeof (three_body_interaction_data) * d_three->num_intrs);
-	int *start = (int *) malloc (sizeof (int) * d_three->n);
-	int *end = (int *) malloc (sizeof (int) * d_three->n);
-
-	int *b_start = (int *) malloc (sizeof (int) * d_bonds->n);
-	int *b_end = (int *) malloc (sizeof (int) * d_bonds->n);
-	int count;
-	int hcount, dcount;
-
-
-	copy_host_device ( start, d_three->index,
-			sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:start");
-	copy_host_device ( end, d_three->end_index,
-			sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:end");
-	copy_host_device ( data, d_three->select.three_body_list,
-			sizeof (three_body_interaction_data) * d_three->num_intrs,
-			cudaMemcpyDeviceToHost, "three:data");
-
-	d_bond_data = (bond_data *) malloc (sizeof (bond_data)* d_bonds->num_intrs);
-
-	copy_host_device ( b_start, d_bonds->index,
-			sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:start");
-	copy_host_device ( b_end, d_bonds->end_index,
-			sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:end");
-	copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) *  d_bonds->num_intrs, 
-			cudaMemcpyDeviceToHost, "bonds:data");
-
-	count = 0;
-	hcount = dcount = 0;
-	for (int i = 0; i < system->N; i++)
-	{
-
-		int x, y, z;
-		for (x = b_start[i]; x < b_end[i]; x++)
-		{
-			int t_start = start[x];
-			int t_end = end[x];
-
-			bond_data *dev_bond = &d_bond_data [x];
-			bond_data *host_bond;
-			for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
-			{
-				host_bond = &bonds->select.bond_list [z];
-				if ((dev_bond->nbr == host_bond->nbr) &&
-						check_same (dev_bond->rel_box, host_bond->rel_box) &&
-						!check_zero (dev_bond->dvec, host_bond->dvec) &&
-						!check_zero (dev_bond->d, host_bond->d) )
-				{
-					break;
-				}
-			}
-			if (z >= End_Index (i, bonds)){
-				fprintf (stderr, "Could not find the matching bond on host and device \n");
-				exit (-1);
-			}
-
-			dcount += end[x] - start[x];
-			hcount += Num_Entries (z, three);
-
-			if ((end[x] - start[x]) != (End_Index (z, three) - Start_Index (z, three)))
-			{
-				count ++;
-				/*
-				   fprintf (stderr, " Three body count does not match between host and device\n");
-				   fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (z, three), End_Index (z, three));
-				   fprintf (stderr, " atom: %d - bond: %d Device count: (%d, %d)\n", i, x, start[x], end[x]);
-				 */
-			}
-		}
-
-		/*
-		   if ((dcount != hcount)) {
-
-		   fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount);
-
-		   for (int j = b_start[i]; j < b_end[i]; j ++) {
-		   bond_order_data *src = &d_bond_data[j].bo_data;
-		   dcount = end[j] - start[j];
-		   hcount = Num_Entries (j, three);
-		   fprintf (stderr, "device \n");
-		   print_bond_data (src);
-
-		   fprintf (stderr, "\n");
-		   src = &bonds->select.bond_list[j].bo_data;
-		   fprintf (stderr, "host \n");
-		   print_bond_data (src);
-		   fprintf (stderr, "\n");
+    reax_list *three = *lists + THREE_BODIES;
+    reax_list *bonds = *lists + BONDS;
+
+    reax_list *d_three = *dev_lists + THREE_BODIES;
+    reax_list *d_bonds = *dev_lists + BONDS;
+    bond_data *d_bond_data;
+    real *test;
+
+    three_body_interaction_data *data = (three_body_interaction_data *)
+        malloc ( sizeof (three_body_interaction_data) * d_three->num_intrs);
+    int *start = (int *) malloc (sizeof (int) * d_three->n);
+    int *end = (int *) malloc (sizeof (int) * d_three->n);
+
+    int *b_start = (int *) malloc (sizeof (int) * d_bonds->n);
+    int *b_end = (int *) malloc (sizeof (int) * d_bonds->n);
+    int count;
+    int hcount, dcount;
+
+
+    copy_host_device ( start, d_three->index,
+            sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:start");
+    copy_host_device ( end, d_three->end_index,
+            sizeof (int) * d_three->n, cudaMemcpyDeviceToHost, "three:end");
+    copy_host_device ( data, d_three->select.three_body_list,
+            sizeof (three_body_interaction_data) * d_three->num_intrs,
+            cudaMemcpyDeviceToHost, "three:data");
+
+    d_bond_data = (bond_data *) malloc (sizeof (bond_data)* d_bonds->num_intrs);
+
+    copy_host_device ( b_start, d_bonds->index,
+            sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:start");
+    copy_host_device ( b_end, d_bonds->end_index,
+            sizeof (int) * d_bonds->n, cudaMemcpyDeviceToHost, "bonds:end");
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, sizeof (bond_data) *  d_bonds->num_intrs, 
+            cudaMemcpyDeviceToHost, "bonds:data");
+
+    count = 0;
+    hcount = dcount = 0;
+    for (int i = 0; i < system->N; i++)
+    {
+
+        int x, y, z;
+        for (x = b_start[i]; x < b_end[i]; x++)
+        {
+            int t_start = start[x];
+            int t_end = end[x];
+
+            bond_data *dev_bond = &d_bond_data [x];
+            bond_data *host_bond;
+            for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
+            {
+                host_bond = &bonds->select.bond_list [z];
+                if ((dev_bond->nbr == host_bond->nbr) &&
+                        check_same (dev_bond->rel_box, host_bond->rel_box) &&
+                        !check_zero (dev_bond->dvec, host_bond->dvec) &&
+                        !check_zero (dev_bond->d, host_bond->d) )
+                {
+                    break;
+                }
+            }
+            if (z >= End_Index (i, bonds)){
+                fprintf (stderr, "Could not find the matching bond on host and device \n");
+                exit (-1);
+            }
+
+            dcount += end[x] - start[x];
+            hcount += Num_Entries (z, three);
+
+            if ((end[x] - start[x]) != (End_Index (z, three) - Start_Index (z, three)))
+            {
+                count ++;
+                /*
+                   fprintf (stderr, " Three body count does not match between host and device\n");
+                   fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (z, three), End_Index (z, three));
+                   fprintf (stderr, " atom: %d - bond: %d Device count: (%d, %d)\n", i, x, start[x], end[x]);
+                 */
+            }
+        }
+
+        /*
+           if ((dcount != hcount)) {
+
+           fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount);
+
+           for (int j = b_start[i]; j < b_end[i]; j ++) {
+           bond_order_data *src = &d_bond_data[j].bo_data;
+           dcount = end[j] - start[j];
+           hcount = Num_Entries (j, three);
+           fprintf (stderr, "device \n");
+           print_bond_data (src);
+
+           fprintf (stderr, "\n");
+           src = &bonds->select.bond_list[j].bo_data;
+           fprintf (stderr, "host \n");
+           print_bond_data (src);
+           fprintf (stderr, "\n");
 
 //fprintf (stderr, "--- Device bo is %f \n", test[j]);
 fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],
@@ -1467,65 +1467,65 @@ fprintf (stderr, "------\n");
 fprintf (stderr, " Three Bodies count does not match between host and device \n");
 exit (-1);
 }
-		 */
+         */
 }
 fprintf (stderr, "Three body count on DEVICE %d  HOST %d -- miscount: %d\n", dcount, hcount, count);
 
 count = 0;
 for (int i = 0; i < system->N; i++)
 {
-	int x, y, z;
-	for (x = b_start[i]; x < b_end[i]; x++)
-	{
-		int t_start = start[x];
-		int t_end = end[x];
-
-		bond_data *dev_bond = &d_bond_data [x];
-		bond_data *host_bond;
-		for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
-		{
-			host_bond = &bonds->select.bond_list [z];
-			if ((dev_bond->nbr == host_bond->nbr) &&
-					check_same (dev_bond->rel_box, host_bond->rel_box) &&
-					!check_zero (dev_bond->dvec, host_bond->dvec) &&
-					!check_zero (dev_bond->d, host_bond->d) )
-			{
-				break;
-			}
-		}
-		if (z >= End_Index (i, bonds)){
-			fprintf (stderr, "Could not find the matching bond on host and device \n");
-			exit (-1);
-		}
-
-		//find this three-body in the bonds on the host side.
-		for (y = t_start; y < t_end; y++)
-		{
-			three_body_interaction_data *device = data + y;
-			three_body_interaction_data *host;
-
-			//fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
-
-			int xx;
-			for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
-			{
-				host = &three->select.three_body_list [xx];
-				//fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
-				//if ((host->thb == device->thb) && (host->pthb == device->pthb))
-				if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
-				{
-					count ++;
-					break;
-				}
-			}
-
-			if ( xx >= End_Index (z, three) ) {
-				fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z,
-						Start_Index (z, three), End_Index (z, three), start[x], end[x] );
-				exit (-1);
-			}// else fprintf (stderr, "----------------- \n");
-		}
-	}
+    int x, y, z;
+    for (x = b_start[i]; x < b_end[i]; x++)
+    {
+        int t_start = start[x];
+        int t_end = end[x];
+
+        bond_data *dev_bond = &d_bond_data [x];
+        bond_data *host_bond;
+        for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
+        {
+            host_bond = &bonds->select.bond_list [z];
+            if ((dev_bond->nbr == host_bond->nbr) &&
+                    check_same (dev_bond->rel_box, host_bond->rel_box) &&
+                    !check_zero (dev_bond->dvec, host_bond->dvec) &&
+                    !check_zero (dev_bond->d, host_bond->d) )
+            {
+                break;
+            }
+        }
+        if (z >= End_Index (i, bonds)){
+            fprintf (stderr, "Could not find the matching bond on host and device \n");
+            exit (-1);
+        }
+
+        //find this three-body in the bonds on the host side.
+        for (y = t_start; y < t_end; y++)
+        {
+            three_body_interaction_data *device = data + y;
+            three_body_interaction_data *host;
+
+            //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
+
+            int xx;
+            for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
+            {
+                host = &three->select.three_body_list [xx];
+                //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
+                //if ((host->thb == device->thb) && (host->pthb == device->pthb))
+                if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
+                {
+                    count ++;
+                    break;
+                }
+            }
+
+            if ( xx >= End_Index (z, three) ) {
+                fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z,
+                        Start_Index (z, three), End_Index (z, three), start[x], end[x] );
+                exit (-1);
+            }// else fprintf (stderr, "----------------- \n");
+        }
+    }
 }
 free (data);
 free (start);
@@ -1542,170 +1542,170 @@ return SUCCESS;
 int validate_atoms (reax_system *system, reax_list **lists)
 {
 
-	int start, end, index, count, miscount;
-	reax_atom *test = (reax_atom *) malloc (sizeof (reax_atom)* system->N);
-	copy_host_device (test, system->d_my_atoms, sizeof (reax_atom) * system->N, cudaMemcpyDeviceToHost, "atoms");
-
-	/*
-	   for (int i = system->n; i < system->n + 10; i++)
-	   {
-	   fprintf (stderr, " Atom: %d HIndex: %d \n", i, test[i].Hindex);
-	   }
-	 */
-
-	count = miscount = 0; 
-	for (int i = 0; i < system->N; i++) 
-	{
-		if (test[i].type != system->my_atoms[i].type) {
-			fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->my_atoms[i].type, test[i].type, i);
-			exit (-1);
-		}    
-
-		if (  check_zero (test[i].x, system->my_atoms[i].x) )
-		{    
-			fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i,
-					system->my_atoms[i].x[0], system->my_atoms[i].x[1], system->my_atoms[i].x[2], 
-					test[i].x[0], test[i].x[1], test[i].x[2] );
-			miscount ++;
-			exit (-1);
-		}
-		if (     check_zero (test[i].v, system->my_atoms[i].v) )
-		{
-			fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
-					system->my_atoms[i].v[0], system->my_atoms[i].v[1], system->my_atoms[i].v[2],
-					test[i].v[0], test[i].v[1], test[i].v[2] );
-			miscount ++;
-			exit (-1);
-		}
-		if (     check_zero (test[i].f, system->my_atoms[i].f) )
-		{
-			fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
-					system->my_atoms[i].f[0], system->my_atoms[i].f[1], system->my_atoms[i].f[2],
-					test[i].f[0], test[i].f[1], test[i].f[2] );
-			miscount ++;
-			exit (-1);
-		}
-
-		if (     check_zero (test[i].q, system->my_atoms[i].q) )
-		{
-			fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i,
-					system->my_atoms[i].q, test[i].q );
-			miscount ++;
-			exit (-1);
-		}
-
-		count ++;
-	}
-
-	fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount);
-
-	free (test);
-	return true;
+    int start, end, index, count, miscount;
+    reax_atom *test = (reax_atom *) malloc (sizeof (reax_atom)* system->N);
+    copy_host_device (test, system->d_my_atoms, sizeof (reax_atom) * system->N, cudaMemcpyDeviceToHost, "atoms");
+
+    /*
+       for (int i = system->n; i < system->n + 10; i++)
+       {
+       fprintf (stderr, " Atom: %d HIndex: %d \n", i, test[i].Hindex);
+       }
+     */
+
+    count = miscount = 0; 
+    for (int i = 0; i < system->N; i++) 
+    {
+        if (test[i].type != system->my_atoms[i].type) {
+            fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->my_atoms[i].type, test[i].type, i);
+            exit (-1);
+        }    
+
+        if (  check_zero (test[i].x, system->my_atoms[i].x) )
+        {    
+            fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i,
+                    system->my_atoms[i].x[0], system->my_atoms[i].x[1], system->my_atoms[i].x[2], 
+                    test[i].x[0], test[i].x[1], test[i].x[2] );
+            miscount ++;
+            exit (-1);
+        }
+        if (     check_zero (test[i].v, system->my_atoms[i].v) )
+        {
+            fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
+                    system->my_atoms[i].v[0], system->my_atoms[i].v[1], system->my_atoms[i].v[2],
+                    test[i].v[0], test[i].v[1], test[i].v[2] );
+            miscount ++;
+            exit (-1);
+        }
+        if (     check_zero (test[i].f, system->my_atoms[i].f) )
+        {
+            fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
+                    system->my_atoms[i].f[0], system->my_atoms[i].f[1], system->my_atoms[i].f[2],
+                    test[i].f[0], test[i].f[1], test[i].f[2] );
+            miscount ++;
+            exit (-1);
+        }
+
+        if (     check_zero (test[i].q, system->my_atoms[i].q) )
+        {
+            fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i,
+                    system->my_atoms[i].q, test[i].q );
+            miscount ++;
+            exit (-1);
+        }
+
+        count ++;
+    }
+
+    fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount);
+
+    free (test);
+    return true;
 }
 
 
 int print_sparse_matrix (sparse_matrix *H)
 {
-	sparse_matrix test;
-	int index, count;
-
-	test.start = (int *) malloc (sizeof (int) * (H->cap)); 
-	test.end = (int *) malloc (sizeof (int) * (H->cap)); 
-
-	test.entries = (sparse_matrix_entry *) malloc (sizeof (sparse_matrix_entry) * (H->m));
-	memset (test.entries, 0xFF, sizeof (sparse_matrix_entry) * H->m);
-
-	copy_host_device ( test.entries, dev_workspace->H.entries, 
-			sizeof (sparse_matrix_entry) * H->m, cudaMemcpyDeviceToHost, "H:m");
-	copy_host_device ( test.start, dev_workspace->H.start, sizeof (int)* (H->cap), cudaMemcpyDeviceToHost, "H:start");
-	copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (H->cap), cudaMemcpyDeviceToHost, "H:end");
-
-	count = 0; 
-	for (int i = 0; i < 1; i++) {
-		for (int j = test.start[i]; j < test.end[i]; j++) {
-			sparse_matrix_entry *src = &test.entries[j];
-			fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val);
-		}    
-	}
-	fprintf (stderr, "--------------- ");
-
-	free (test.start);
-	free (test.end);
-	free (test.entries);
-
-	return SUCCESS;
+    sparse_matrix test;
+    int index, count;
+
+    test.start = (int *) malloc (sizeof (int) * (H->cap)); 
+    test.end = (int *) malloc (sizeof (int) * (H->cap)); 
+
+    test.entries = (sparse_matrix_entry *) malloc (sizeof (sparse_matrix_entry) * (H->m));
+    memset (test.entries, 0xFF, sizeof (sparse_matrix_entry) * H->m);
+
+    copy_host_device ( test.entries, dev_workspace->H.entries, 
+            sizeof (sparse_matrix_entry) * H->m, cudaMemcpyDeviceToHost, "H:m");
+    copy_host_device ( test.start, dev_workspace->H.start, sizeof (int)* (H->cap), cudaMemcpyDeviceToHost, "H:start");
+    copy_host_device ( test.end , dev_workspace->H.end, sizeof (int) * (H->cap), cudaMemcpyDeviceToHost, "H:end");
+
+    count = 0; 
+    for (int i = 0; i < 1; i++) {
+        for (int j = test.start[i]; j < test.end[i]; j++) {
+            sparse_matrix_entry *src = &test.entries[j];
+            fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val);
+        }    
+    }
+    fprintf (stderr, "--------------- ");
+
+    free (test.start);
+    free (test.end);
+    free (test.entries);
+
+    return SUCCESS;
 }
 
 int print_sparse_matrix_host (sparse_matrix *H)
 {
-	int index, count;
-
-	count = 0; 
-	for (int i = 0; i < 1; i++) {
-		for (int j = H->start[i]; j < H->end[i]; j++) {
-			sparse_matrix_entry *src = &H->entries[j];
-			fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val);
-		}    
-	}
-	fprintf (stderr, "--------------- ");
-	return SUCCESS;
+    int index, count;
+
+    count = 0; 
+    for (int i = 0; i < 1; i++) {
+        for (int j = H->start[i]; j < H->end[i]; j++) {
+            sparse_matrix_entry *src = &H->entries[j];
+            fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val);
+        }    
+    }
+    fprintf (stderr, "--------------- ");
+    return SUCCESS;
 }
 
 int print_host_rvec2 (rvec2 *a, int n)
 {
-	for (int i = 0; i < n; i++)
-		fprintf (stderr, "a[%f][%f] \n", a[i][0], a[i][1]);
-	fprintf (stderr, " ---------------------------------\n");
+    for (int i = 0; i < n; i++)
+        fprintf (stderr, "a[%f][%f] \n", a[i][0], a[i][1]);
+    fprintf (stderr, " ---------------------------------\n");
 
-	return SUCCESS;
+    return SUCCESS;
 }
 
 int print_device_rvec2 (rvec2 *b, int n)
 {
-	rvec2 *a = (rvec2 *) host_scratch;	
+    rvec2 *a = (rvec2 *) host_scratch;    
 
-	copy_host_device (a, b, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, "rvec2");
+    copy_host_device (a, b, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, "rvec2");
 
-	return print_host_rvec2 (a, n);
+    return print_host_rvec2 (a, n);
 }
 
 int print_host_array (real *a, int n)
 {
 
-	for (int i = 0; i < n; i++)
-		fprintf (stderr," a[%d] = %f \n", i, a[i]);
-	fprintf(stderr, " ----------------------------------\n");
-	return SUCCESS;
+    for (int i = 0; i < n; i++)
+        fprintf (stderr," a[%d] = %f \n", i, a[i]);
+    fprintf(stderr, " ----------------------------------\n");
+    return SUCCESS;
 }
 
 int print_device_array (real *a, int n)
 {
-	real *b = (real *) host_scratch;
-	copy_host_device (b, a, sizeof (real) * n, cudaMemcpyDeviceToHost, "real");
-	print_host_array (b, n);
+    real *b = (real *) host_scratch; /* host staging buffer; assumed to hold at least n reals -- TODO confirm */
+    copy_host_device (b, a, sizeof (real) * n, cudaMemcpyDeviceToHost, "real");
+    return print_host_array (b, n); /* return the printer's status; this int function previously fell off its end */
 }
 
 int check_zeros_host (rvec2 *host, int n, char *msg)
 {
-	int count, count1;
-	count = count1 = 0;
-	for (int i = 0; i < n; i++){
-		if (host[i][0] == 0) count ++;
-		if (host[i][1] == 0) count1 ++;
-	}
+    int count, count1;
+    count = count1 = 0;
+    for (int i = 0; i < n; i++){
+        if (host[i][0] == 0) count ++;
+        if (host[i][1] == 0) count1 ++;
+    }
 
-	fprintf (stderr, "%s has %d, %d zero elements \n", msg, count, count1 );
+    fprintf (stderr, "%s has %d, %d zero elements \n", msg, count, count1 );
 
-	return 1;
+    return 1;
 }
 
 int check_zeros_device (rvec2 *device, int n, char *msg)
 {
-	rvec2 *a = (rvec2 *) host_scratch;	
+    rvec2 *a = (rvec2 *) host_scratch;    
 
-	copy_host_device (a, device, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, msg);
+    copy_host_device (a, device, sizeof (rvec2) * n, cudaMemcpyDeviceToHost, msg);
 
-	check_zeros_host (a, n, msg);
+    check_zeros_host (a, n, msg);
 
-	return 1;
+    return 1;
 }
diff --git a/PG-PuReMD/src/vector.cu b/PG-PuReMD/src/vector.cu
index 2cfa0b41..489477f2 100644
--- a/PG-PuReMD/src/vector.cu
+++ b/PG-PuReMD/src/vector.cu
@@ -29,494 +29,494 @@ extern "C"  {
 #endif
 
 
-	inline int Vector_isZero( real* v, int k )
-	{
-		for( --k; k>=0; --k )
-			if( fabs( v[k] ) > ALMOST_ZERO )
-				return 0;
+    inline int Vector_isZero( real* v, int k )
+    {
+        for( --k; k>=0; --k )
+            if( fabs( v[k] ) > ALMOST_ZERO )
+                return 0;
 
-		return 1;
-	}
+        return 1;
+    }
 
 
-	inline void Vector_MakeZero( real *v, int k )
-	{
-		for( --k; k>=0; --k )
-			v[k] = 0;
-	}
+    inline void Vector_MakeZero( real *v, int k )
+    {
+        for( --k; k>=0; --k )
+            v[k] = 0;
+    }
 
 
-	inline void Vector_Copy( real* dest, real* v, int k )
-	{
-		for( --k; k>=0; --k )
-			dest[k] = v[k];
-	}
+    inline void Vector_Copy( real* dest, real* v, int k )
+    {
+        for( --k; k>=0; --k )
+            dest[k] = v[k];
+    }
 
 
-	inline void Vector_Scale( real* dest, real c, real* v, int k )
-	{
-		for( --k; k>=0; --k )
-			dest[k] = c * v[k];
-	}
+    inline void Vector_Scale( real* dest, real c, real* v, int k )
+    {
+        for( --k; k>=0; --k )
+            dest[k] = c * v[k];
+    }
 
 
-	inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
-	{
-		for( --k; k>=0; --k )
-			dest[k] = c * v[k] + d * y[k];
-	}
+    inline void Vector_Sum( real* dest, real c, real* v, real d, real* y, int k )
+    {
+        for( --k; k>=0; --k )
+            dest[k] = c * v[k] + d * y[k];
+    }
 
 
-	inline void Vector_Add( real* dest, real c, real* v, int k )
-	{
-		for( --k; k>=0; --k )
-			dest[k] += c * v[k];
-	}
+    inline void Vector_Add( real* dest, real c, real* v, int k )
+    {
+        for( --k; k>=0; --k )
+            dest[k] += c * v[k];
+    }
 
 
-	inline real Dot( real* v1, real* v2, int k )
-	{
-		real ret = 0;
+    inline real Dot( real* v1, real* v2, int k )
+    {
+        real ret = 0;
 
-		for( --k; k>=0; --k )
-			ret +=  v1[k] * v2[k];
+        for( --k; k>=0; --k )
+            ret +=  v1[k] * v2[k];
 
-		return ret;
-	}
+        return ret;
+    }
 
 
-	inline real Norm( real* v1, int k )
-	{
-		real ret = 0;
+    inline real Norm( real* v1, int k )
+    {
+        real ret = 0;
 
-		for( --k; k>=0; --k )
-			ret +=  SQR( v1[k] );
+        for( --k; k>=0; --k )
+            ret +=  SQR( v1[k] );
 
-		return SQRT( ret );
-	}
+        return SQRT( ret );
+    }
 
 
-	inline void Vector_Print( FILE *fout, char *vname, real *v, int k )
-	{
-		int i;
+    inline void Vector_Print( FILE *fout, char *vname, real *v, int k )
+    {
+        int i;
 
-		fprintf( fout, "%s:", vname );
-		for( i = 0; i < k; ++i )
-			fprintf( fout, "%24.15e\n", v[i] );
-		fprintf( fout, "\n" );
-	}
+        fprintf( fout, "%s:", vname );
+        for( i = 0; i < k; ++i )
+            fprintf( fout, "%24.15e\n", v[i] );
+        fprintf( fout, "\n" );
+    }
 
 
-	void rvec_Copy( rvec dest, rvec src )
-	{
-		dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-	}
+    void rvec_Copy( rvec dest, rvec src )
+    {
+        dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    }
 
-	inline void rvec_Scale( rvec ret, real c, rvec v )
-	{
-		ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
-	}
+    inline void rvec_Scale( rvec ret, real c, rvec v )
+    {
+        ret[0] = c * v[0], ret[1] = c * v[1], ret[2] = c * v[2];
+    }
 
 
-	inline void rvec_Add( rvec ret, rvec v )
-	{
-		ret[0] += v[0], ret[1] += v[1], ret[2] += v[2];
-	}
+    inline void rvec_Add( rvec ret, rvec v )
+    {
+        ret[0] += v[0], ret[1] += v[1], ret[2] += v[2];
+    }
 
 
-	inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
-	{
-		ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
-	}
+    inline void rvec_ScaledAdd( rvec ret, real c, rvec v )
+    {
+        ret[0] += c * v[0], ret[1] += c * v[1], ret[2] += c * v[2];
+    }
 
 
-	inline void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
-	{
-		ret[0] = v1[0] + v2[0];
-		ret[1] = v1[1] + v2[1];
-		ret[2] = v1[2] + v2[2];
-	}
+    inline void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
+    {
+        ret[0] = v1[0] + v2[0];
+        ret[1] = v1[1] + v2[1];
+        ret[2] = v1[2] + v2[2];
+    }
 
 
-	inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 ,real c2, rvec v2 )
-	{
-		ret[0] = c1 * v1[0] + c2 * v2[0]; 
-		ret[1] = c1 * v1[1] + c2 * v2[1];
-		ret[2] = c1 * v1[2] + c2 * v2[2];
-	}
+    inline void rvec_ScaledSum( rvec ret, real c1, rvec v1 ,real c2, rvec v2 )
+    {
+        ret[0] = c1 * v1[0] + c2 * v2[0]; 
+        ret[1] = c1 * v1[1] + c2 * v2[1];
+        ret[2] = c1 * v1[2] + c2 * v2[2];
+    }
 
 
-	inline real rvec_Dot( rvec v1, rvec v2 )
-	{
-		return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
-	}
+    inline real rvec_Dot( rvec v1, rvec v2 )
+    {
+        return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
+    }
 
 
-	inline real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
-	{
-		return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
-	}
+    inline real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
+    {
+        return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
+    }
 
 
-	inline void rvec_Multiply( rvec r, rvec v1, rvec v2 )
-	{
-		r[0] = v1[0] * v2[0];
-		r[1] = v1[1] * v2[1];
-		r[2] = v1[2] * v2[2];
-	}
+    inline void rvec_Multiply( rvec r, rvec v1, rvec v2 )
+    {
+        r[0] = v1[0] * v2[0];
+        r[1] = v1[1] * v2[1];
+        r[2] = v1[2] * v2[2];
+    }
 
 
-	inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
-	{
-		r[0] = v1[0] * v2[0];
-		r[1] = v1[1] * v2[1];
-		r[2] = v1[2] * v2[2];
-	}
+    inline void rvec_iMultiply( rvec r, ivec v1, rvec v2 )
+    {
+        r[0] = v1[0] * v2[0];
+        r[1] = v1[1] * v2[1];
+        r[2] = v1[2] * v2[2];
+    }
 
 
-	inline void rvec_Divide( rvec r, rvec v1, rvec v2 )
-	{
-		r[0] = v1[0] / v2[0];
-		r[1] = v1[1] / v2[1];
-		r[2] = v1[2] / v2[2];
-	}
+    inline void rvec_Divide( rvec r, rvec v1, rvec v2 )
+    {
+        r[0] = v1[0] / v2[0];
+        r[1] = v1[1] / v2[1];
+        r[2] = v1[2] / v2[2];
+    }
 
 
-	inline void rvec_iDivide( rvec r, rvec v1, ivec v2 )
-	{
-		r[0] = v1[0] / v2[0];
-		r[1] = v1[1] / v2[1];
-		r[2] = v1[2] / v2[2];
-	}
+    inline void rvec_iDivide( rvec r, rvec v1, ivec v2 )
+    {
+        r[0] = v1[0] / v2[0];
+        r[1] = v1[1] / v2[1];
+        r[2] = v1[2] / v2[2];
+    }
 
 
-	inline void rvec_Invert( rvec r, rvec v )
-	{
-		r[0] = 1. / v[0];
-		r[1] = 1. / v[1];
-		r[2] = 1. / v[2];
-	}
+    inline void rvec_Invert( rvec r, rvec v )
+    {
+        r[0] = 1. / v[0];
+        r[1] = 1. / v[1];
+        r[2] = 1. / v[2];
+    }
 
 
-	inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
-	{
-		ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
-		ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
-		ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
-	}
+    inline void rvec_Cross( rvec ret, rvec v1, rvec v2 )
+    {
+        ret[0] = v1[1] * v2[2] - v1[2] * v2[1];
+        ret[1] = v1[2] * v2[0] - v1[0] * v2[2];
+        ret[2] = v1[0] * v2[1] - v1[1] * v2[0];
+    }
 
 
-	inline void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
-	{
-		int i, j;
+    inline void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
+    {
+        int i, j;
 
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				r[i][j] = v1[i] * v2[j];
-	}
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                r[i][j] = v1[i] * v2[j];
+    }
 
 
-	inline real rvec_Norm_Sqr( rvec v )
-	{
-		return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
-	}
+    inline real rvec_Norm_Sqr( rvec v )
+    {
+        return SQR(v[0]) + SQR(v[1]) + SQR(v[2]);
+    }
 
 
-	inline real rvec_Norm( rvec v )
-	{
-		return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
-	}
+    inline real rvec_Norm( rvec v )
+    {
+        return SQRT( SQR(v[0]) + SQR(v[1]) + SQR(v[2]) );
+    }
 
 
-	inline int rvec_isZero( rvec v )
-	{
-		if( fabs(v[0]) > ALMOST_ZERO || 
-				fabs(v[1]) > ALMOST_ZERO || 
-				fabs(v[2]) > ALMOST_ZERO )
-			return 0;
-		return 1;
-	}
+    inline int rvec_isZero( rvec v )
+    {
+        if( fabs(v[0]) > ALMOST_ZERO || 
+                fabs(v[1]) > ALMOST_ZERO || 
+                fabs(v[2]) > ALMOST_ZERO )
+            return 0;
+        return 1;
+    }
 
-	inline void rvec_MakeZero( rvec v )
-	{
-		//  v[0] = v[1] = v[2] = 0.0000000000000;
-		v[0] = v[1] = v[2] = 0.000000000000000e+00;
-	}
+    inline void rvec_MakeZero( rvec v )
+    {
+        //  v[0] = v[1] = v[2] = 0.0000000000000;
+        v[0] = v[1] = v[2] = 0.000000000000000e+00;
+    }
 
 
 #if defined(PURE_REAX)
-	inline void rvec_Random( rvec v )
-	{
-		v[0] = Random(2.0)-1.0;
-		v[1] = Random(2.0)-1.0;
-		v[2] = Random(2.0)-1.0;
-	}
+    inline void rvec_Random( rvec v )
+    {
+        v[0] = Random(2.0)-1.0;
+        v[1] = Random(2.0)-1.0;
+        v[2] = Random(2.0)-1.0;
+    }
 #endif
 
 
-	inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
-	{
-		int i, j, k;
-		rtensor temp;
-
-		// check if the result matrix is the same as one of m1, m2.
-		// if so, we cannot modify the contents of m1 or m2, so 
-		// we have to use a temp matrix.
-		if( ret == m1 || ret == m2 )
-		{
-			for( i = 0; i < 3; ++i )
-				for( j = 0; j < 3; ++j )
-				{
-					temp[i][j] = 0;	    
-					for( k = 0; k < 3; ++k )
-						temp[i][j] += m1[i][k] * m2[k][j];
-				}
-
-			for( i = 0; i < 3; ++i )
-				for( j = 0; j < 3; ++j )
-					ret[i][j] = temp[i][j];	
-		}
-		else
-		{
-			for( i = 0; i < 3; ++i )
-				for( j = 0; j < 3; ++j )
-				{
-					ret[i][j] = 0;	    
-					for( k = 0; k < 3; ++k )
-						ret[i][j] += m1[i][k] * m2[k][j];
-				}
-		}
-	}
-
-
-	inline void rtensor_MatVec( rvec ret, rtensor m, rvec v )
-	{
-		int i;
-		rvec temp;
-
-		// if ret is the same vector as v, we cannot modify the 
-		// contents of v until all computation is finished.
-		if( ret == v )
-		{
-			for( i = 0; i < 3; ++i )
-				temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-
-			for( i = 0; i < 3; ++i )
-				ret[i] = temp[i];
-		}
-		else
-		{
-			for( i = 0; i < 3; ++i )
-				ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-		}
-	}
-
-
-	inline void rtensor_Scale( rtensor ret, real c, rtensor m )
-	{
-		int i, j;
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] = c * m[i][j];
-	}
-
-
-	inline void rtensor_Add( rtensor ret, rtensor t )
-	{
-		int i, j;
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] += t[i][j];
-	}
-
-
-	inline void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
-	{
-		int i, j;
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] += c * t[i][j];
-	}
-
-
-	inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
-	{
-		int i, j;
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] = t1[i][j] + t2[i][j];
-	}
-
-
-	inline void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, 
-			real c2, rtensor t2 )
-	{
-		int i, j;
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
-	}
-
-
-	inline void rtensor_Copy( rtensor ret, rtensor t )
-	{
-		int i, j;
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] = t[i][j];
-	}
+    inline void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
+    {
+        int i, j, k;
+        rtensor temp;
+
+        // check if the result matrix is the same as one of m1, m2.
+        // if so, we cannot modify the contents of m1 or m2, so 
+        // we have to use a temp matrix.
+        if( ret == m1 || ret == m2 )
+        {
+            for( i = 0; i < 3; ++i )
+                for( j = 0; j < 3; ++j )
+                {
+                    temp[i][j] = 0;        
+                    for( k = 0; k < 3; ++k )
+                        temp[i][j] += m1[i][k] * m2[k][j];
+                }
+
+            for( i = 0; i < 3; ++i )
+                for( j = 0; j < 3; ++j )
+                    ret[i][j] = temp[i][j];    
+        }
+        else
+        {
+            for( i = 0; i < 3; ++i )
+                for( j = 0; j < 3; ++j )
+                {
+                    ret[i][j] = 0;        
+                    for( k = 0; k < 3; ++k )
+                        ret[i][j] += m1[i][k] * m2[k][j];
+                }
+        }
+    }
+
+
+    inline void rtensor_MatVec( rvec ret, rtensor m, rvec v )
+    {
+        int i;
+        rvec temp;
+
+        // if ret is the same vector as v, we cannot modify the 
+        // contents of v until all computation is finished.
+        if( ret == v )
+        {
+            for( i = 0; i < 3; ++i )
+                temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+
+            for( i = 0; i < 3; ++i )
+                ret[i] = temp[i];
+        }
+        else
+        {
+            for( i = 0; i < 3; ++i )
+                ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+        }
+    }
+
+
+    inline void rtensor_Scale( rtensor ret, real c, rtensor m )
+    {
+        int i, j;
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] = c * m[i][j];
+    }
+
+
+    inline void rtensor_Add( rtensor ret, rtensor t )
+    {
+        int i, j;
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] += t[i][j];
+    }
+
+
+    inline void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
+    {
+        int i, j;
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] += c * t[i][j];
+    }
+
+
+    inline void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
+    {
+        int i, j;
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] = t1[i][j] + t2[i][j];
+    }
+
+
+    inline void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, 
+            real c2, rtensor t2 )
+    {
+        int i, j;
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
+    }
+
+
+    inline void rtensor_Copy( rtensor ret, rtensor t )
+    {
+        int i, j;
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] = t[i][j];
+    }
 
-
-	inline void rtensor_Identity( rtensor t )
-	{
-		t[0][0] = t[1][1] = t[2][2] = 1;
-		t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0;
-	}
+
+    inline void rtensor_Identity( rtensor t )
+    {
+        t[0][0] = t[1][1] = t[2][2] = 1;
+        t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = 0;
+    }
 
 
-	inline void rtensor_MakeZero( rtensor t )
-	{
-		t[0][0] = t[0][1] = t[0][2] = 0;
-		t[1][0] = t[1][1] = t[1][2] = 0;
-		t[2][0] = t[2][1] = t[2][2] = 0;
-	}
+    inline void rtensor_MakeZero( rtensor t )
+    {
+        t[0][0] = t[0][1] = t[0][2] = 0;
+        t[1][0] = t[1][1] = t[1][2] = 0;
+        t[2][0] = t[2][1] = t[2][2] = 0;
+    }
 
 
-	inline void rtensor_Transpose( rtensor ret, rtensor t )
-	{
-		ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-		ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-		ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-		ret[2][0] = t[0][2], ret[2][1] = t[1][2];
-	}
+    inline void rtensor_Transpose( rtensor ret, rtensor t )
+    {
+        ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
+        ret[0][1] = t[1][0], ret[0][2] = t[2][0];
+        ret[1][0] = t[0][1], ret[1][2] = t[2][1];
+        ret[2][0] = t[0][2], ret[2][1] = t[1][2];
+    }
 
 
-	inline real rtensor_Det( rtensor t )
-	{
-		return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-				t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-				t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
-	}
+    inline real rtensor_Det( rtensor t )
+    {
+        return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
+                t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+                t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+    }
 
 
-	inline real rtensor_Trace( rtensor t )
-	{
-		return (t[0][0] + t[1][1] + t[2][2]);
-	}
+    inline real rtensor_Trace( rtensor t )
+    {
+        return (t[0][0] + t[1][1] + t[2][2]);
+    }
 
 
-	inline void Print_rTensor(FILE* fp, rtensor t)
-	{
-		int i, j;
+    inline void Print_rTensor(FILE* fp, rtensor t)
+    {
+        int i, j;
 
-		for (i=0; i < 3; i++)
-		{
-			fprintf(fp,"[");
-			for (j=0; j < 3; j++)
-				fprintf(fp,"%8.3f,\t",t[i][j]);
-			fprintf(fp,"]\n");
-		}
-	}
+        for (i=0; i < 3; i++)
+        {
+            fprintf(fp,"[");
+            for (j=0; j < 3; j++)
+                fprintf(fp,"%8.3f,\t",t[i][j]);
+            fprintf(fp,"]\n");
+        }
+    }
 
 
-	inline void ivec_MakeZero( ivec v )
-	{
-		// LGJ  v[0] = v[1] = v[2] = 0;
-		v[0] = v[1] = v[2] = 0.000000000000000e+00;
-	}
+    inline void ivec_MakeZero( ivec v )
+    {
+        // Zero all three components; integer literal, since the ivec helpers here treat components as ints.
+        v[0] = v[1] = v[2] = 0;
+    }
 
 
-	inline void ivec_Copy( ivec dest, ivec src )
-	{
-		dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
-	}
+    inline void ivec_Copy( ivec dest, ivec src )
+    {
+        dest[0] = src[0], dest[1] = src[1], dest[2] = src[2];
+    }
 
 
-	inline void ivec_Scale( ivec dest, real C, ivec src )
-	{
-		dest[0] = (int)(C * src[0]);
-		dest[1] = (int)(C * src[1]);
-		dest[2] = (int)(C * src[2]);
-	}
+    inline void ivec_Scale( ivec dest, real C, ivec src )
+    {
+        dest[0] = (int)(C * src[0]);
+        dest[1] = (int)(C * src[1]);
+        dest[2] = (int)(C * src[2]);
+    }
 
 
-	inline void ivec_rScale( ivec dest, real C, rvec src )
-	{
-		dest[0] = (int)(C * src[0]);
-		dest[1] = (int)(C * src[1]);
-		dest[2] = (int)(C * src[2]);
-	}
+    inline void ivec_rScale( ivec dest, real C, rvec src )
+    {
+        dest[0] = (int)(C * src[0]);
+        dest[1] = (int)(C * src[1]);
+        dest[2] = (int)(C * src[2]);
+    }
 
 
-	inline int ivec_isZero( ivec v )
-	{
-		if( v[0]==0 && v[1]==0 && v[2]==0 )
-			return 1;
-		return 0;
-	}
+    inline int ivec_isZero( ivec v )
+    {
+        if( v[0]==0 && v[1]==0 && v[2]==0 )
+            return 1;
+        return 0;
+    }
 
 
-	inline int ivec_isEqual( ivec v1, ivec v2 )
-	{
-		if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
-			return 1;
-		return 0;
-	}
+    inline int ivec_isEqual( ivec v1, ivec v2 )
+    {
+        if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
+            return 1;
+        return 0;
+    }
 
 
-	inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
-	{
-		dest[0] = v1[0] + v2[0];
-		dest[1] = v1[1] + v2[1];
-		dest[2] = v1[2] + v2[2];
-	}
+    inline void ivec_Sum( ivec dest, ivec v1, ivec v2 )
+    {
+        dest[0] = v1[0] + v2[0];
+        dest[1] = v1[1] + v2[1];
+        dest[2] = v1[2] + v2[2];
+    }
 
 
-	inline void ivec_ScaledSum( ivec dest, int k1, ivec v1, int k2, ivec v2 )
-	{
-		dest[0] = k1*v1[0] + k2*v2[0];
-		dest[1] = k1*v1[1] + k2*v2[1];
-		dest[2] = k1*v1[2] + k2*v2[2];
-	}
+    inline void ivec_ScaledSum( ivec dest, int k1, ivec v1, int k2, ivec v2 )
+    {
+        dest[0] = k1*v1[0] + k2*v2[0];
+        dest[1] = k1*v1[1] + k2*v2[1];
+        dest[2] = k1*v1[2] + k2*v2[2];
+    }
 
 
-	inline void ivec_Add( ivec dest, ivec v )
-	{
-		dest[0] += v[0];
-		dest[1] += v[1];
-		dest[2] += v[2];
-	}
+    inline void ivec_Add( ivec dest, ivec v )
+    {
+        dest[0] += v[0];
+        dest[1] += v[1];
+        dest[2] += v[2];
+    }
 
 
-	inline void ivec_ScaledAdd( ivec dest, int k, ivec v )
-	{
-		dest[0] += k * v[0];
-		dest[1] += k * v[1];
-		dest[2] += k * v[2];
-	}
+    inline void ivec_ScaledAdd( ivec dest, int k, ivec v )
+    {
+        dest[0] += k * v[0];
+        dest[1] += k * v[1];
+        dest[2] += k * v[2];
+    }
 
 
 
-	inline void ivec_Max( ivec res, ivec v1, ivec v2 )
-	{
-		res[0] = MAX( v1[0], v2[0] );
-		res[1] = MAX( v1[1], v2[1] );
-		res[2] = MAX( v1[2], v2[2] );
-	}
+    inline void ivec_Max( ivec res, ivec v1, ivec v2 )
+    {
+        res[0] = MAX( v1[0], v2[0] );
+        res[1] = MAX( v1[1], v2[1] );
+        res[2] = MAX( v1[2], v2[2] );
+    }
 
 
-	inline void ivec_Max3( ivec res, ivec v1, ivec v2, ivec v3 )
-	{
-		res[0] = MAX3( v1[0], v2[0], v3[0] );
-		res[1] = MAX3( v1[1], v2[1], v3[1] );
-		res[2] = MAX3( v1[2], v2[2], v3[2] );
-	}
+    inline void ivec_Max3( ivec res, ivec v1, ivec v2, ivec v3 )
+    {
+        res[0] = MAX3( v1[0], v2[0], v3[0] );
+        res[1] = MAX3( v1[1], v2[1], v3[1] );
+        res[2] = MAX3( v1[2], v2[2], v3[2] );
+    }
 
 #ifdef __cplusplus
 }
diff --git a/PuReMD-GPU/src/GMRES.cu b/PuReMD-GPU/src/GMRES.cu
index 011c4eeb..d00100e9 100644
--- a/PuReMD-GPU/src/GMRES.cu
+++ b/PuReMD-GPU/src/GMRES.cu
@@ -34,186 +34,186 @@
 
 void Sparse_MatVec( sparse_matrix *A, real *x, real *b )
 {
-	int i, j, k, n, si, ei;
-	real H;
-
-	n = A->n;
-	for( i = 0; i < n; ++i )
-		b[i] = 0;
-
-	for( i = 0; i < n; ++i ) {
-		si = A->start[i];
-		ei = A->start[i+1]-1;
-
-		for( k = si; k < ei; ++k ) {
-			j = A->entries[k].j;
-			H = A->entries[k].val;
-			b[j] += H * x[i]; 
-			b[i] += H * x[j];
-		}
-
-		// the diagonal entry is the last one in
-		b[i] += A->entries[k].val * x[i]; 
-	}
+    int i, j, k, n, si, ei;
+    real H;
+
+    n = A->n;
+    for( i = 0; i < n; ++i )
+        b[i] = 0;
+
+    for( i = 0; i < n; ++i ) {
+        si = A->start[i];
+        ei = A->start[i+1]-1;
+
+        for( k = si; k < ei; ++k ) {
+            j = A->entries[k].j;
+            H = A->entries[k].val;
+            b[j] += H * x[i]; 
+            b[i] += H * x[j];
+        }
+
+        // the diagonal entry is the last one in the row
+        b[i] += A->entries[k].val * x[i]; 
+    }
 }
 
 
 void Forward_Subs( sparse_matrix *L, real *b, real *y )
 {
-	int i, pj, j, si, ei;
-	real val;
-
-	for( i = 0; i < L->n; ++i ) {
-		y[i] = b[i];
-		si = L->start[i];
-		ei = L->start[i+1];
-		for( pj = si; pj < ei-1; ++pj ){
-			j = L->entries[pj].j;
-			val = L->entries[pj].val;
-			y[i] -= val * y[j];
-		}
-		y[i] /= L->entries[pj].val;
-	}
+    int i, pj, j, si, ei;
+    real val;
+
+    for( i = 0; i < L->n; ++i ) {
+        y[i] = b[i];
+        si = L->start[i];
+        ei = L->start[i+1];
+        for( pj = si; pj < ei-1; ++pj ){
+            j = L->entries[pj].j;
+            val = L->entries[pj].val;
+            y[i] -= val * y[j];
+        }
+        y[i] /= L->entries[pj].val;
+    }
 }
 
 
 void Backward_Subs( sparse_matrix *U, real *y, real *x )
 {
-	int i, pj, j, si, ei;
-	real val;
-
-	for( i = U->n-1; i >= 0; --i ) {
-		x[i] = y[i];
-		si = U->start[i];
-		ei = U->start[i+1];
-		for( pj = si+1; pj < ei; ++pj ){
-			j = U->entries[pj].j;
-			val = U->entries[pj].val;
-			x[i] -= val * x[j];
-		}
-		x[i] /= U->entries[si].val;
-	}
+    int i, pj, j, si, ei;
+    real val;
+
+    for( i = U->n-1; i >= 0; --i ) {
+        x[i] = y[i];
+        si = U->start[i];
+        ei = U->start[i+1];
+        for( pj = si+1; pj < ei; ++pj ){
+            j = U->entries[pj].j;
+            val = U->entries[pj].val;
+            x[i] -= val * x[j];
+        }
+        x[i] /= U->entries[si].val;
+    }
 }
 
 
 int GMRES( static_storage *workspace, sparse_matrix *H, 
-		real *b, real tol, real *x, FILE *fout, reax_system* system)
+        real *b, real tol, real *x, FILE *fout, reax_system* system)
 {
-	int i, j, k, itr, N;
-	real cc, tmp1, tmp2, temp, bnorm;
-
-	N = H->n;
-	bnorm = Norm( b, N );
-
-	/* apply the diagonal pre-conditioner to rhs */
-	for( i = 0; i < N; ++i )
-		workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
-
-	/* GMRES outer-loop */
-	for( itr = 0; itr < MAX_ITR; ++itr ) {
-		/* calculate r0 */
-		Sparse_MatVec( H, x, workspace->b_prm );      
-
-		for( i = 0; i < N; ++i )
-			workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */    
-
-
-		Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-		workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
-		Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system)], N );
-
-		/* GMRES inner-loop */
-		for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
-			/* matvec */
-			Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-
-			for( k = 0; k < N; ++k )  
-				workspace->v[ index_wkspace_sys (j+1,k,system)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ 
-
-			/* apply modified Gram-Schmidt to orthogonalize the new residual */
-			for( i = 0; i <= j; i++ ) {
-				workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-				Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)], 
-						-workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N );
-			}
-
-
-			workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-			Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], 
-					1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-			// fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-
-
-			/* Givens rotations on the upper-Hessenberg matrix to make it U */
-			for( i = 0; i <= j; i++ )	{
-				if( i == j ) {
-					cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-					workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-					workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
-				}
-
-				tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-					workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ];
-				tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-					workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ];
-
-				workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-				workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-			} 
-
-			/* apply Givens rotations to the rhs as well */
-			tmp1 =  workspace->hc[j] * workspace->g[j];
-			tmp2 = -workspace->hs[j] * workspace->g[j];
-			workspace->g[j] = tmp1;
-			workspace->g[j+1] = tmp2;
-
-			// fprintf( stderr, "h: " );
-			// for( i = 0; i <= j+1; ++i )
-			//  fprintf( stderr, "%.6f ", workspace->h[i][j] );
-			// fprintf( stderr, "\n" );
-			//fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-		}
-
-
-		/* solve Hy = g.
-		   H is now upper-triangular, do back-substitution */
-		for( i = j-1; i >= 0; i-- ) {
-			temp = workspace->g[i];	  
-			for( k = j-1; k > i; k-- )
-				temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-
-			workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
-		}
-
-
-		/* update x = x_0 + Vy */
-		for( i = 0; i < j; i++ )
-			Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N );
-
-		/* stopping condition */
-		if( fabs(workspace->g[j]) / bnorm <= tol )
-			break;
-	}
-
-	// Sparse_MatVec( H, x, workspace->b_prm );
-	// for( i = 0; i < N; ++i )
-	// workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-	// fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-	// for( i = 0; i < N; ++i )
-	// fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-	// workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-	// fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-	//	      itr, j, fabs( workspace->g[j] ) / bnorm );
-	// data->timing.matvec += itr * RESTART + j;
-
-	if( itr >= MAX_ITR ) {
-		fprintf( stderr, "GMRES convergence failed\n" );
-		// return -1;
-		return itr * (RESTART+1) + j + 1;
-	}
-
-	return itr * (RESTART+1) + j + 1;
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+
+    N = H->n;
+    bnorm = Norm( b, N );
+
+    /* apply the diagonal pre-conditioner to rhs */
+    for( i = 0; i < N; ++i )
+        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
+
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        Sparse_MatVec( H, x, workspace->b_prm );      
+
+        for( i = 0; i < N; ++i )
+            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */    
+
+
+        Vector_Sum(&workspace->v[ index_wkspace_sys (0,0,system) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
+        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
+        Vector_Scale( &workspace->v[ index_wkspace_sys (0,0,system) ], 1.0/workspace->g[0], &workspace->v[index_wkspace_sys(0,0,system)], N );
+
+        /* GMRES inner-loop */
+        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
+
+            for( k = 0; k < N; ++k )  
+                workspace->v[ index_wkspace_sys (j+1,k,system)] *= workspace->Hdia_inv[k]; /*pre-conditioner*/ 
+
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i <= j; i++ ) {
+                workspace->h[ index_wkspace_res (i,j) ] = Dot( &workspace->v[index_wkspace_sys(i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)], 
+                        -workspace->h[index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N );
+            }
+
+
+            workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], 
+                    1. / workspace->h[ index_wkspace_res (j+1,j) ], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+
+
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            for( i = 0; i <= j; i++ )    {
+                if( i == j ) {
+                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
+                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
+                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
+                }
+
+                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
+                    workspace->hs[i] * workspace->h[ index_wkspace_res (i+1,j) ];
+                tmp2 = -workspace->hs[i] * workspace->h[ index_wkspace_res (i,j) ] + 
+                    workspace->hc[i] * workspace->h[ index_wkspace_res (i+1,j) ];
+
+                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
+                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
+            } 
+
+            /* apply Givens rotations to the rhs as well */
+            tmp1 =  workspace->hc[j] * workspace->g[j];
+            tmp2 = -workspace->hs[j] * workspace->g[j];
+            workspace->g[j] = tmp1;
+            workspace->g[j+1] = tmp2;
+
+            // fprintf( stderr, "h: " );
+            // for( i = 0; i <= j+1; ++i )
+            //  fprintf( stderr, "%.6f ", workspace->h[i][j] );
+            // fprintf( stderr, "\n" );
+            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
+        }
+
+
+        /* solve Hy = g.
+           H is now upper-triangular, do back-substitution */
+        for( i = j-1; i >= 0; i-- ) {
+            temp = workspace->g[i];      
+            for( k = j-1; k > i; k-- )
+                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+
+            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
+        }
+
+
+        /* update x = x_0 + Vy */
+        for( i = 0; i < j; i++ )
+            Vector_Add( x, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N );
+
+        /* stopping condition */
+        if( fabs(workspace->g[j]) / bnorm <= tol )
+            break;
+    }
+
+    // Sparse_MatVec( H, x, workspace->b_prm );
+    // for( i = 0; i < N; ++i )
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+    // for( i = 0; i < N; ++i )
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // workspace->b_prc[i], workspace->b_prm[i], x[i] );
+
+    // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+    //          itr, j, fabs( workspace->g[j] ) / bnorm );
+    // data->timing.matvec += itr * RESTART + j;
+
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        // return -1;
+        return itr * (RESTART+1) + j + 1;
+    }
+
+    return itr * (RESTART+1) + j + 1;
 }
 
 
@@ -223,916 +223,916 @@ int GMRES( static_storage *workspace, sparse_matrix *H,
 
 GLOBAL void GMRES_Diagonal_Preconditioner (real *b_proc, real *b, real *Hdia_inv, int entries)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (i >= entries) return;
+    if (i >= entries) return;
 
-	b_proc [i] = b[i] * Hdia_inv[i];
+    b_proc [i] = b[i] * Hdia_inv[i];
 }
 
 GLOBAL void GMRES_Givens_Rotation (int j, real *h, real *hc, real *hs, real g_j, real *output)
 {
-	real tmp1, tmp2, cc;
+    real tmp1, tmp2, cc;
 
-	for( int i = 0; i <= j; i++ )	{
-		if( i == j ) {
-			cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) );
-			hc[j] = h[ index_wkspace_res (j,j) ] / cc;
-			hs[j] = h[ index_wkspace_res (j+1,j) ] / cc;
-		}
+    for( int i = 0; i <= j; i++ )    {
+        if( i == j ) {
+            cc = SQRT( SQR(h[ index_wkspace_res (j,j) ])+SQR(h[ index_wkspace_res (j+1,j) ]) );
+            hc[j] = h[ index_wkspace_res (j,j) ] / cc;
+            hs[j] = h[ index_wkspace_res (j+1,j) ] / cc;
+        }
 
-		tmp1 =  hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ];
-		tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ];
+        tmp1 =  hc[i] * h[ index_wkspace_res (i,j) ] + hs[i] * h[ index_wkspace_res (i+1,j) ];
+        tmp2 = -hs[i] * h[ index_wkspace_res (i,j) ] + hc[i] * h[ index_wkspace_res (i+1,j) ];
 
-		h[ index_wkspace_res (i,j) ] = tmp1;
-		h[ index_wkspace_res (i+1,j) ] = tmp2;
-	} 
+        h[ index_wkspace_res (i,j) ] = tmp1;
+        h[ index_wkspace_res (i+1,j) ] = tmp2;
+    } 
 
-	/* apply Givens rotations to the rhs as well */
-	tmp1 =  hc[j] * g_j;
-	tmp2 = -hs[j] * g_j;
+    /* apply Givens rotations to the rhs as well */
+    tmp1 =  hc[j] * g_j;
+    tmp2 = -hs[j] * g_j;
 
-	output[0] = tmp1;
-	output[1] = tmp2;
+    output[0] = tmp1;
+    output[1] = tmp2;
 }
 
 GLOBAL void GMRES_BackSubstitution (int j, real *g, real *h, real *y)
 {
-	real temp;
-	for( int i = j-1; i >= 0; i-- ) {
-		temp = g[i];	  
-		for( int k = j-1; k > i; k-- )
-			temp -= h[ index_wkspace_res (i,k) ] * y[k];
-
-		y[i] = temp / h[ index_wkspace_res (i,i) ];
-	}
+    real temp;
+    for( int i = j-1; i >= 0; i-- ) {
+        temp = g[i];      
+        for( int k = j-1; k > i; k-- )
+            temp -= h[ index_wkspace_res (i,k) ] * y[k];
+
+        y[i] = temp / h[ index_wkspace_res (i,i) ];
+    }
 }
 
 
 int Cuda_GMRES( static_storage *workspace, real *b, real tol, real *x )
 {
-	int i, j, k, itr, N;
-	real cc, tmp1, tmp2, temp, bnorm;
-	real v_add_tmp;
-	sparse_matrix *H = &workspace->H;
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    real v_add_tmp;
+    sparse_matrix *H = &workspace->H;
 
-	real t_start, t_elapsed;
+    real t_start, t_elapsed;
 
-	real *spad = (real *)scratch;
-	real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
+    real *spad = (real *)scratch;
+    real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
 
-	N = H->n;
+    N = H->n;
 
-	cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+    cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
 
-	Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "Norm of the array is %e \n", bnorm );
+    fprintf (stderr, "Norm of the array is %e \n", bnorm );
 #endif
 
-	/* apply the diagonal pre-conditioner to rhs */
-	GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-		(workspace->b_prc, b, workspace->Hdia_inv, N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	/* GMRES outer-loop */
-	for( itr = 0; itr < MAX_ITR; ++itr ) {
-		/* calculate r0 */
-		//Sparse_MatVec( H, x, workspace->b_prm );      
-		Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
-			(workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
-			(&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
-		{
-			cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
-
-			Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-				(&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
-		}
-
-		Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-			( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		/* GMRES inner-loop */
+    /* apply the diagonal pre-conditioner to rhs */
+    GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+        (workspace->b_prc, b, workspace->Hdia_inv, N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        //Sparse_MatVec( H, x, workspace->b_prm );      
+        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
+            (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
+            (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
+        {
+            cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+
+            Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+                (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
+        }
+
+        Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+            ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        /* GMRES inner-loop */
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
+        fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
 #endif
-		for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
-			/* matvec */
-			//Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-			Cuda_Matvec_csr 
-				<<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> 
-				( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-				(&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-
-			/* apply modified Gram-Schmidt to orthogonalize the new residual */
-			for( i = 0; i <= j; i++ ) {
-				Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-					(&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
-				cudaThreadSynchronize ();
-				cudaCheckError ();
-
-				Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
-				cudaThreadSynchronize ();
-				cudaCheckError ();
-
-				copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-				Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
-					( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-					  -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-				cudaThreadSynchronize ();
-				cudaCheckError ();
-			}
-
-
-			//workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-			cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
-
-			Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-			Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-				( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-				  1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			/* Givens rotations on the upper-Hessenberg matrix to make it U */
-			GMRES_Givens_Rotation <<<1, 1>>>
-				(j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-			copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-		}
-
-		copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-
-		/* solve Hy = g.
-		   H is now upper-triangular, do back-substitution */
-		copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-		GMRES_BackSubstitution <<<1, 1>>>
-			(j, spad, workspace->h, workspace->y);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		/* update x = x_0 + Vy */
-		for( i = 0; i < j; i++ )
-		{
-			copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
-				( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-		}
-
-		/* stopping condition */
-		if( fabs(g[j]) / bnorm <= tol )
-			break;
-	}
-
-	if( itr >= MAX_ITR ) {
-		fprintf( stderr, "GMRES convergence failed\n" );
-		return itr * (RESTART+1) + j + 1;
-	}
+        for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            //Sparse_MatVec( H, &workspace->v[index_wkspace_sys(j,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
+            Cuda_Matvec_csr 
+                <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> 
+                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+                (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i <= j; i++ ) {
+                Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+                    (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
+                cudaThreadSynchronize ();
+                cudaCheckError ();
+
+                Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
+                cudaThreadSynchronize ();
+                cudaCheckError ();
+
+                copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+
+                Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
+                    ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+                      -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+                cudaThreadSynchronize ();
+                cudaCheckError ();
+            }
+
+
+            //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+            cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
+
+            Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+
+            Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+                ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+                  1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            GMRES_Givens_Rotation <<<1, 1>>>
+                (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+        }
+
+        copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+
+        /* solve Hy = g.
+           H is now upper-triangular, do back-substitution */
+        copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
+        GMRES_BackSubstitution <<<1, 1>>>
+            (j, spad, workspace->h, workspace->y);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        /* update x = x_0 + Vy */
+        for( i = 0; i < j; i++ )
+        {
+            copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
+                ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+        }
+
+        /* stopping condition */
+        if( fabs(g[j]) / bnorm <= tol )
+            break;
+    }
+
+    /* host copy of g is not used past this point: free it (it was leaked on every call); both outcomes share the return below */
+    free( g );
+    if( itr >= MAX_ITR )
+        fprintf( stderr, "GMRES convergence failed\n" );
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
+    fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
 #endif
-	return itr * (RESTART+1) + j + 1;
+    return itr * (RESTART+1) + j + 1;
 }
 
 
 int Cublas_GMRES(reax_system *system, static_storage *workspace, real *b, real tol, real *x )
 {
 
-	real CSR_ALPHA = 1, CSR_BETA = 0;
+    real CSR_ALPHA = 1, CSR_BETA = 0;
 
-	int i, j, k, itr, N;
-	real cc, tmp1, tmp2, temp, bnorm;
-	real v_add_tmp;
-	sparse_matrix *H = &workspace->H;
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    real v_add_tmp;
+    sparse_matrix *H = &workspace->H;
 
-	real t_start, t_elapsed;
+    real t_start, t_elapsed;
 
-	real *spad = (real *)scratch;
-	real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
+    real *spad = (real *)scratch;
+    real *g = (real *) calloc ((RESTART+1), REAL_SIZE);
 
-	N = H->n;
+    N = H->n;
 
-	cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+    cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
 
-	/*
-	   Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
-	   cudaThreadSynchronize ();
-	   cudaCheckError ();
+    /*
+       Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (b, spad, H->n, INITIAL);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
 
-	   Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
-	   cudaThreadSynchronize ();
-	   cudaCheckError ();
+       Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, spad + BLOCKS_POW_2, BLOCKS_POW_2, FINAL);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
 
-	   copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-	 */
+       copy_host_device ( &bnorm, spad + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+     */
 
-	cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm ));
+    cublasCheckError (cublasDnrm2 ( cublasHandle, N, b, 1, &bnorm ));
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "Norm of the array is %e \n", bnorm );
+    fprintf (stderr, "Norm of the array is %e \n", bnorm );
 #endif
 
-	/* apply the diagonal pre-conditioner to rhs */
-	GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-		(workspace->b_prc, b, workspace->Hdia_inv, N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	/* GMRES outer-loop */
-	for( itr = 0; itr < MAX_ITR; ++itr ) {
-		/* calculate r0 */
-		//Sparse_MatVec( H, x, workspace->b_prm );      
-		Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
-			(workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		/*
-		   Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
-		   (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
-		   cudaThreadSynchronize ();
-		   cudaCheckError ();
-		 */
-		cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V);
-
-		double D_ONE = 1.;
-		double D_MINUS_ONE = -1.;
-		cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
-		cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
-
-		//workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
-		{
-			/*
-			   cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
-
-			   Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-			   (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-
-			   Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-
-			   copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
-			 */
-
-			cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g ));
-			copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-		}
-
-		/*
-		   Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-		   ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
-		   cudaThreadSynchronize ();
-		   cudaCheckError ();
-		 */
-
-		double D_SCALE = 1.0 / g[0];
-		cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
-
-
-		/* GMRES inner-loop */
+    /* apply the diagonal pre-conditioner to rhs */
+    GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+        (workspace->b_prc, b, workspace->Hdia_inv, N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        //Sparse_MatVec( H, x, workspace->b_prm );      
+        Cuda_Matvec_csr <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> ( *H, x, workspace->b_prm, N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        GMRES_Diagonal_Preconditioner <<< BLOCKS, BLOCK_SIZE >>>
+            (workspace->b_prm, workspace->b_prm, workspace->Hdia_inv, N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        /*
+           Cuda_Vector_Sum <<< BLOCKS, BLOCK_SIZE >>>
+           (&workspace->v[ index_wkspace_sys (0,0,N) ], 1.,workspace->b_prc, -1., workspace->b_prm, N);
+           cudaThreadSynchronize ();
+           cudaCheckError ();
+         */
+        cuda_memset (workspace->v, 0, REAL_SIZE * (RESTART+1) * N, RES_STORAGE_V);
+
+        double D_ONE = 1.;
+        double D_MINUS_ONE = -1.;
+        cublasCheckError (cublasDaxpy (cublasHandle, N, &D_ONE, workspace->b_prc, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
+        cublasCheckError (cublasDaxpy (cublasHandle, N, &D_MINUS_ONE, workspace->b_prm, 1, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
+
+        //workspace->g[0] = Norm( &workspace->v[index_wkspace_sys (0,0,system)], N );
+        {
+            /*
+               cuda_memset (spad, 0, REAL_SIZE * H->n * 2, RES_SCRATCH );
+
+               Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+               (&workspace->v [index_wkspace_sys (0, 0, N)], spad, N, INITIAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+
+               Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->g[0], BLOCKS_POW_2, FINAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+
+               copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyDeviceToHost, RES_STORAGE_G);
+             */
+
+            cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (0, 0, N)], 1, g ));
+            copy_host_device( g, workspace->g, REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
+        }
+
+        /*
+           Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+           ( &workspace->v[ index_wkspace_sys (0,0,N) ], 1.0/g[0], &workspace->v[index_wkspace_sys(0,0,N)], N );
+           cudaThreadSynchronize ();
+           cudaCheckError ();
+         */
+
+        double D_SCALE = 1.0 / g[0];
+        cublasCheckError (cublasDscal (cublasHandle, N, &D_SCALE, &workspace->v[ index_wkspace_sys (0,0,N) ], 1));
+
+
+        /* GMRES inner-loop */
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
+        fprintf (stderr, " Inner loop inputs bnorm : %f , tol : %f g[j] : %f \n", bnorm, tol, g[0] );
 #endif
-		for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
-			/* matvec */
-			Cuda_Matvec_csr 
-				<<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> 
-				( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
-				(&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-
-			/* apply modified Gram-Schmidt to orthogonalize the new residual */
-			for( i = 0; i <= j; i++ ) {
-
-				/*
-				   Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-				   (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
-				   cudaThreadSynchronize ();
-				   cudaCheckError ();
-
-				   Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
-				   cudaThreadSynchronize ();
-				   cudaCheckError ();
-
-				   copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-				 */
-
-				cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
-							&workspace->v[index_wkspace_sys(j+1,0,N)], 1, 
-							&v_add_tmp));
-				copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-
-				/*
-				   Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
-				   ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-				   -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-				   cudaThreadSynchronize ();
-				   cudaCheckError ();
-				 */
-
-				double NEG_V_ADD_TMP = -v_add_tmp;
-				cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
-							&workspace->v[index_wkspace_sys(j+1,0,N)], 1 ));
-			}
-
-
-			//workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-			/*
-			   cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
-
-			   Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-
-			   Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-
-			   copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			 */
-			cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp ));
-			copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-
-
-			/*
-			   Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
-			   ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
-			   1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-			 */
-			double REC_V_ADD_TMP = 1. / v_add_tmp;
-			cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP,  &workspace->v[index_wkspace_sys(j+1,0,N)], 1));
-
-
-
-			/* Givens rotations on the upper-Hessenberg matrix to make it U */
-			GMRES_Givens_Rotation <<<1, 1>>>
-				(j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-			copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-		}
-
-		copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
-
-		/* solve Hy = g.
-		   H is now upper-triangular, do back-substitution */
-		copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
-		GMRES_BackSubstitution <<<1, 1>>>
-			(j, spad, workspace->h, workspace->y);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		/* update x = x_0 + Vy */
-		for( i = 0; i < j; i++ )
-		{
-			/*
-			   copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			   Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
-			   ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-			 */
-
-			copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
-						x, 1));
-		}
-
-		/* stopping condition */
-		if( fabs(g[j]) / bnorm <= tol )
-			break;
-	}
-
-	if( itr >= MAX_ITR ) {
-		fprintf( stderr, "GMRES convergence failed\n" );
-		return itr * (RESTART+1) + j + 1;
-	}
+        for( j = 0; j < RESTART && fabs(g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            Cuda_Matvec_csr 
+                <<<MATVEC_BLOCKS, MATVEC_BLOCK_SIZE, REAL_SIZE * MATVEC_BLOCK_SIZE>>> 
+                ( *H, &workspace->v[ index_wkspace_sys (j, 0, N)], &workspace->v[ index_wkspace_sys (j+1, 0, N) ], N );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+            GMRES_Diagonal_Preconditioner <<<BLOCKS, BLOCK_SIZE>>>
+                (&workspace->v[ index_wkspace_sys (j+1,0,N) ], &workspace->v[ index_wkspace_sys (j+1,0,N) ], workspace->Hdia_inv, N);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+
+
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i <= j; i++ ) {
+
+                /*
+                   Cuda_Dot <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+                   (&workspace->v[index_wkspace_sys(i,0,N)], &workspace->v[index_wkspace_sys(j+1,0,N)], spad, N);
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+
+                   Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (i,j) ], BLOCKS_POW_2);
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+
+                   copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+                 */
+
+                cublasCheckError (cublasDdot (cublasHandle, N, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
+                            &workspace->v[index_wkspace_sys(j+1,0,N)], 1, 
+                            &v_add_tmp));
+                copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (i,j)], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+
+                /*
+                   Cuda_Vector_Add <<< BLOCKS, BLOCK_SIZE >>>
+                   ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+                   -v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+                   cudaThreadSynchronize ();
+                   cudaCheckError ();
+                 */
+
+                double NEG_V_ADD_TMP = -v_add_tmp;
+                cublasCheckError (cublasDaxpy (cublasHandle, N, &NEG_V_ADD_TMP, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
+                            &workspace->v[index_wkspace_sys(j+1,0,N)], 1 ));
+            }
+
+
+            //workspace->h[ index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+            /*
+               cuda_memset (spad, 0, REAL_SIZE * N * 2, RES_SCRATCH );
+
+               Cuda_Norm <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> (&workspace->v[index_wkspace_sys(j+1,0,N)], spad, N, INITIAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+
+               Cuda_Norm <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> (spad, &workspace->h[ index_wkspace_res (j+1,j) ], BLOCKS_POW_2, FINAL);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+
+               copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+             */
+            cublasCheckError (cublasDnrm2 ( cublasHandle, N, &workspace->v [index_wkspace_sys (j+1, 0, N)], 1, &v_add_tmp ));
+            copy_host_device (&v_add_tmp, &workspace->h[ index_wkspace_res (j+1,j) ], REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+
+
+            /*
+               Cuda_Vector_Scale <<< BLOCKS, BLOCK_SIZE >>>
+               ( &workspace->v[index_wkspace_sys(j+1,0,N)], 
+               1. / v_add_tmp, &workspace->v[index_wkspace_sys(j+1,0,N)], N );
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+             */
+            double REC_V_ADD_TMP = 1. / v_add_tmp;
+            cublasCheckError (cublasDscal (cublasHandle, N, &REC_V_ADD_TMP,  &workspace->v[index_wkspace_sys(j+1,0,N)], 1));
+
+
+
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            GMRES_Givens_Rotation <<<1, 1>>>
+                (j, workspace->h, workspace->hc, workspace->hs, g[j], spad);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            copy_host_device (&g[j], spad, 2 * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+        }
+
+        copy_host_device (g, workspace->g, (RESTART+1)*REAL_SIZE, cudaMemcpyHostToDevice, __LINE__);
+
+        /* solve Hy = g.
+           H is now upper-triangular, do back-substitution */
+        copy_host_device (g, spad, (RESTART+1) * REAL_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_G);
+        GMRES_BackSubstitution <<<1, 1>>>
+            (j, spad, workspace->h, workspace->y);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        /* update x = x_0 + Vy */
+        for( i = 0; i < j; i++ )
+        {
+            /*
+               copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+               Cuda_Vector_Add <<<BLOCKS, BLOCK_SIZE>>>
+               ( x, v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], N );
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+             */
+
+            copy_host_device (&v_add_tmp, &workspace->y[i], REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            cublasCheckError (cublasDaxpy (cublasHandle, N, &v_add_tmp, &workspace->v[index_wkspace_sys(i,0,N)], 1, 
+                        x, 1));
+        }
+
+        /* stopping condition */
+        if( fabs(g[j]) / bnorm <= tol )
+            break;
+    }
+
+    if( itr >= MAX_ITR ) {
+        fprintf( stderr, "GMRES convergence failed\n" );
+        return itr * (RESTART+1) + j + 1;
+    }
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
+    fprintf (stderr, " GPU values itr : %d, RESTART: %d, j: %d \n", itr, RESTART, j);
 #endif
-	return itr * (RESTART+1) + j + 1;
+    return itr * (RESTART+1) + j + 1;
 }
 
 int GMRES_HouseHolder( static_storage *workspace, sparse_matrix *H, 
-		real *b, real tol, real *x, FILE *fout, reax_system *system)
+        real *b, real tol, real *x, FILE *fout, reax_system *system) /* Householder-based GMRES with restarts; x is updated in place and itr * (RESTART+1) + j + 1 is returned; fout and system are not named by any active statement in this function */
 {
-	int  i, j, k, itr, N;
-	real cc, tmp1, tmp2, temp, bnorm;
-	real v[10000], z[RESTART+2][10000], w[RESTART+2];
-	real u[RESTART+2][10000];
+    int  i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+    real v[10000], z[RESTART+2][10000], w[RESTART+2]; /* NOTE(review): v, z and u are fixed 10000-wide stack arrays indexed up to N-1 below, so N = H->n must not exceed 10000 -- confirm callers guarantee this */
+    real u[RESTART+2][10000]; /* u[i]: HouseHolder unit vector number i */
 
-	N = H->n;
-	bnorm = Norm( b, N );
+    N = H->n; /* problem size comes from the matrix */
+    bnorm = Norm( b, N ); /* norm of the un-preconditioned rhs; divides |w[j]| in both stopping tests */
 
-	/* apply the diagonal pre-conditioner to rhs */
-	for( i = 0; i < N; ++i )
-		workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
+    /* apply the diagonal pre-conditioner to rhs: b_prc[i] = b[i] * Hdia_inv[i] */
+    for( i = 0; i < N; ++i )
+        workspace->b_prc[i] = b[i] * workspace->Hdia_inv[i];  
 
-	// memset( x, 0, sizeof(real) * N );
+    // memset( x, 0, sizeof(real) * N );  (disabled: x is used exactly as passed in by the first Sparse_MatVec below)
 
-	/* GMRES outer-loop */
-	for( itr = 0; itr < MAX_ITR; ++itr ) {
-		/* compute z = r0 */
-		Sparse_MatVec( H, x, workspace->b_prm );      
-		for( i = 0; i < N; ++i )
-			workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
-		Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
+    /* GMRES outer-loop: at most MAX_ITR passes */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* compute z[0] = r0 from b_prc and the pre-conditioned b_prm (presumably b_prm = H*x and z[0] = b_prc - b_prm; confirm Sparse_MatVec / Vector_Sum argument order) */
+        Sparse_MatVec( H, x, workspace->b_prm );      
+        for( i = 0; i < N; ++i )
+            workspace->b_prm[i] *= workspace->Hdia_inv[i]; /* pre-conditioner */
+        Vector_Sum( z[0], 1.,  workspace->b_prc, -1., workspace->b_prm, N );
 
-		Vector_MakeZero( w, RESTART+1 );
-		w[0] = Norm( z[0], N );
+        Vector_MakeZero( w, RESTART+1 ); /* w is declared with RESTART+2 entries; only RESTART+1 are passed here */
+        w[0] = Norm( z[0], N ); /* norm of the initial residual */
 
-		Vector_Copy( u[0], z[0], N );
-		u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0];
-		Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N );
+        Vector_Copy( u[0], z[0], N );
+        u[0][0] += ( u[0][0] < 0.0 ? -1 : 1 ) * w[0]; /* add w[0] with the sign of the leading entry (+1 when it is zero) */
+        Vector_Scale( u[0], 1 / Norm( u[0], N ), u[0], N ); /* scale u[0] by the reciprocal of its own norm */
 
-		w[0]    *= ( u[0][0] < 0.0 ?  1 :-1 );
-		// fprintf( stderr, "\n\n%12.6f\n", w[0] );
+        w[0]    *= ( u[0][0] < 0.0 ?  1 :-1 ); /* w[0] is negated unless u[0][0] is negative */
+        // fprintf( stderr, "\n\n%12.6f\n", w[0] );
 
-		/* GMRES inner-loop */
-		for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) {
-			/* compute v_j */
-			Vector_Scale( z[j], -2 * u[j][j], u[j], N );
-			z[j][j] += 1.; /* due to e_j */
+        /* GMRES inner-loop: up to RESTART steps while |w[j]| / bnorm > tol */
+        for( j = 0; j < RESTART && fabs( w[j] ) / bnorm > tol; j++ ) {
+            /* compute v_j into z[j]: start from -2 * u[j][j] * u[j] plus e_j, then update with u[j-1] .. u[0] (assumes Vector_Scale / Vector_Add take (dst, c, src, n) -- confirm) */
+            Vector_Scale( z[j], -2 * u[j][j], u[j], N );
+            z[j][j] += 1.; /* due to e_j */
 
-			for( i = j-1; i >= 0; --i )
-				Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i );
+            for( i = j-1; i >= 0; --i ) /* each update works on the sub-vectors starting at index i, of length N-i */
+                Vector_Add( z[j]+i, -2 * Dot( u[i]+i, z[j]+i, N-i ), u[i]+i, N-i );
 
 
-			/* matvec */
-			Sparse_MatVec( H, z[j], v );
+            /* matvec: H and z[j] in, v out (presumably v = H * z[j]) */
+            Sparse_MatVec( H, z[j], v );
 
-			for( k = 0; k < N; ++k )
-				v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
+            for( k = 0; k < N; ++k )
+                v[k] *= workspace->Hdia_inv[k]; /* pre-conditioner */
 
-			for( i = 0; i <= j; ++i )
-				Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i );
+            for( i = 0; i <= j; ++i ) /* update v with u[0] .. u[j], in that order */
+                Vector_Add( v+i, -2 * Dot( u[i]+i, v+i, N-i ), u[i]+i, N-i );
 
 
-			if( !Vector_isZero( v + (j+1), N - (j+1) ) ) {
-				/* compute the HouseHolder unit vector u_j+1 */
-				for( i = 0; i <= j; ++i )  
-					u[j+1][i] = 0;
+            if( !Vector_isZero( v + (j+1), N - (j+1) ) ) { /* skipped when Vector_isZero reports the tail v[j+1 .. N-1] as zero */
+                /* compute the HouseHolder unit vector u_j+1 */
+                for( i = 0; i <= j; ++i )  
+                    u[j+1][i] = 0;
 
-				Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) );
+                Vector_Copy( u[j+1] + (j+1), v + (j+1), N - (j+1) );
 
-				u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) );
+                u[j+1][j+1] += ( v[j+1]<0.0 ? -1:1 ) * Norm( v+(j+1), N-(j+1) ); /* add the tail norm with the sign of v[j+1] */
 
-				Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N );
+                Vector_Scale( u[j+1], 1 / Norm( u[j+1], N ), u[j+1], N ); /* scale u[j+1] by the reciprocal of its own norm */
 
-				/* overwrite v with P_m+1 * v */
-				v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1];
-				Vector_MakeZero( v + (j+2), N - (j+2) );
-				// Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
-			}
+                /* overwrite v with P_m+1 * v: only entry j+1 is computed, entries from j+2 on are passed to Vector_MakeZero */
+                v[j+1] -= 2 * Dot( u[j+1]+(j+1), v+(j+1), N-(j+1) ) * u[j+1][j+1];
+                Vector_MakeZero( v + (j+2), N - (j+2) );
+                // Vector_Add( v, -2 * Dot( u[j+1], v, N ), u[j+1], N );
+            }
 
 
-			/* prev Givens rots on the upper-Hessenberg matrix to make it U */
-			for( i = 0; i < j; i++ ) {
-				tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1];
-				tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1];
+            /* apply the previous Givens rotations (hc[i], hs[i] for i < j) to v, on the upper-Hessenberg matrix to make it U */
+            for( i = 0; i < j; i++ ) {
+                tmp1 =  workspace->hc[i] * v[i] + workspace->hs[i] * v[i+1];
+                tmp2 = -workspace->hs[i] * v[i] + workspace->hc[i] * v[i+1];
 
-				v[i]   = tmp1;
-				v[i+1] = tmp2;
-			}
+                v[i]   = tmp1;
+                v[i+1] = tmp2;
+            }
 
-			/* apply the new Givens rotation to H and right-hand side */
-			if( fabs(v[j+1]) >= ALMOST_ZERO )	{
-				cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) );
-				workspace->hc[j] = v[j] / cc;
-				workspace->hs[j] = v[j+1] / cc;
+            /* apply the new Givens rotation to H and right-hand side; when |v[j+1]| < ALMOST_ZERO nothing in this branch runs, so hc[j], hs[j], w[j] and w[j+1] are left as they were */
+            if( fabs(v[j+1]) >= ALMOST_ZERO )    {
+                cc = SQRT( SQR( v[j] ) + SQR( v[j+1] ) );
+                workspace->hc[j] = v[j] / cc;
+                workspace->hs[j] = v[j+1] / cc;
 
-				tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1];
-				tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1];
+                tmp1 =  workspace->hc[j] * v[j] + workspace->hs[j] * v[j+1];
+                tmp2 = -workspace->hs[j] * v[j] + workspace->hc[j] * v[j+1];
 
-				v[j]   = tmp1;
-				v[j+1] = tmp2;
+                v[j]   = tmp1;
+                v[j+1] = tmp2;
 
-				/* Givens rotations to rhs */
-				tmp1 =  workspace->hc[j] * w[j];
-				tmp2 = -workspace->hs[j] * w[j];
-				w[j]   = tmp1;
-				w[j+1] = tmp2;
-			}
+                /* Givens rotations to rhs: w[j] and w[j+1] are both derived from the old w[j] */
+                tmp1 =  workspace->hc[j] * w[j];
+                tmp2 = -workspace->hs[j] * w[j];
+                w[j]   = tmp1;
+                w[j+1] = tmp2;
+            }
 
-			/* extend R */
-			for( i = 0; i <= j; ++i )
-				workspace->h[ index_wkspace_res (i,j) ] = v[i];
+            /* extend R: store v[0 .. j] at workspace->h[ index_wkspace_res (i,j) ] */
+            for( i = 0; i <= j; ++i )
+                workspace->h[ index_wkspace_res (i,j) ] = v[i];
 
 
-			// fprintf( stderr, "h:" );
-			// for( i = 0; i <= j+1 ; ++i )
-			// fprintf( stderr, "%.6f ", h[i][j] );
-			// fprintf( stderr, "\n" );
-			// fprintf( stderr, "%12.6f\n", w[j+1] );
-		}
+            // fprintf( stderr, "h:" );
+            // for( i = 0; i <= j+1 ; ++i )
+            // fprintf( stderr, "%.6f ", h[i][j] );
+            // fprintf( stderr, "\n" );
+            // fprintf( stderr, "%12.6f\n", w[j+1] );
+        }
 
 
-		/* solve Hy = w.
-		   H is now upper-triangular, do back-substitution */
-		for( i = j-1; i >= 0; i-- ) {
-			temp = w[i];	  
-			for( k = j-1; k > i; k-- )
-				temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+        /* solve Hy = w.
+           H is now upper-triangular, do back-substitution over rows j-1 down to 0 */
+        for( i = j-1; i >= 0; i-- ) {
+            temp = w[i];      
+            for( k = j-1; k > i; k-- )
+                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
 
-			workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ];
-		}
+            workspace->y[i] = temp / workspace->h[ index_wkspace_res (i,i) ]; /* NOTE(review): divides by the stored diagonal entry with no zero check */
+        }
 
-		// fprintf( stderr, "y: " );
-		// for( i = 0; i < RESTART+1; ++i )
-		//   fprintf( stderr, "%8.3f ", workspace->y[i] );
+        // fprintf( stderr, "y: " );
+        // for( i = 0; i < RESTART+1; ++i )
+        //   fprintf( stderr, "%8.3f ", workspace->y[i] );
 
 
-		/* update x = x_0 + Vy */
-		// memset( z, 0, sizeof(real) * N );
-		// for( i = j-1; i >= 0; i-- )
-		//   {
-		//     Vector_Copy( v, z, N );
-		//     v[i] += workspace->y[i];
-		//    
-		//     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
-		//   }      
-		//
-		// fprintf( stderr, "\nz: " );
-		// for( k = 0; k < N; ++k )
-		// fprintf( stderr, "%6.2f ", z[k] );
+        /* update x = x_0 + Vy; the active code below uses z[0 .. j-1] as the columns of V */
+        // memset( z, 0, sizeof(real) * N );
+        // for( i = j-1; i >= 0; i-- )
+        //   {
+        //     Vector_Copy( v, z, N );
+        //     v[i] += workspace->y[i];
+        //    
+        //     Vector_Sum( z, 1., v, -2 * Dot( u[i], v, N ), u[i], N );
+        //   }      
+        //
+        // fprintf( stderr, "\nz: " );
+        // for( k = 0; k < N; ++k )
+        // fprintf( stderr, "%6.2f ", z[k] );
 
-		// fprintf( stderr, "\nx_bef: " );
-		// for( i = 0; i < N; ++i )
-		//   fprintf( stderr, "%6.2f ", x[i] );
+        // fprintf( stderr, "\nx_bef: " );
+        // for( i = 0; i < N; ++i )
+        //   fprintf( stderr, "%6.2f ", x[i] );
 
-		// Vector_Add( x, 1, z, N );
-		for( i = j-1; i >= 0; i-- )
-			Vector_Add( x, workspace->y[i], z[i], N );
+        // Vector_Add( x, 1, z, N );
+        for( i = j-1; i >= 0; i-- )
+            Vector_Add( x, workspace->y[i], z[i], N );
 
-		// fprintf( stderr, "\nx_aft: " );
-		// for( i = 0; i < N; ++i )
-		//   fprintf( stderr, "%6.2f ", x[i] );
-
-		/* stopping condition */
-		if( fabs( w[j] ) / bnorm <= tol )
-			break;
-	}
-
-	// Sparse_MatVec( H, x, workspace->b_prm );
-	// for( i = 0; i < N; ++i )
-	// workspace->b_prm[i] *= workspace->Hdia_inv[i];
-
-	// fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-	// for( i = 0; i < N; ++i )
-	// fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-	// workspace->b_prc[i], workspace->b_prm[i], x[i] );
-
-	//fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", 
-	//	     itr, j, fabs( workspace->g[j] ) / bnorm );
-
-	if( itr >= MAX_ITR ) {
-		fprintf( stderr, "GMRES convergence failed\n" );
-		// return -1;
-		return itr * (RESTART+1) + j + 1;
-	}
-
-	return itr * (RESTART+1) + j + 1;
+        // fprintf( stderr, "\nx_aft: " );
+        // for( i = 0; i < N; ++i )
+        //   fprintf( stderr, "%6.2f ", x[i] );
+
+        /* stopping condition: leave the outer loop once |w[j]| / bnorm <= tol */
+        if( fabs( w[j] ) / bnorm <= tol )
+            break;
+    }
+
+    // Sparse_MatVec( H, x, workspace->b_prm );
+    // for( i = 0; i < N; ++i )
+    // workspace->b_prm[i] *= workspace->Hdia_inv[i];
+
+    // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+    // for( i = 0; i < N; ++i )
+    // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+    // workspace->b_prc[i], workspace->b_prm[i], x[i] );
+
+    //fprintf( fout,"GMRES outer:%d, inner:%d iters - residual norm: %15.10f\n", 
+    //         itr, j, fabs( workspace->g[j] ) / bnorm );
+
+    if( itr >= MAX_ITR ) { /* the outer loop ended without taking the break above */
+        fprintf( stderr, "GMRES convergence failed\n" );
+        // return -1;
+        return itr * (RESTART+1) + j + 1; /* failure is only reported on stderr; the returned expression is the same as on success */
+    }
+
+    return itr * (RESTART+1) + j + 1; /* NOTE(review): combines the outer and inner counters -- presumably a work/iteration estimate; confirm what callers expect */
 }
 
 
 int PGMRES( static_storage *workspace, sparse_matrix *H, real *b, real tol, 
-		sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system )
+        sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system *system )
 {
-	int i, j, k, itr, N;
-	real cc, tmp1, tmp2, temp, bnorm;
-
-	N = H->n;
-	bnorm = Norm( b, N );
-
-	/* GMRES outer-loop */
-	for( itr = 0; itr < MAX_ITR; ++itr ) {
-		/* calculate r0 */
-		Sparse_MatVec( H, x, workspace->b_prm );      
-		Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N );
-		Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] );
-		Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] );
-		workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system)], N );
-		Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system)], N );
-		//fprintf( stderr, "res: %.15e\n", workspace->g[0] );
-
-		/* GMRES inner-loop */
-		for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
-			/* matvec */
-			Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] );
-			Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-			Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
-
-			/* apply modified Gram-Schmidt to orthogonalize the new residual */
-			for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0;
-
-			//for( i = 0; i <= j; i++ ) {
-			for( i = MAX(j-1,0); i <= j; i++ ) {
-				workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-				Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N );
-			}
-
-			workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system)], N );
-			Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], 
-					1. / workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
-			// fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
-
-			/* Givens rotations on the upper-Hessenberg matrix to make it U */
-			for( i = MAX(j-1,0); i <= j; i++ )	{
-				if( i == j ) {
-					cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
-					workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
-					workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
-				}
-
-				tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
-					workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ];
-				tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + 
-					workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ];
-
-				workspace->h[ index_wkspace_res (i,j) ] = tmp1;
-				workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
-			} 
-
-			/* apply Givens rotations to the rhs as well */
-			tmp1 =  workspace->hc[j] * workspace->g[j];
-			tmp2 = -workspace->hs[j] * workspace->g[j];
-			workspace->g[j] = tmp1;
-			workspace->g[j+1] = tmp2;
-
-			//fprintf( stderr, "h: " );
-			//for( i = 0; i <= j+1; ++i )
-			//fprintf( stderr, "%.6f ", workspace->h[i][j] );
-			//fprintf( stderr, "\n" );
-			//fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
-		}
-
-
-		/* solve Hy = g: H is now upper-triangular, do back-substitution */
-		for( i = j-1; i >= 0; i-- ) {
-			temp = workspace->g[i];	  
-			for( k = j-1; k > i; k-- )
-				temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
-
-			workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
-		}
-
-		/* update x = x_0 + Vy */
-		Vector_MakeZero( workspace->p, N );
-		for( i = 0; i < j; i++ )
-			Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N );
-		//Backward_Subs( U, workspace->p, workspace->p );
-		//Forward_Subs( L, workspace->p, workspace->p );
-		Vector_Add( x, 1., workspace->p, N );
-
-		/* stopping condition */
-		if( fabs(workspace->g[j]) / bnorm <= tol )
-			break;
-		}
-
-		// Sparse_MatVec( H, x, workspace->b_prm );
-		// for( i = 0; i < N; ++i )
-		// workspace->b_prm[i] *= workspace->Hdia_inv[i];    
-		// fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
-		// for( i = 0; i < N; ++i )
-		// fprintf( fout, "%10.5f%15.12f%15.12f\n", 
-		// workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
-
-		// fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
-		//	      itr, j, fabs( workspace->g[j] ) / bnorm );
-		// data->timing.matvec += itr * RESTART + j;
-
-		if( itr >= MAX_ITR ) {
-			fprintf( stderr, "GMRES convergence failed\n" );
-			// return -1;
-			return itr * (RESTART+1) + j + 1;
-		}
-
-		return itr * (RESTART+1) + j + 1;
-	}
-
-
-
-	int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
-			sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
-	{
-		int  i, N;
-		real tmp, alpha, beta, b_norm, r_norm;
-		real sig0, sig_old, sig_new;
-
-		N = A->n;
-		b_norm = Norm( b, N );
-		//fprintf( stderr, "b_norm: %.15e\n", b_norm );
-
-		Sparse_MatVec( A, x, workspace->q );
-		Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-		r_norm = Norm(workspace->r, N);
-		//Print_Soln( workspace, x, q, b, N );
-		//fprintf( stderr, "res: %.15e\n", r_norm );
-
-		Forward_Subs( L, workspace->r, workspace->d );
-		Backward_Subs( U, workspace->d, workspace->p );
-		sig_new = Dot( workspace->r, workspace->p, N );
-		sig0 = sig_new;
-
-		for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) {
-			//for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
-			Sparse_MatVec( A, workspace->p, workspace->q );
-			tmp = Dot( workspace->q, workspace->p, N );
-			alpha = sig_new / tmp;
-			Vector_Add( x, alpha, workspace->p, N );
-			//fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
-			//     i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp );
-
-			Vector_Add( workspace->r, -alpha, workspace->q, N );
-			r_norm = Norm(workspace->r, N);
-			//fprintf( stderr, "res: %.15e\n", r_norm );
-
-			Forward_Subs( L, workspace->r, workspace->d );
-			Backward_Subs( U, workspace->d, workspace->d );
-			sig_old = sig_new;
-			sig_new = Dot( workspace->r, workspace->d, N );
-			beta = sig_new / sig_old;
-			Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N );
-		}
-
-		//fprintf( fout, "CG took %d iterations\n", i );
-		if( i >= 200 ) {
-			fprintf( stderr, "CG convergence failed!\n" );
-			return i;
-		}
-
-		return i;
-		}
-
-
-		int CG( static_storage *workspace, sparse_matrix *H, 
-				real *b, real tol, real *x, FILE *fout, reax_system *system)
-		{
-			int  i, j, N;
-			real tmp, alpha, beta, b_norm;
-			real sig_old, sig_new, sig0;
-
-			N = H->n;
-			b_norm = Norm( b, N );
-			//fprintf( stderr, "b_norm: %10.6f\n", b_norm );
-
-			Sparse_MatVec( H, x, workspace->q );
-			Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-			for( j = 0; j < N; ++j )
-				workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-			sig_new = Dot( workspace->r, workspace->d, N );
-			sig0 = sig_new;
-			//Print_Soln( workspace, x, q, b, N );
-			//fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
-			// sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
-			//fprintf( stderr, "sig_new: %f\n", sig_new );
-
-			for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
-				//for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
-				Sparse_MatVec( H, workspace->d, workspace->q );
-				tmp = Dot( workspace->d, workspace->q, N );
-				//fprintf( stderr, "tmp: %f\n", tmp );
-				alpha = sig_new / tmp;    
-				Vector_Add( x, alpha, workspace->d, N );
-				//fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-				//     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-
-				Vector_Add( workspace->r, -alpha, workspace->q, N );    
-				for( j = 0; j < N; ++j )
-					workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-				sig_old = sig_new;
-				sig_new = Dot( workspace->r, workspace->p, N );
-				beta = sig_new / sig_old;
-				Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N );
-				//fprintf( stderr, "sig_new: %f\n", sig_new );
-			}
-
-			fprintf( stderr, "CG took %d iterations\n", i );
-
-			if( i >= 300 ) {
-				fprintf( stderr, "CG convergence failed!\n" );
-				return i;
-			}
-
-			return i;
-			}
-
-
-
-			/* Steepest Descent */
-			int SDM( static_storage *workspace, sparse_matrix *H, 
-					real *b, real tol, real *x, FILE *fout )
-			{
-				int  i, j, N;
-				real tmp, alpha, beta, b_norm;
-				real sig0, sig;
-
-				N = H->n;
-				b_norm = Norm( b, N );
-				//fprintf( stderr, "b_norm: %10.6f\n", b_norm );
-
-				Sparse_MatVec( H, x, workspace->q );
-				Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
-				for( j = 0; j < N; ++j )
-					workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-				sig = Dot( workspace->r, workspace->d, N );
-				sig0 = sig;
-
-				for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
-					Sparse_MatVec( H, workspace->d, workspace->q );
-
-					sig = Dot( workspace->r, workspace->d, N );
-					tmp = Dot( workspace->d, workspace->q, N );
-					alpha = sig / tmp;    
-
-					Vector_Add( x, alpha, workspace->d, N );
-					Vector_Add( workspace->r, -alpha, workspace->q, N );
-					for( j = 0; j < N; ++j )
-						workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
-
-					//fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
-					//     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
-				}
-
-				fprintf( stderr, "SDM took %d iterations\n", i );
-
-				if( i >= 300 ) {
-					fprintf( stderr, "SDM convergence failed!\n" );
-					return i;
-				}
-
-				return i;
-			}
+    int i, j, k, itr, N;
+    real cc, tmp1, tmp2, temp, bnorm;
+
+    N = H->n;
+    bnorm = Norm( b, N );
+
+    /* GMRES outer-loop */
+    for( itr = 0; itr < MAX_ITR; ++itr ) {
+        /* calculate r0 */
+        Sparse_MatVec( H, x, workspace->b_prm );      
+        Vector_Sum( &workspace->v[index_wkspace_sys(0,0,system)], 1., b, -1., workspace->b_prm, N );
+        Forward_Subs( L, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] );
+        Backward_Subs( U, &workspace->v[index_wkspace_sys(0,0,system)], &workspace->v[index_wkspace_sys(0,0,system)] );
+        workspace->g[0] = Norm( &workspace->v[index_wkspace_sys(0,0,system)], N );
+        Vector_Scale( &workspace->v[index_wkspace_sys(0,0,system)], 1. / workspace->g[0], &workspace->v[index_wkspace_sys (0,0,system)], N );
+        //fprintf( stderr, "res: %.15e\n", workspace->g[0] );
+
+        /* GMRES inner-loop */
+        for( j = 0; j < RESTART && fabs(workspace->g[j]) / bnorm > tol; j++ ) {
+            /* matvec */
+            Sparse_MatVec( H, &workspace->v[index_wkspace_sys (j,0,system)], &workspace->v[index_wkspace_sys (j+1,0,system)] );
+            Forward_Subs( L, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
+            Backward_Subs( U, &workspace->v[index_wkspace_sys(j+1,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)] );
+
+            /* apply modified Gram-Schmidt to orthogonalize the new residual */
+            for( i = 0; i < j-1; i++ ) workspace->h[ index_wkspace_res (i,j)] = 0;
+
+            //for( i = 0; i <= j; i++ ) {
+            for( i = MAX(j-1,0); i <= j; i++ ) {
+                workspace->h[index_wkspace_res (i,j)] = Dot( &workspace->v[index_wkspace_sys (i,0,system)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+                Vector_Add( &workspace->v[index_wkspace_sys(j+1,0,system)],-workspace->h[ index_wkspace_res (i,j) ], &workspace->v[index_wkspace_sys(i,0,system)], N );
+            }
+
+            workspace->h[index_wkspace_res (j+1,j) ] = Norm( &workspace->v[index_wkspace_sys (j+1,0,system)], N );
+            Vector_Scale( &workspace->v[index_wkspace_sys(j+1,0,system)], 
+                    1. / workspace->h[ index_wkspace_res (j+1,j)], &workspace->v[index_wkspace_sys(j+1,0,system)], N );
+            // fprintf( stderr, "%d-%d: orthogonalization completed.\n", itr, j );
+
+            /* Givens rotations on the upper-Hessenberg matrix to make it U */
+            for( i = MAX(j-1,0); i <= j; i++ )    {
+                if( i == j ) {
+                    cc = SQRT( SQR(workspace->h[ index_wkspace_res (j,j) ])+SQR(workspace->h[ index_wkspace_res (j+1,j) ]) );
+                    workspace->hc[j] = workspace->h[ index_wkspace_res (j,j) ] / cc;
+                    workspace->hs[j] = workspace->h[ index_wkspace_res (j+1,j) ] / cc;
+                }
+
+                tmp1 =  workspace->hc[i] * workspace->h[ index_wkspace_res (i,j) ] + 
+                    workspace->hs[i] * workspace->h[index_wkspace_res (i+1,j) ];
+                tmp2 = -workspace->hs[i] * workspace->h[index_wkspace_res (i,j)] + 
+                    workspace->hc[i] * workspace->h[index_wkspace_res (i+1,j) ];
+
+                workspace->h[ index_wkspace_res (i,j) ] = tmp1;
+                workspace->h[ index_wkspace_res (i+1,j) ] = tmp2;
+            } 
+
+            /* apply Givens rotations to the rhs as well */
+            tmp1 =  workspace->hc[j] * workspace->g[j];
+            tmp2 = -workspace->hs[j] * workspace->g[j];
+            workspace->g[j] = tmp1;
+            workspace->g[j+1] = tmp2;
+
+            //fprintf( stderr, "h: " );
+            //for( i = 0; i <= j+1; ++i )
+            //fprintf( stderr, "%.6f ", workspace->h[i][j] );
+            //fprintf( stderr, "\n" );
+            //fprintf( stderr, "res: %.15e\n", workspace->g[j+1] );
+        }
+
+
+        /* solve Hy = g: H is now upper-triangular, do back-substitution */
+        for( i = j-1; i >= 0; i-- ) {
+            temp = workspace->g[i];      
+            for( k = j-1; k > i; k-- )
+                temp -= workspace->h[ index_wkspace_res (i,k) ] * workspace->y[k];
+
+            workspace->y[i] = temp / workspace->h[index_wkspace_res (i,i)];
+        }
+
+        /* update x = x_0 + Vy */
+        Vector_MakeZero( workspace->p, N );
+        for( i = 0; i < j; i++ )
+            Vector_Add( workspace->p, workspace->y[i], &workspace->v[index_wkspace_sys(i,0,system)], N );
+        //Backward_Subs( U, workspace->p, workspace->p );
+        //Forward_Subs( L, workspace->p, workspace->p );
+        Vector_Add( x, 1., workspace->p, N );
+
+        /* stopping condition */
+        if( fabs(workspace->g[j]) / bnorm <= tol )
+            break;
+        }
+
+        // Sparse_MatVec( H, x, workspace->b_prm );
+        // for( i = 0; i < N; ++i )
+        // workspace->b_prm[i] *= workspace->Hdia_inv[i];    
+        // fprintf( fout, "\n%10s%15s%15s\n", "b_prc", "b_prm", "x" );
+        // for( i = 0; i < N; ++i )
+        // fprintf( fout, "%10.5f%15.12f%15.12f\n", 
+        // workspace->b_prc[i], workspace->b_prm[i], x[i] );*/
+
+        // fprintf(fout,"GMRES outer:%d, inner:%d iters - residual norm: %25.20f\n", 
+        //          itr, j, fabs( workspace->g[j] ) / bnorm );
+        // data->timing.matvec += itr * RESTART + j;
+
+        if( itr >= MAX_ITR ) {
+            fprintf( stderr, "GMRES convergence failed\n" );
+            // return -1;
+            return itr * (RESTART+1) + j + 1;
+        }
+
+        return itr * (RESTART+1) + j + 1;
+    }
+
+
+
+    int PCG( static_storage *workspace, sparse_matrix *A, real *b, real tol, 
+            sparse_matrix *L, sparse_matrix *U, real *x, FILE *fout, reax_system* system )
+    {
+        int  i, N;
+        real tmp, alpha, beta, b_norm, r_norm;
+        real sig0, sig_old, sig_new;
+
+        N = A->n;
+        b_norm = Norm( b, N );
+        //fprintf( stderr, "b_norm: %.15e\n", b_norm );
+
+        Sparse_MatVec( A, x, workspace->q );
+        Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+        r_norm = Norm(workspace->r, N);
+        //Print_Soln( workspace, x, q, b, N );
+        //fprintf( stderr, "res: %.15e\n", r_norm );
+
+        Forward_Subs( L, workspace->r, workspace->d );
+        Backward_Subs( U, workspace->d, workspace->p );
+        sig_new = Dot( workspace->r, workspace->p, N );
+        sig0 = sig_new;
+
+        for( i = 0; i < 200 && r_norm/b_norm > tol; ++i ) {
+            //for( i = 0; i < 200 && sig_new > SQR(tol) * sig0; ++i ) {
+            Sparse_MatVec( A, workspace->p, workspace->q );
+            tmp = Dot( workspace->q, workspace->p, N );
+            alpha = sig_new / tmp;
+            Vector_Add( x, alpha, workspace->p, N );
+            //fprintf( stderr, "iter%d: |p|=%.15e |q|=%.15e tmp=%.15e\n",
+            //     i+1, Norm(workspace->p,N), Norm(workspace->q,N), tmp );
+
+            Vector_Add( workspace->r, -alpha, workspace->q, N );
+            r_norm = Norm(workspace->r, N);
+            //fprintf( stderr, "res: %.15e\n", r_norm );
+
+            Forward_Subs( L, workspace->r, workspace->d );
+            Backward_Subs( U, workspace->d, workspace->d );
+            sig_old = sig_new;
+            sig_new = Dot( workspace->r, workspace->d, N );
+            beta = sig_new / sig_old;
+            Vector_Sum( workspace->p, 1., workspace->d, beta, workspace->p, N );
+        }
+
+        //fprintf( fout, "CG took %d iterations\n", i );
+        if( i >= 200 ) {
+            fprintf( stderr, "CG convergence failed!\n" );
+            return i;
+        }
+
+        return i;
+        }
+
+
+        int CG( static_storage *workspace, sparse_matrix *H, 
+                real *b, real tol, real *x, FILE *fout, reax_system *system)
+        {
+            int  i, j, N;
+            real tmp, alpha, beta, b_norm;
+            real sig_old, sig_new, sig0;
+
+            N = H->n;
+            b_norm = Norm( b, N );
+            //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
+
+            Sparse_MatVec( H, x, workspace->q );
+            Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+            for( j = 0; j < N; ++j )
+                workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
+            sig_new = Dot( workspace->r, workspace->d, N );
+            sig0 = sig_new;
+            //Print_Soln( workspace, x, q, b, N );
+            //fprintf( stderr, "sig_new: %24.15e, d_norm:%24.15e, q_norm:%24.15e\n", 
+            // sqrt(sig_new), Norm(workspace->d,N), Norm(workspace->q,N) );
+            //fprintf( stderr, "sig_new: %f\n", sig_new );
+
+            for( i = 0; i < 300 && SQRT(sig_new) / b_norm > tol; ++i ) {
+                //for( i = 0; i < 300 && sig_new > SQR(tol)*sig0; ++i ) {
+                Sparse_MatVec( H, workspace->d, workspace->q );
+                tmp = Dot( workspace->d, workspace->q, N );
+                //fprintf( stderr, "tmp: %f\n", tmp );
+                alpha = sig_new / tmp;    
+                Vector_Add( x, alpha, workspace->d, N );
+                //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
+                //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
+
+                Vector_Add( workspace->r, -alpha, workspace->q, N );    
+                for( j = 0; j < N; ++j )
+                    workspace->p[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
+                sig_old = sig_new;
+                sig_new = Dot( workspace->r, workspace->p, N );
+                beta = sig_new / sig_old;
+                Vector_Sum( workspace->d, 1., workspace->p, beta, workspace->d, N );
+                //fprintf( stderr, "sig_new: %f\n", sig_new );
+            }
+
+            fprintf( stderr, "CG took %d iterations\n", i );
+
+            if( i >= 300 ) {
+                fprintf( stderr, "CG convergence failed!\n" );
+                return i;
+            }
+
+            return i;
+            }
+
+
+
+            /* Steepest Descent */
+            int SDM( static_storage *workspace, sparse_matrix *H, 
+                    real *b, real tol, real *x, FILE *fout )
+            {
+                int  i, j, N;
+                real tmp, alpha, beta, b_norm;
+                real sig0, sig;
+
+                N = H->n;
+                b_norm = Norm( b, N );
+                //fprintf( stderr, "b_norm: %10.6f\n", b_norm );
+
+                Sparse_MatVec( H, x, workspace->q );
+                Vector_Sum( workspace->r , 1.,  b, -1., workspace->q, N );
+                for( j = 0; j < N; ++j )
+                    workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
+                sig = Dot( workspace->r, workspace->d, N );
+                sig0 = sig;
+
+                for( i = 0; i < 300 && SQRT(sig) / b_norm > tol; ++i ) {
+                    Sparse_MatVec( H, workspace->d, workspace->q );
+
+                    sig = Dot( workspace->r, workspace->d, N );
+                    tmp = Dot( workspace->d, workspace->q, N );
+                    alpha = sig / tmp;    
+
+                    Vector_Add( x, alpha, workspace->d, N );
+                    Vector_Add( workspace->r, -alpha, workspace->q, N );
+                    for( j = 0; j < N; ++j )
+                        workspace->d[j] = workspace->r[j] * workspace->Hdia_inv[j];
+
+                    //fprintf( stderr, "d_norm:%24.15e, q_norm:%24.15e, tmp:%24.15e\n",
+                    //     Norm(workspace->d,N), Norm(workspace->q,N), tmp );
+                }
+
+                fprintf( stderr, "SDM took %d iterations\n", i );
+
+                if( i >= 300 ) {
+                    fprintf( stderr, "SDM convergence failed!\n" );
+                    return i;
+                }
+
+                return i;
+            }
 
diff --git a/PuReMD-GPU/src/QEq.cu b/PuReMD-GPU/src/QEq.cu
index 03f3fe74..5d849b26 100644
--- a/PuReMD-GPU/src/QEq.cu
+++ b/PuReMD-GPU/src/QEq.cu
@@ -36,416 +36,416 @@
 
 HOST_DEVICE void swap(sparse_matrix_entry *array, int index1, int index2) 
 {
-	sparse_matrix_entry temp = array[index1];
-	array[index1] = array[index2];
-	array[index2] = temp;
+    sparse_matrix_entry temp = array[index1];
+    array[index1] = array[index2];
+    array[index2] = temp;
 }
 
 HOST_DEVICE void quick_sort(sparse_matrix_entry *array, int start, int end)
 {
-	int i = start;
-	int k = end; 
-
-	if (end - start >= 1)  
-	{  
-		int pivot = array[start].j;
-
-		while (k > i) 
-		{  
-			while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++;
-			while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--;
-			if (k > i) swap(array, i, k);
-		}  
-		swap(array, start, k);
-		quick_sort(array, start, k - 1);
-		quick_sort(array, k + 1, end);
-	}  
+    int i = start;
+    int k = end; 
+
+    if (end - start >= 1)  
+    {  
+        int pivot = array[start].j;
+
+        while (k > i) 
+        {  
+            while ((array[i].j <= pivot) && (i <= end) && (k > i)) i++;
+            while ((array[k].j > pivot) && (k >= start) && (k >= i)) k--;
+            if (k > i) swap(array, i, k);
+        }  
+        swap(array, start, k);
+        quick_sort(array, start, k - 1);
+        quick_sort(array, k + 1, end);
+    }  
 }
 
 int compare_matrix_entry(const void *v1, const void *v2)
 {
-	return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
+    return ((sparse_matrix_entry *)v1)->j - ((sparse_matrix_entry *)v2)->j;
 }
 
 
 void Sort_Matrix_Rows( sparse_matrix *A )
 {
-	int i, si, ei;
-
-	for( i = 0; i < A->n; ++i ) {
-		si = A->start[i];
-		ei = A->start[i+1];
-		qsort( &(A->entries[si]), ei - si, 
-				sizeof(sparse_matrix_entry), compare_matrix_entry );
-	}
+    int i, si, ei;
+
+    for( i = 0; i < A->n; ++i ) {
+        si = A->start[i];
+        ei = A->start[i+1];
+        qsort( &(A->entries[si]), ei - si, 
+                sizeof(sparse_matrix_entry), compare_matrix_entry );
+    }
 }
 
 GLOBAL void Cuda_Sort_Matrix_Rows ( sparse_matrix A )
 {
-	int i;
-	int si, ei;
+    int i;
+    int si, ei;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if ( i >= A.n ) return;
+    if ( i >= A.n ) return;
 
-	si = A.start[i];
-	ei = A.end [i];
+    si = A.start[i];
+    ei = A.end [i];
 
-	quick_sort( A.entries + si, 0, ei-si-1 );
+    quick_sort( A.entries + si, 0, ei-si-1 );
 }
 
 
 void Calculate_Droptol( sparse_matrix *A, real *droptol, real dtol )
 {
-	int i, j, k;
-	real val;
-
-	/* init droptol to 0 */
-	for( i = 0; i < A->n; ++i )
-		droptol[i] = 0;
-
-	/* calculate sqaure of the norm of each row */
-	for( i = 0; i < A->n; ++i ) {
-		for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
-			j = A->entries[k].j;
-			val = A->entries[k].val;
-
-			droptol[i] += val*val;
-			droptol[j] += val*val;
-		}
-
-		val = A->entries[k].val; // diagonal entry
-		droptol[i] += val*val;
-	}
-
-	/* calculate local droptol for each row */
-	//fprintf( stderr, "droptol: " );
-	for( i = 0; i < A->n; ++i ) {
-		//fprintf( stderr, "%f-->", droptol[i] );
-		droptol[i] = SQRT( droptol[i] ) * dtol;
-		//fprintf( stderr, "%f  ", droptol[i] );
-	}
-	//fprintf( stderr, "\n" );
+    int i, j, k;
+    real val;
+
+    /* init droptol to 0 */
+    for( i = 0; i < A->n; ++i )
+        droptol[i] = 0;
+
+    /* calculate square of the norm of each row */
+    for( i = 0; i < A->n; ++i ) {
+        for( k = A->start[i]; k < A->start[i+1]-1; ++k ) {
+            j = A->entries[k].j;
+            val = A->entries[k].val;
+
+            droptol[i] += val*val;
+            droptol[j] += val*val;
+        }
+
+        val = A->entries[k].val; // diagonal entry
+        droptol[i] += val*val;
+    }
+
+    /* calculate local droptol for each row */
+    //fprintf( stderr, "droptol: " );
+    for( i = 0; i < A->n; ++i ) {
+        //fprintf( stderr, "%f-->", droptol[i] );
+        droptol[i] = SQRT( droptol[i] ) * dtol;
+        //fprintf( stderr, "%f  ", droptol[i] );
+    }
+    //fprintf( stderr, "\n" );
 }
 
 GLOBAL void Cuda_Calculate_Droptol ( sparse_matrix p_A, real *droptol, real dtol )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	int k, j, offset, x, diagnol;
-	real val;
-	sparse_matrix *A = &p_A;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, j, offset, x, diagnol;
+    real val;
+    sparse_matrix *A = &p_A;
 
-	if ( i < A->n ) {
-		droptol [i] = 0;
+    if ( i < A->n ) {
+        droptol [i] = 0;
 
-		for (k = A->start[i]; k < A->end[i]; ++k ) {
-			val = A->entries[k].val;
-			droptol [i] += val*val;
-		}
-	}
+        for (k = A->start[i]; k < A->end[i]; ++k ) {
+            val = A->entries[k].val;
+            droptol [i] += val*val;
+        }
+    }
 
-	__syncthreads ();
-	if ( i < A->n ) {
-		droptol [i] = SQRT (droptol[i]) * dtol;
-	}
+    __syncthreads ();
+    if ( i < A->n ) {
+        droptol [i] = SQRT (droptol[i]) * dtol;
+    }
 
 }
 
 GLOBAL void Cuda_Calculate_Droptol_js ( sparse_matrix p_A, real *droptol, real dtol )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	int k, j, offset, x, diagnol;
-	real val;
-	sparse_matrix *A = &p_A;
-
-	for (x = 0; x < A->n; x ++)
-	{
-		if (i < (A->end[i]-1 - A->start[i])) {
-			offset = A->start [i] + i;
-			j = A->entries[offset].j;
-			val = A->entries[offset].val;
-			droptol [j] += val * val;
-		}
-		__syncthreads ();
-	}
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, j, offset, x, diagnol;
+    real val;
+    sparse_matrix *A = &p_A;
+
+    for (x = 0; x < A->n; x ++)
+    {
+        if (i < (A->end[i]-1 - A->start[i])) {
+            offset = A->start [i] + i;
+            j = A->entries[offset].j;
+            val = A->entries[offset].val;
+            droptol [j] += val * val;
+        }
+        __syncthreads ();
+    }
 }
 
 GLOBAL void Cuda_Calculate_Droptol_diagnol ( sparse_matrix p_A, real *droptol, real dtol )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	int k, j, offset, x, diagnol;
-	real val;
-	sparse_matrix *A = &p_A;
-
-	if ( i < A->n ) {
-		//diagnol element
-		diagnol = A->end[i]-1;
-		val = A->entries [diagnol].val;
-		droptol [i] += val*val;
-	}
-
-	/*calculate local droptol for each row*/
-	if ( i < A->n )
-		droptol [i] = SQRT (droptol[i]) * dtol;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int k, j, offset, x, diagnol;
+    real val;
+    sparse_matrix *A = &p_A;
+
+    if ( i < A->n ) {
+        //diagonal element
+        diagnol = A->end[i]-1;
+        val = A->entries [diagnol].val;
+        droptol [i] += val*val;
+    }
+
+    /*calculate local droptol for each row*/
+    if ( i < A->n )
+        droptol [i] = SQRT (droptol[i]) * dtol;
 }
 
 
 int Estimate_LU_Fill( sparse_matrix *A, real *droptol )
 {
-	int i, j, pj;
-	int fillin;
-	real val;
+    int i, j, pj;
+    int fillin;
+    real val;
 
-	fillin = 0;
+    fillin = 0;
 
-	//fprintf( stderr, "n: %d\n", A->n );
-	for( i = 0; i < A->n; ++i )
-		for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-			j = A->entries[pj].j;
-			val = A->entries[pj].val;
-			//fprintf( stderr, "i: %d, j: %d", i, j );
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i )
+        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
+            j = A->entries[pj].j;
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
 
-			if( fabs(val) > droptol[i] )
-				++fillin;
-		}
+            if( fabs(val) > droptol[i] )
+                ++fillin;
+        }
 
-	return fillin + A->n;
+    return fillin + A->n;
 }
 
 GLOBAL void Cuda_Estimate_LU_Fill ( sparse_matrix p_A, real *droptol, int *fillin)
 {
-	int i, j, pj;
-	real val;
-	sparse_matrix *A = &p_A;
+    int i, j, pj;
+    real val;
+    sparse_matrix *A = &p_A;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= A->n) return;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= A->n) return;
 
-	fillin [i] = 0;
+    fillin [i] = 0;
 
-	for (pj = A->start[i]; pj < A->end[i]-1; ++pj)
-	{
-		j = A->entries [pj].j;
-		val = A->entries[pj].val;
+    for (pj = A->start[i]; pj < A->end[i]-1; ++pj)
+    {
+        j = A->entries [pj].j;
+        val = A->entries[pj].val;
 
-		if (fabs (val) > droptol [i]) ++fillin [i];
-	}
+        if (fabs (val) > droptol [i]) ++fillin [i];
+    }
 }
 
 void ICHOLT( sparse_matrix *A, real *droptol, 
-		sparse_matrix *L, sparse_matrix *U )
+        sparse_matrix *L, sparse_matrix *U )
 {
-	sparse_matrix_entry tmp[1000];
-	int i, j, pj, k1, k2, tmptop, Ltop;
-	real val;
-	int *Utop;
-
-	Utop = (int*) malloc((A->n+1) * sizeof(int));
-
-	// clear variables
-	Ltop = 0;
-	tmptop = 0;
-	for( i = 0; i <= A->n; ++i )
-		L->start[i] = U->start[i] = 0;
-
-	for( i = 0; i < A->n; ++i )
-		Utop[i] = 0;
-
-	//fprintf( stderr, "n: %d\n", A->n );
-	for( i = 0; i < A->n; ++i ){
-		L->start[i] = Ltop;
-		tmptop = 0;
-
-		for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
-			j = A->entries[pj].j;
-			val = A->entries[pj].val;
-			//fprintf( stderr, "i: %d, j: %d", i, j );
-
-			if( fabs(val) > droptol[i] ){
-				k1 = 0;
-				k2 = L->start[j];
-				while( k1 < tmptop && k2 < L->start[j+1] ){
-					if( tmp[k1].j < L->entries[k2].j )
-						++k1;
-					else if( tmp[k1].j > L->entries[k2].j )
-						++k2;
-					else
-						val -= (tmp[k1++].val * L->entries[k2++].val);
-				}
-
-				// L matrix is lower triangular, 
-				// so right before the start of next row comes jth diagonal
-				val /= L->entries[L->start[j+1]-1].val;
-
-				tmp[tmptop].j = j;
-				tmp[tmptop].val = val;
-				++tmptop;
-			}
-			//fprintf( stderr, " -- done\n" );
-		}
-
-		// compute the ith diagonal in L
-		// sanity check
-		if( A->entries[pj].j != i ) {
-			fprintf( stderr, "i=%d, badly built A matrix!\n", i );
-			exit(999);
-		}
-
-		val = A->entries[pj].val;
-		for( k1 = 0; k1 < tmptop; ++k1 )
-			val -= (tmp[k1].val * tmp[k1].val);
-
-		tmp[tmptop].j = i;
-		tmp[tmptop].val = SQRT(val);
-
-		// apply the dropping rule once again
-		//fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-		//for( k1 = 0; k1<= tmptop; ++k1 )
-		//  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-		//fprintf( stderr, "\n" );
-		//fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-		for( k1 = 0; k1 < tmptop; ++k1 )
-			if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
-				L->entries[Ltop].j = tmp[k1].j;
-				L->entries[Ltop].val = tmp[k1].val;
-				U->start[tmp[k1].j+1]++;
-				++Ltop;
-				//fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-			}
-		// keep the diagonal in any case
-		L->entries[Ltop].j = tmp[k1].j;
-		L->entries[Ltop].val = tmp[k1].val;
-		++Ltop;
-		//fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-	}
-
-	L->start[i] = Ltop;
-	//fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-
-	for( i = 1; i <= U->n; ++i )
-		Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-
-	for( i = 0; i < L->n; ++i )
-		for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-			j = L->entries[pj].j;
-			U->entries[Utop[j]].j = i;
-			U->entries[Utop[j]].val = L->entries[pj].val;
-			Utop[j]++;
-		}
-
-	//fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+    sparse_matrix_entry tmp[1000];
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val;
+    int *Utop;
+
+    Utop = (int*) malloc((A->n+1) * sizeof(int));
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    for( i = 0; i <= A->n; ++i )
+        L->start[i] = U->start[i] = 0;
+
+    for( i = 0; i < A->n; ++i )
+        Utop[i] = 0;
+
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i ){
+        L->start[i] = Ltop;
+        tmptop = 0;
+
+        for( pj = A->start[i]; pj < A->start[i+1]-1; ++pj ){
+            j = A->entries[pj].j;
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
+
+            if( fabs(val) > droptol[i] ){
+                k1 = 0;
+                k2 = L->start[j];
+                while( k1 < tmptop && k2 < L->start[j+1] ){
+                    if( tmp[k1].j < L->entries[k2].j )
+                        ++k1;
+                    else if( tmp[k1].j > L->entries[k2].j )
+                        ++k2;
+                    else
+                        val -= (tmp[k1++].val * L->entries[k2++].val);
+                }
+
+                // L matrix is lower triangular, 
+                // so right before the start of next row comes jth diagonal
+                val /= L->entries[L->start[j+1]-1].val;
+
+                tmp[tmptop].j = j;
+                tmp[tmptop].val = val;
+                ++tmptop;
+            }
+            //fprintf( stderr, " -- done\n" );
+        }
+
+        // compute the ith diagonal in L
+        // sanity check
+        if( A->entries[pj].j != i ) {
+            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
+            exit(999);
+        }
+
+        val = A->entries[pj].val;
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            val -= (tmp[k1].val * tmp[k1].val);
+
+        tmp[tmptop].j = i;
+        tmp[tmptop].val = SQRT(val);
+
+        // apply the dropping rule once again
+        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
+        //for( k1 = 0; k1<= tmptop; ++k1 )
+        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
+        //fprintf( stderr, "\n" );
+        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
+                L->entries[Ltop].j = tmp[k1].j;
+                L->entries[Ltop].val = tmp[k1].val;
+                U->start[tmp[k1].j+1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        // keep the diagonal in any case
+        L->entries[Ltop].j = tmp[k1].j;
+        L->entries[Ltop].val = tmp[k1].val;
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+
+    L->start[i] = Ltop;
+    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+
+    for( i = 1; i <= U->n; ++i )
+        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
+
+    for( i = 0; i < L->n; ++i )
+        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
+            j = L->entries[pj].j;
+            U->entries[Utop[j]].j = i;
+            U->entries[Utop[j]].val = L->entries[pj].val;
+            Utop[j]++;
+        }
+
+    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
 }
 
 
 
 void Cuda_ICHOLT( sparse_matrix *A, real *droptol, 
-		sparse_matrix *L, sparse_matrix *U )
+        sparse_matrix *L, sparse_matrix *U )
 {
-	sparse_matrix_entry tmp[1000];
-	int i, j, pj, k1, k2, tmptop, Ltop;
-	real val;
-	int *Utop;
-
-	Utop = (int*) malloc((A->n+1) * sizeof(int));
-
-	// clear variables
-	Ltop = 0;
-	tmptop = 0;
-	for( i = 0; i <= A->n; ++i )
-		L->start[i] = U->start[i] = 0;
-
-	for( i = 0; i < A->n; ++i )
-		Utop[i] = 0;
-
-	//fprintf( stderr, "n: %d\n", A->n );
-	for( i = 0; i < A->n; ++i ){
-		L->start[i] = Ltop;
-		tmptop = 0;
-
-		for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){
-			j = A->entries[pj].j;
-			val = A->entries[pj].val;
-			//fprintf( stderr, "i: %d, j: %d", i, j );
-
-			//CHANGE ORIGINAL
-			if (j >= i) break;
-			//CHANGE ORIGINAL
-
-			if( fabs(val) > droptol[i] ){
-				k1 = 0;
-				k2 = L->start[j];
-				while( k1 < tmptop && k2 < L->start[j+1] ){
-					if( tmp[k1].j < L->entries[k2].j )
-						++k1;
-					else if( tmp[k1].j > L->entries[k2].j )
-						++k2;
-					else
-						val -= (tmp[k1++].val * L->entries[k2++].val);
-				}
-
-				// L matrix is lower triangular, 
-				// so right before the start of next row comes jth diagonal
-				val /= L->entries[L->start[j+1]-1].val;
-
-				tmp[tmptop].j = j;
-				tmp[tmptop].val = val;
-				++tmptop;
-			}
-
-			//fprintf( stderr, " -- done\n" );
-		}
-
-		// compute the ith diagonal in L
-		// sanity check
-		if( A->entries[pj].j != i ) {
-			fprintf( stderr, "i=%d, badly built A matrix!\n", i );
-			exit(999);
-		}
-
-		val = A->entries[pj].val;
-		for( k1 = 0; k1 < tmptop; ++k1 )
-			val -= (tmp[k1].val * tmp[k1].val);
-
-		tmp[tmptop].j = i;
-		tmp[tmptop].val = SQRT(val);
-
-		// apply the dropping rule once again
-		//fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
-		//for( k1 = 0; k1<= tmptop; ++k1 )
-		//  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
-		//fprintf( stderr, "\n" );
-		//fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
-		for( k1 = 0; k1 < tmptop; ++k1 )
-			if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
-				L->entries[Ltop].j = tmp[k1].j;
-				L->entries[Ltop].val = tmp[k1].val;
-				U->start[tmp[k1].j+1]++;
-				++Ltop;
-				//fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
-			}
-		// keep the diagonal in any case
-		L->entries[Ltop].j = tmp[k1].j;
-		L->entries[Ltop].val = tmp[k1].val;
-		++Ltop;
-		//fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
-	}
-
-	L->start[i] = Ltop;
-	//fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
-
-	for( i = 1; i <= U->n; ++i )
-		Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-
-	for( i = 0; i < L->n; ++i )
-		for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-			j = L->entries[pj].j;
-			U->entries[Utop[j]].j = i;
-			U->entries[Utop[j]].val = L->entries[pj].val;
-			Utop[j]++;
-		}
-
-	//fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
+    sparse_matrix_entry tmp[1000];
+    int i, j, pj, k1, k2, tmptop, Ltop;
+    real val;
+    int *Utop;
+
+    Utop = (int*) malloc((A->n+1) * sizeof(int));
+
+    // clear variables
+    Ltop = 0;
+    tmptop = 0;
+    for( i = 0; i <= A->n; ++i )
+        L->start[i] = U->start[i] = 0;
+
+    for( i = 0; i < A->n; ++i )
+        Utop[i] = 0;
+
+    //fprintf( stderr, "n: %d\n", A->n );
+    for( i = 0; i < A->n; ++i ){
+        L->start[i] = Ltop;
+        tmptop = 0;
+
+        for( pj = A->start[i]; pj < A->end[i]-1; ++pj ){
+            j = A->entries[pj].j;
+            val = A->entries[pj].val;
+            //fprintf( stderr, "i: %d, j: %d", i, j );
+
+            //CHANGE ORIGINAL
+            if (j >= i) break;
+            //CHANGE ORIGINAL
+
+            if( fabs(val) > droptol[i] ){
+                k1 = 0;
+                k2 = L->start[j];
+                while( k1 < tmptop && k2 < L->start[j+1] ){
+                    if( tmp[k1].j < L->entries[k2].j )
+                        ++k1;
+                    else if( tmp[k1].j > L->entries[k2].j )
+                        ++k2;
+                    else
+                        val -= (tmp[k1++].val * L->entries[k2++].val);
+                }
+
+                // L matrix is lower triangular, 
+                // so right before the start of next row comes jth diagonal
+                val /= L->entries[L->start[j+1]-1].val;
+
+                tmp[tmptop].j = j;
+                tmp[tmptop].val = val;
+                ++tmptop;
+            }
+
+            //fprintf( stderr, " -- done\n" );
+        }
+
+        // compute the ith diagonal in L
+        // sanity check
+        if( A->entries[pj].j != i ) {
+            fprintf( stderr, "i=%d, badly built A matrix!\n", i );
+            exit(999);
+        }
+
+        val = A->entries[pj].val;
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            val -= (tmp[k1].val * tmp[k1].val);
+
+        tmp[tmptop].j = i;
+        tmp[tmptop].val = SQRT(val);
+
+        // apply the dropping rule once again
+        //fprintf( stderr, "row%d: tmptop: %d\n", i, tmptop );
+        //for( k1 = 0; k1<= tmptop; ++k1 )
+        //  fprintf( stderr, "%d(%f)  ", tmp[k1].j, tmp[k1].val );
+        //fprintf( stderr, "\n" );
+        //fprintf( stderr, "row(%d): droptol=%.4f\n", i+1, droptol[i] );
+        for( k1 = 0; k1 < tmptop; ++k1 )
+            if( fabs(tmp[k1].val) > droptol[i] / tmp[tmptop].val ){
+                L->entries[Ltop].j = tmp[k1].j;
+                L->entries[Ltop].val = tmp[k1].val;
+                U->start[tmp[k1].j+1]++;
+                ++Ltop;
+                //fprintf( stderr, "%d(%.4f)  ", tmp[k1].j+1, tmp[k1].val );
+            }
+        // keep the diagonal in any case
+        L->entries[Ltop].j = tmp[k1].j;
+        L->entries[Ltop].val = tmp[k1].val;
+        ++Ltop;
+        //fprintf( stderr, "%d(%.4f)\n", tmp[k1].j+1,  tmp[k1].val );
+    }
+
+    L->start[i] = Ltop;
+    //fprintf( stderr, "nnz(L): %d, max: %d\n", Ltop, L->n * 50 );
+
+    for( i = 1; i <= U->n; ++i )
+        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
+
+    for( i = 0; i < L->n; ++i )
+        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
+            j = L->entries[pj].j;
+            U->entries[Utop[j]].j = i;
+            U->entries[Utop[j]].val = L->entries[pj].val;
+            Utop[j]++;
+        }
+
+    //fprintf( stderr, "nnz(U): %d, max: %d\n", Utop[U->n], U->n * 50 );
 }
 
 
@@ -534,29 +534,29 @@ __syncthreads ();
 // sanity check
 if (kid == 0) 
 {
-	if( A->entries[end].j != i ) {
-		//intentional core dump here for sanity sake
-		*null_val = 1;
-	}
+    if( A->entries[end].j != i ) {
+        //intentional core dump here for sanity sake
+        *null_val = 1;
+    }
 }
 
 //diagnol element
 //val = A->entries[pj].val;
 //for( k1 = 0; k1 < tmptop; ++k1 )
 if (kid < count) 
-	tmp_val[kid] = (tmp[kid].val * tmp[kid].val);
+    tmp_val[kid] = (tmp[kid].val * tmp[kid].val);
 
-	__syncthreads ();
+    __syncthreads ();
 
 if (kid == 0)
 {
-	val = A->entries [end].val;
-	for (i = 0; i < count; i++)
-		tempvalue += tmp_val [i];
+    val = A->entries [end].val;
+    for (i = 0; i < count; i++)
+        tempvalue += tmp_val [i];
 
-	val -= tempvalue;
-	tmp[tmptop].j = i;
-	tmp[tmptop].val = SQRT(val);
+    val -= tempvalue;
+    tmp[tmptop].j = i;
+    tmp[tmptop].val = SQRT(val);
 }
 __syncthreads ();
 
@@ -564,510 +564,510 @@ __syncthreads ();
 //for( k1 = 0; k1 < count; ++k1 )
 if (kid < count )
 {
-	if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){
-		L->entries[offset + kid].j = tmp[kid].j;
-		L->entries[offset + kid].val = tmp[kid].val;
-		U->start[tmp[kid].j+1]++;
-	}
+    if( fabs(tmp[kid].val) > droptol[i] / tmp[tmptop].val ){
+        L->entries[offset + kid].j = tmp[kid].j;
+        L->entries[offset + kid].val = tmp[kid].val;
+        U->start[tmp[kid].j+1]++;
+    }
 }
 __syncthreads ();
 
 if (kid == 0) {
-	// keep the diagonal in any case
-	offset += count;
-	L->entries[offset].j = tmp[count].j;
-	L->entries[offset].val = tmp[count].val;
-	++offset;
-	L->end [i] = offset;
+    // keep the diagonal in any case
+    offset += count;
+    L->entries[offset].j = tmp[count].j;
+    L->entries[offset].val = tmp[count].val;
+    ++offset;
+    L->end [i] = offset;
 }
 __syncthreads ();
 } // end of main for loop
 }
 
-void Cuda_Fill_U	( sparse_matrix *A, real *droptol, 
-		sparse_matrix *L, sparse_matrix *U )
+void Cuda_Fill_U    ( sparse_matrix *A, real *droptol, 
+        sparse_matrix *L, sparse_matrix *U )
 {
-	int i, pj, j;
-
-	for( i = 1; i <= U->n; ++i )
-		Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
-
-	for( i = 0; i < L->n; ++i )
-		for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
-			j = L->entries[pj].j;
-			U->entries[Utop[j]].j = i;
-			U->entries[Utop[j]].val = L->entries[pj].val;
-			Utop[j]++;
-		}
+    int i, pj, j;
+
+    for( i = 1; i <= U->n; ++i )
+        Utop[i] = U->start[i] = U->start[i] + U->start[i-1] + 1;
+
+    for( i = 0; i < L->n; ++i )
+        for( pj = L->start[i]; pj < L->start[i+1]; ++pj ){
+            j = L->entries[pj].j;
+            U->entries[Utop[j]].j = i;
+            U->entries[Utop[j]].val = L->entries[pj].val;
+            Utop[j]++;
+        }
 }
 */
 
 
 void Init_MatVec( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list *far_nbrs )
+        simulation_data *data, static_storage *workspace, 
+        list *far_nbrs )
 {
-	int i, fillin;
-	real s_tmp, t_tmp;
-	//char fname[100];
+    int i, fillin;
+    real s_tmp, t_tmp;
+    //char fname[100];
 
-	if(control->refactor > 0 && 
-			((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){
-		//Print_Linear_System( system, control, workspace, data->step );
-		Sort_Matrix_Rows( &workspace->H );
+    if(control->refactor > 0 && 
+            ((data->step-data->prev_steps)%control->refactor==0 || workspace->L.entries==NULL)){
+        //Print_Linear_System( system, control, workspace, data->step );
+        Sort_Matrix_Rows( &workspace->H );
 
-		//fprintf( stderr, "H matrix sorted\n" );
+        //fprintf( stderr, "H matrix sorted\n" );
 
-		Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
-		//fprintf( stderr, "drop tolerances calculated\n" );
+        Calculate_Droptol( &workspace->H, workspace->droptol, control->droptol ); 
+        //fprintf( stderr, "drop tolerances calculated\n" );
 
 
-		if( workspace->L.entries == NULL ) {
-			fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
+        if( workspace->L.entries == NULL ) {
+            fillin = Estimate_LU_Fill( &workspace->H, workspace->droptol );
 #ifdef __DEBUG_CUDA__
-			fprintf( stderr, "fillin = %d\n", fillin );
+            fprintf( stderr, "fillin = %d\n", fillin );
 #endif
-			if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
-					Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){
-				fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
-				exit(INSUFFICIENT_SPACE);
-			}
+            if( Allocate_Matrix( &(workspace->L), far_nbrs->n, fillin ) == 0 ||
+                    Allocate_Matrix( &(workspace->U), far_nbrs->n, fillin ) == 0 ){
+                fprintf( stderr, "not enough memory for LU matrices. terminating.\n" );
+                exit(INSUFFICIENT_SPACE);
+            }
 #if defined(DEBUG_FOCUS)
-			fprintf( stderr, "fillin = %d\n", fillin );
-			fprintf( stderr, "allocated memory: L = U = %ldMB\n",
-					fillin * sizeof(sparse_matrix_entry) / (1024*1024) );
+            fprintf( stderr, "fillin = %d\n", fillin );
+            fprintf( stderr, "allocated memory: L = U = %ldMB\n",
+                    fillin * sizeof(sparse_matrix_entry) / (1024*1024) );
 #endif
-		}
+        }
 
-		ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U );
+        ICHOLT( &workspace->H, workspace->droptol, &workspace->L, &workspace->U );
 
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "icholt-" );
-		//sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
-		//Print_Sparse_Matrix2( workspace->L, fname );
-		//Print_Sparse_Matrix( U );
+        fprintf( stderr, "icholt-" );
+        //sprintf( fname, "%s.L%d.out", control->sim_name, data->step );
+        //Print_Sparse_Matrix2( workspace->L, fname );
+        //Print_Sparse_Matrix( U );
 #endif
-	}
-
-	/* extrapolation for s & t */
-	for( i = 0; i < system->N; ++i ) {
-		// no extrapolation
-		//s_tmp = workspace->s[0][i];
-		//t_tmp = workspace->t[0][i];
-
-		// linear
-		//s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-		//t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
-
-		// quadratic
-		//s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-		t_tmp = workspace->t[index_wkspace_sys(2,i,system)] + 3*(workspace->t[index_wkspace_sys(0,i,system)]-workspace->t[index_wkspace_sys(1,i,system)]);
-
-		// cubic
-		s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system)] + workspace->s[index_wkspace_sys(2,i,system)]) - 
-			(6 * workspace->s[index_wkspace_sys(1,i,system)] + workspace->s[index_wkspace_sys(3,i,system)] );
-		//t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
-		//  (6 * workspace->t[1][i] + workspace->t[3][i] );
-
-		// 4th order
-		//s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
-		//  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-		//t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
-		//  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-
-		workspace->s[index_wkspace_sys(4,i,system)] = workspace->s[index_wkspace_sys(3,i,system)];
-		workspace->s[index_wkspace_sys(3,i,system)] = workspace->s[index_wkspace_sys(2,i,system)]; 
-		workspace->s[index_wkspace_sys(2,i,system)] = workspace->s[index_wkspace_sys(1,i,system)];
-		workspace->s[index_wkspace_sys(1,i,system)] = workspace->s[index_wkspace_sys(0,i,system)];
-		workspace->s[index_wkspace_sys(0,i,system)] = s_tmp;
-
-		workspace->t[index_wkspace_sys(4,i,system)] = workspace->t[index_wkspace_sys(3,i,system)];
-		workspace->t[index_wkspace_sys(3,i,system)] = workspace->t[index_wkspace_sys(2,i,system)]; 
-		workspace->t[index_wkspace_sys(2,i,system)] = workspace->t[index_wkspace_sys(1,i,system)];
-		workspace->t[index_wkspace_sys(1,i,system)] = workspace->t[index_wkspace_sys(0,i,system)];
-		workspace->t[index_wkspace_sys(0,i,system)] = t_tmp;
-	}
+    }
+
+    /* extrapolation for s & t */
+    for( i = 0; i < system->N; ++i ) {
+        // no extrapolation
+        //s_tmp = workspace->s[0][i];
+        //t_tmp = workspace->t[0][i];
+
+        // linear
+        //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
+        //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
+
+        // quadratic
+        //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
+        t_tmp = workspace->t[index_wkspace_sys(2,i,system)] + 3*(workspace->t[index_wkspace_sys(0,i,system)]-workspace->t[index_wkspace_sys(1,i,system)]);
+
+        // cubic
+        s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,system)] + workspace->s[index_wkspace_sys(2,i,system)]) - 
+            (6 * workspace->s[index_wkspace_sys(1,i,system)] + workspace->s[index_wkspace_sys(3,i,system)] );
+        //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
+        //  (6 * workspace->t[1][i] + workspace->t[3][i] );
+
+        // 4th order
+        //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
+        //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
+        //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
+        //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
+
+        workspace->s[index_wkspace_sys(4,i,system)] = workspace->s[index_wkspace_sys(3,i,system)];
+        workspace->s[index_wkspace_sys(3,i,system)] = workspace->s[index_wkspace_sys(2,i,system)]; 
+        workspace->s[index_wkspace_sys(2,i,system)] = workspace->s[index_wkspace_sys(1,i,system)];
+        workspace->s[index_wkspace_sys(1,i,system)] = workspace->s[index_wkspace_sys(0,i,system)];
+        workspace->s[index_wkspace_sys(0,i,system)] = s_tmp;
+
+        workspace->t[index_wkspace_sys(4,i,system)] = workspace->t[index_wkspace_sys(3,i,system)];
+        workspace->t[index_wkspace_sys(3,i,system)] = workspace->t[index_wkspace_sys(2,i,system)]; 
+        workspace->t[index_wkspace_sys(2,i,system)] = workspace->t[index_wkspace_sys(1,i,system)];
+        workspace->t[index_wkspace_sys(1,i,system)] = workspace->t[index_wkspace_sys(0,i,system)];
+        workspace->t[index_wkspace_sys(0,i,system)] = t_tmp;
+    }
 }
 
-void Cuda_Init_MatVec( 	reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list *far_nbrs )
+void Cuda_Init_MatVec(     reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        list *far_nbrs )
 {
-	int i, fillin;
-	real s_tmp, t_tmp;
-	int *spad = (int *)scratch;
-	real start = 0, end = 0;
+    int i, fillin;
+    real s_tmp, t_tmp;
+    int *spad = (int *)scratch;
+    real start = 0, end = 0;
 
-	if(control->refactor > 0 && 
-			((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){
+    if(control->refactor > 0 && 
+            ((data->step-data->prev_steps)%control->refactor==0 || dev_workspace->L.entries==NULL)){
 
-		Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
-			( dev_workspace->H );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
+            ( dev_workspace->H );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Sorting done... \n");
+        fprintf (stderr, "Sorting done... \n");
 #endif
 
-		Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>>
-			( dev_workspace->H, dev_workspace->droptol, control->droptol );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        Cuda_Calculate_Droptol <<<BLOCKS, BLOCK_SIZE >>>
+            ( dev_workspace->H, dev_workspace->droptol, control->droptol );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Droptol done... \n");
+        fprintf (stderr, "Droptol done... \n");
 #endif
 
-		if( dev_workspace->L.entries == NULL ) {
+        if( dev_workspace->L.entries == NULL ) {
 
-			cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
-			Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
-				( dev_workspace->H, dev_workspace->droptol, spad );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            cuda_memset ( spad, 0, 2 * INT_SIZE * system->N, RES_SCRATCH );
+            Cuda_Estimate_LU_Fill <<< BLOCKS, BLOCK_SIZE >>>
+                ( dev_workspace->H, dev_workspace->droptol, spad );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			//Reduction for fill in 
-			Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
-				(spad, spad + system->N,  system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            //Reduction for fill in 
+            Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, INT_SIZE * BLOCK_SIZE >>>  
+                (spad, spad + system->N,  system->N);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
-				(spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); 
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            Cuda_reduction <<<1, BLOCKS_POW_2, INT_SIZE * BLOCKS_POW_2>>> 
+                (spad + system->N, spad + system->N + BLOCKS_POW_2, BLOCKS_POW_2); 
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
-			fillin += dev_workspace->H.n;
+            copy_host_device (&fillin, spad + system->N + BLOCKS_POW_2, INT_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH );
+            fillin += dev_workspace->H.n;
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin );
+            fprintf (stderr, "Calculated value of the fill in is --> %d \n ", fillin );
 #endif
 
-			dev_workspace->L.n = far_nbrs->n;
-			dev_workspace->L.m = fillin;
-			Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n );
+            dev_workspace->L.n = far_nbrs->n;
+            dev_workspace->L.m = fillin;
+            Cuda_Init_Sparse_Matrix( &dev_workspace->L, fillin, far_nbrs->n );
 
-			dev_workspace->U.n = far_nbrs->n;
-			dev_workspace->U.m = fillin;
-			Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n );
-		}
+            dev_workspace->U.n = far_nbrs->n;
+            dev_workspace->U.m = fillin;
+            Cuda_Init_Sparse_Matrix( &dev_workspace->U, fillin, far_nbrs->n );
+        }
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "LU matrix done...\n");
+        fprintf (stderr, "LU matrix done...\n");
 #endif
 
-		//TODO -- This is the ILU Factorization of the H Matrix. 
-		//This is present in the CUDA 5.0 compilation which is not working currently. 
-		//Fix this when CUDA 5.0 is correctly setup. 
-		//TODO
-		//shared memory is per block
-		// here we have only one block - 
-		/*
-		   fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries );
-		   Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, 
-		   system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE)   >>>
-		   ( system, dev_workspace->H, 
-		   dev_workspace->droptol, 
-		   dev_workspace->L, 
-		   dev_workspace->U );
-		   cudaThreadSynchronize ();
-		   fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ());
-		 */
-
-		//1. copy the H matrix from device to host
-		//2. Allocate the L/U matrices on the host and device. 
-		//3. Compute the L/U on the host
-		//4. copy the results to the device
-		//5. Continue the computation.
-		sparse_matrix t_H, t_L, t_U;
-		real *t_droptol;
-
-		t_droptol = (real *) malloc (REAL_SIZE * system->N);
+        //TODO -- This is the ILU Factorization of the H Matrix. 
+        //This is present in the CUDA 5.0 compilation which is not working currently. 
+        //Fix this when CUDA 5.0 is correctly setup. 
+        //TODO
+        //shared memory is per block
+        // here we have only one block - 
+        /*
+           fprintf (stderr, "max sparse matrix entries %d \n", system->max_sparse_matrix_entries );
+           Cuda_ICHOLT <<<1, system->max_sparse_matrix_entries, 
+           system->max_sparse_matrix_entries *(REAL_SIZE + SPARSE_MATRIX_ENTRY_SIZE)   >>>
+           ( system, dev_workspace->H, 
+           dev_workspace->droptol, 
+           dev_workspace->L, 
+           dev_workspace->U );
+           cudaThreadSynchronize ();
+           fprintf (stderr, "Cuda_ICHOLT .. done ...-> %d\n ", cudaGetLastError ());
+         */
+
+        //1. copy the H matrix from device to host
+        //2. Allocate the L/U matrices on the host and device. 
+        //3. Compute the L/U on the host
+        //4. copy the results to the device
+        //5. Continue the computation.
+        sparse_matrix t_H, t_L, t_U;
+        real *t_droptol;
+
+        t_droptol = (real *) malloc (REAL_SIZE * system->N);
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
+        fprintf (stderr, " Allocation temp matrices count %d entries %d \n", dev_workspace->H.n, dev_workspace->H.m );
 #endif
-		start = Get_Time ();
-		if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);}
-		if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); }
-		if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); }
-
-		copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-		copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
-		copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
-
-		copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
-
-		//fprintf (stderr, " Done copying LUH .. \n");
-		Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U);
-
-		Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice);
-		end += Get_Timing_Info (start);
-
-		/*
-		   fprintf (stderr, "Done syncing .... \n");
-		   free (t_droptol);
-		   fprintf (stderr, "Freed droptol ... \n");
-		   Deallocate_Matrix (&t_H);
-		   fprintf (stderr, "Freed H ... \n");
-		   Deallocate_Matrix (&t_L);
-		   fprintf (stderr, "Freed l ... \n");
-		   Deallocate_Matrix (&t_U);
-		   fprintf (stderr, "Freed u ... \n");
-		 */
-
-		//#ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end);
-		//#endif
-
-		//#ifdef __BUILD_DEBUG__
-		//		validate_lu (workspace);
-		//#endif
-	}
+        start = Get_Time ();
+        if (!Allocate_Matrix (&t_H, dev_workspace->H.n, dev_workspace->H.m)) { fprintf (stderr, "No space for H matrix \n"); exit (0);}
+        if (!Allocate_Matrix (&t_L, far_nbrs->n, dev_workspace->L.m)) { fprintf (stderr, "No space for L matrix \n"); exit (0); }
+        if (!Allocate_Matrix (&t_U, far_nbrs->n, dev_workspace->U.m)) { fprintf (stderr, "No space for U matrix \n"); exit (0); }
+
+        copy_host_device ( t_H.start, dev_workspace->H.start, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
+        copy_host_device ( t_H.end, dev_workspace->H.end, INT_SIZE * (dev_workspace->H.n + 1), cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_INDEX );
+        copy_host_device ( t_H.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->H.m, cudaMemcpyDeviceToHost, RES_SPARSE_MATRIX_ENTRY );
+
+        copy_host_device ( t_droptol, dev_workspace->droptol, REAL_SIZE * system->N, cudaMemcpyDeviceToHost, RES_STORAGE_DROPTOL );
+
+        //fprintf (stderr, " Done copying LUH .. \n");
+        Cuda_ICHOLT (&t_H, t_droptol, &t_L, &t_U);
+
+        Sync_Host_Device (&t_L, &t_U, cudaMemcpyHostToDevice);
+        end += Get_Timing_Info (start);
+
+        /*
+           fprintf (stderr, "Done syncing .... \n");
+           free (t_droptol);
+           fprintf (stderr, "Freed droptol ... \n");
+           Deallocate_Matrix (&t_H);
+           fprintf (stderr, "Freed H ... \n");
+           Deallocate_Matrix (&t_L);
+           fprintf (stderr, "Freed l ... \n");
+           Deallocate_Matrix (&t_U);
+           fprintf (stderr, "Freed u ... \n");
+         */
+
+        //#ifdef __DEBUG_CUDA__
+        fprintf (stderr, "Done copying the L/U matrices to the device ---> %f \n", end);
+        //#endif
+
+        //#ifdef __BUILD_DEBUG__
+        //        validate_lu (workspace);
+        //#endif
+    }
 }
 
 GLOBAL void Init_MatVec_Postprocess (static_storage p_workspace, int N )
 {
 
-	static_storage *workspace = &p_workspace;
-	real s_tmp, t_tmp;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if (i >= N) return;
-	// no extrapolation
-	//s_tmp = workspace->s[0][i];
-	//t_tmp = workspace->t[0][i];
-
-	// linear
-	//s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
-	//t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
-
-	// quadratic
-	//s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
-	t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]);
-
-	// cubic
-	s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - 
-		(6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] );
-	//t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
-	//  (6 * workspace->t[1][i] + workspace->t[3][i] );
-
-	// 4th order
-	//s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
-	//  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
-	//t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
-	//  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
-
-	workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)];
-	workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; 
-	workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)];
-	workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)];
-	workspace->s[index_wkspace_sys(0,i,N)] = s_tmp;
-
-	workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)];
-	workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; 
-	workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)];
-	workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)];
-	workspace->t[index_wkspace_sys(0,i,N)] = t_tmp;
+    static_storage *workspace = &p_workspace;
+    real s_tmp, t_tmp;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i >= N) return;
+    // no extrapolation
+    //s_tmp = workspace->s[0][i];
+    //t_tmp = workspace->t[0][i];
+
+    // linear
+    //s_tmp = 2 * workspace->s[0][i] - workspace->s[1][i];
+    //t_tmp = 2 * workspace->t[0][i] - workspace->t[1][i];
+
+    // quadratic
+    //s_tmp = workspace->s[2][i] + 3 * (workspace->s[0][i]-workspace->s[1][i]);
+    t_tmp = workspace->t[index_wkspace_sys(2,i,N)] + 3*(workspace->t[index_wkspace_sys(0,i,N)]-workspace->t[index_wkspace_sys(1,i,N)]);
+
+    // cubic
+    s_tmp = 4 * (workspace->s[index_wkspace_sys(0,i,N)] + workspace->s[index_wkspace_sys(2,i,N)]) - 
+        (6 * workspace->s[index_wkspace_sys(1,i,N)] + workspace->s[index_wkspace_sys(3,i,N)] );
+    //t_tmp = 4 * (workspace->t[0][i] + workspace->t[2][i]) - 
+    //  (6 * workspace->t[1][i] + workspace->t[3][i] );
+
+    // 4th order
+    //s_tmp = 5 * (workspace->s[0][i] - workspace->s[3][i]) + 
+    //  10 * (-workspace->s[1][i] + workspace->s[2][i] ) + workspace->s[4][i];
+    //t_tmp = 5 * (workspace->t[0][i] - workspace->t[3][i]) + 
+    //  10 * (-workspace->t[1][i] + workspace->t[2][i] ) + workspace->t[4][i];
+
+    workspace->s[index_wkspace_sys(4,i,N)] = workspace->s[index_wkspace_sys(3,i,N)];
+    workspace->s[index_wkspace_sys(3,i,N)] = workspace->s[index_wkspace_sys(2,i,N)]; 
+    workspace->s[index_wkspace_sys(2,i,N)] = workspace->s[index_wkspace_sys(1,i,N)];
+    workspace->s[index_wkspace_sys(1,i,N)] = workspace->s[index_wkspace_sys(0,i,N)];
+    workspace->s[index_wkspace_sys(0,i,N)] = s_tmp;
+
+    workspace->t[index_wkspace_sys(4,i,N)] = workspace->t[index_wkspace_sys(3,i,N)];
+    workspace->t[index_wkspace_sys(3,i,N)] = workspace->t[index_wkspace_sys(2,i,N)]; 
+    workspace->t[index_wkspace_sys(2,i,N)] = workspace->t[index_wkspace_sys(1,i,N)];
+    workspace->t[index_wkspace_sys(1,i,N)] = workspace->t[index_wkspace_sys(0,i,N)];
+    workspace->t[index_wkspace_sys(0,i,N)] = t_tmp;
 }
 
 void Calculate_Charges( reax_system *system, static_storage *workspace )
 {
-	int i;
-	real u, s_sum, t_sum;
+    int i;
+    real u, s_sum, t_sum;
 
-	s_sum = t_sum = 0.;
-	for( i = 0; i < system->N; ++i ) {
-		s_sum += workspace->s[index_wkspace_sys(0,i,system)];
-		t_sum += workspace->t[index_wkspace_sys(0,i,system)];
-	}
+    s_sum = t_sum = 0.;
+    for( i = 0; i < system->N; ++i ) {
+        s_sum += workspace->s[index_wkspace_sys(0,i,system)];
+        t_sum += workspace->t[index_wkspace_sys(0,i,system)];
+    }
 
-	u = s_sum / t_sum;
+    u = s_sum / t_sum;
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
+    fprintf (stderr, "Host --->s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
 #endif
 
-	for( i = 0; i < system->N; ++i )
-		system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)];
+    for( i = 0; i < system->N; ++i )
+        system->atoms[i].q = workspace->s[index_wkspace_sys(0,i,system)] - u * workspace->t[index_wkspace_sys(0,i,system)];
 }
 
 GLOBAL void Cuda_Update_Atoms_q ( reax_atom *atoms, real *s, real u, real *t, int N)
 {
-	int i = blockIdx.x*blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    int i = blockIdx.x*blockDim.x + threadIdx.x;
+    if (i >= N) return;
 
-	atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)];
+    atoms[i].q = s[index_wkspace_sys(0,i,N)] - u * t[index_wkspace_sys(0,i,N)];
 }
 
 void Cuda_Calculate_Charges (reax_system *system, static_storage *workspace)
 {
-	real *spad = (real *) scratch;
-	real u, s_sum, t_sum;
+    real *spad = (real *) scratch;
+    real u, s_sum, t_sum;
 
-	cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
 
-	//s_sum 
-	Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
-		(&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    //s_sum 
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
+        (&dev_workspace->s [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-		(spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (&s_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
 
-	//t_sum
-	cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
-	Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
-		(&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    //t_sum
+    cuda_memset (spad, 0, (BLOCKS_POW_2 * 2 * REAL_SIZE), RES_SCRATCH );
+    Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>  
+        (&dev_workspace->t [index_wkspace_sys (0, 0,system->N)], spad,  system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-		(spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (spad, spad+BLOCKS_POW_2, BLOCKS_POW_2); 
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (&t_sum, spad+BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
 
-	//fraction here
-	u = s_sum / t_sum;
+    //fraction here
+    u = s_sum / t_sum;
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
+    fprintf (stderr, "DEVICE ---> s %13.2f, t %13.f, u %13.2f \n", s_sum, t_sum, u );
 #endif
 
-	Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>>
-		( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Cuda_Update_Atoms_q <<< BLOCKS, BLOCK_SIZE >>>
+        ( (reax_atom *)system->d_atoms, dev_workspace->s, u, dev_workspace->t, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 
 void QEq( reax_system *system, control_params *control, simulation_data *data, 
-		static_storage *workspace, list *far_nbrs, 
-		output_controls *out_control )
+        static_storage *workspace, list *far_nbrs, 
+        output_controls *out_control )
 {
-	int matvecs;
+    int matvecs;
 
-	//real t_start, t_elapsed;
+    //real t_start, t_elapsed;
 
-	//t_start = Get_Time ();
-	Init_MatVec( system, control, data, workspace, far_nbrs );
-	//t_elapsed = Get_Timing_Info ( t_start );
+    //t_start = Get_Time ();
+    Init_MatVec( system, control, data, workspace, far_nbrs );
+    //t_elapsed = Get_Timing_Info ( t_start );
 
-	//fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed );
+    //fprintf (stderr, " CPU Init_MatVec timing ----> %f \n", t_elapsed );
 
-	//if( data->step % 10 == 0 )
-	//  Print_Linear_System( system, control, workspace, far_nbrs, data->step );
+    //if( data->step % 10 == 0 )
+    //  Print_Linear_System( system, control, workspace, far_nbrs, data->step );
 
-	//t_start = Get_Time ( );
-	matvecs = GMRES( workspace, &workspace->H, 
-			workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system );
-	matvecs += GMRES( workspace, &workspace->H, 
-			workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system );
-	//t_elapsed = Get_Timing_Info ( t_start );
+    //t_start = Get_Time ( );
+    matvecs = GMRES( workspace, &workspace->H, 
+            workspace->b_s, control->q_err, &workspace->s[0], out_control->log, system );
+    matvecs += GMRES( workspace, &workspace->H, 
+            workspace->b_t, control->q_err, &workspace->t[0], out_control->log, system );
+    //t_elapsed = Get_Timing_Info ( t_start );
 
-	//fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed );
+    //fprintf (stderr, " CPU GMRES timing ---> %f \n", t_elapsed );
 
-	//matvecs = GMRES_HouseHolder( workspace, workspace->H, 
-	//    workspace->b_s, control->q_err, workspace->s[0], out_control->log );
-	//matvecs += GMRES_HouseHolder( workspace, workspace->H,  
-	//    workspace->b_t, control->q_err, workspace->t[0], out_control->log );
+    //matvecs = GMRES_HouseHolder( workspace, workspace->H, 
+    //    workspace->b_s, control->q_err, workspace->s[0], out_control->log );
+    //matvecs += GMRES_HouseHolder( workspace, workspace->H,  
+    //    workspace->b_t, control->q_err, workspace->t[0], out_control->log );
 
-	//matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err,
-	//  &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system)], out_control->log, system );
-	//matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err,
-	//  &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system)], out_control->log, system );
+    //matvecs = PGMRES( workspace, &workspace->H, workspace->b_s, control->q_err,
+    //  &workspace->L, &workspace->U, &workspace->s[index_wkspace_sys(0,0,system)], out_control->log, system );
+    //matvecs += PGMRES( workspace, &workspace->H, workspace->b_t, control->q_err,
+    //  &workspace->L, &workspace->U, &workspace->t[index_wkspace_sys(0,0,system)], out_control->log, system );
 
-	//matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, 
-	//	  workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
-	///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, 
-	//     workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
+    //matvecs=PCG( workspace, workspace->H, workspace->b_s, control->q_err, 
+    //      workspace->L, workspace->U, workspace->s[0], out_control->log ) + 1;
+    ///matvecs+=PCG( workspace, workspace->H, workspace->b_t, control->q_err, 
+    //     workspace->L, workspace->U, workspace->t[0], out_control->log ) + 1;
 
-	//matvecs = CG( workspace, workspace->H, 
-	// workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-	//matvecs += CG( workspace, workspace->H, 
-	// workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
+    //matvecs = CG( workspace, workspace->H, 
+    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
+    //matvecs += CG( workspace, workspace->H, 
+    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
 
-	//matvecs = SDM( workspace, workspace->H, 
-	// workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
-	//matvecs += SDM( workspace, workspace->H, 
-	// workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
+    //matvecs = SDM( workspace, workspace->H, 
+    // workspace->b_s, control->q_err, workspace->s[0], out_control->log ) + 1;
+    //matvecs += SDM( workspace, workspace->H, 
+    // workspace->b_t, control->q_err, workspace->t[0], out_control->log ) + 1;
 
-	//fprintf (stderr, " GMRES done with iterations %d \n", matvecs );
+    //fprintf (stderr, " GMRES done with iterations %d \n", matvecs );
 
-	data->timing.matvecs += matvecs;
+    data->timing.matvecs += matvecs;
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "linsolve-" );
+    fprintf( stderr, "linsolve-" );
 #endif
 
-	Calculate_Charges( system, workspace );
-	//fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", 
-	//   data->step, 
-	//   workspace->s[0][0], workspace->t[0][0], 
-	//   workspace->s[0][1], workspace->t[0][1], 
-	//   workspace->s[0][2], workspace->t[0][2] );
-	// if( data->step == control->nsteps )
-	//Print_Charges( system, control, workspace, data->step );
+    Calculate_Charges( system, workspace );
+    //fprintf( stderr, "%d %.9f %.9f %.9f %.9f %.9f %.9f\n", 
+    //   data->step, 
+    //   workspace->s[0][0], workspace->t[0][0], 
+    //   workspace->s[0][1], workspace->t[0][1], 
+    //   workspace->s[0][2], workspace->t[0][2] );
+    // if( data->step == control->nsteps )
+    //Print_Charges( system, control, workspace, data->step );
 }
 
 void Cuda_QEq( reax_system *system, control_params *control, simulation_data *data, 
-		static_storage *workspace, list *far_nbrs, 
-		output_controls *out_control )
+        static_storage *workspace, list *far_nbrs, 
+        output_controls *out_control )
 {
-	int matvecs = 0;
-	real t_start, t_elapsed;
+    int matvecs = 0;
+    real t_start, t_elapsed;
 
 #ifdef __DEBUG_CUDA__
-	t_start = Get_Time ();
+    t_start = Get_Time ();
 #endif
 
-	/*
-	//Cuda_Init_MatVec( system, control, data, workspace, far_nbrs );
+    /*
+    //Cuda_Init_MatVec( system, control, data, workspace, far_nbrs );
 
-	Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
-	( dev_workspace->H );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Cuda_Sort_Matrix_Rows <<< BLOCKS, BLOCK_SIZE >>>
+    ( dev_workspace->H );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	t_elapsed = Get_Timing_Info (t_start);
-	fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed);
-	 */
-	Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>>
-		(*dev_workspace, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    t_elapsed = Get_Timing_Info (t_start);
+    fprintf (stderr, "Sorting done...tming --> %f \n", t_elapsed);
+     */
+    Init_MatVec_Postprocess <<< BLOCKS, BLOCK_SIZE >>>
+        (*dev_workspace, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-	t_elapsed = Get_Timing_Info (t_start);
-	fprintf (stderr, "Done with post processing of init_matvec --> %d  with time ---> %f \n", cudaGetLastError (), t_elapsed);
+    t_elapsed = Get_Timing_Info (t_start);
+    fprintf (stderr, "Done with post processing of init_matvec --> %d  with time ---> %f \n", cudaGetLastError (), t_elapsed);
 #endif
 
-	//Here goes the GMRES part of the program ()
-	//#ifdef __DEBUG_CUDA__
-	t_start = Get_Time ();
-	//#endif
+    //Here goes the GMRES part of the program ()
+    //#ifdef __DEBUG_CUDA__
+    t_start = Get_Time ();
+    //#endif
 
-	//matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
-	//matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
+    //matvecs = Cuda_GMRES( dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
+    //matvecs += Cuda_GMRES( dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
 
-	matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
-	matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
+    matvecs = Cublas_GMRES( system, dev_workspace, dev_workspace->b_s, control->q_err, dev_workspace->s );
+    matvecs += Cublas_GMRES( system, dev_workspace, dev_workspace->b_t, control->q_err, dev_workspace->t );
 
-	d_timing.matvecs += matvecs;
+    d_timing.matvecs += matvecs;
 
 #ifdef __DEBUG_CUDA__
-	t_elapsed = Get_Timing_Info ( t_start );
-	fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
+    t_elapsed = Get_Timing_Info ( t_start );
+    fprintf (stderr, " Cuda_GMRES done with iterations %d with timing ---> %f \n", matvecs, t_elapsed );
 #endif
 
-	//Here cuda calculate charges
-	Cuda_Calculate_Charges (system, workspace);
+    //Here cuda calculate charges
+    Cuda_Calculate_Charges (system, workspace);
 }
diff --git a/PuReMD-GPU/src/allocate.cu b/PuReMD-GPU/src/allocate.cu
index 3de11aa4..37b80693 100644
--- a/PuReMD-GPU/src/allocate.cu
+++ b/PuReMD-GPU/src/allocate.cu
@@ -26,480 +26,480 @@
 
 void Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
-	Delete_List( far_nbrs );
-	if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){
-		fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-		exit( INIT_ERR );
-	}
+    Delete_List( far_nbrs );
+    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs )){
+        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+        exit( INIT_ERR );
+    }
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
-			num_intrs, far_nbrs->num_intrs );  
-	fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-			num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
+    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
+            num_intrs, far_nbrs->num_intrs );  
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
+            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
 #endif
 }
 
 void Cuda_Reallocate_Neighbor_List( list *far_nbrs, int n, int num_intrs )
 {
-	Delete_List( far_nbrs, TYP_DEVICE );
-	if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE )){
-		fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-		exit( INIT_ERR );
-	}
+    Delete_List( far_nbrs, TYP_DEVICE );
+    if(!Make_List( n, num_intrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE )){
+        fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+        exit( INIT_ERR );
+    }
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
-			num_intrs, far_nbrs->num_intrs );  
-	fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-			num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
+    fprintf( stderr, "num_far = %d, far_nbrs = %d -> reallocating!\n",
+            num_intrs, far_nbrs->num_intrs );  
+    fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
+            num_intrs * sizeof(far_neighbor_data) / (1024*1024) );
 #endif
 }
 
 
 int Allocate_Matrix( sparse_matrix *H, int n, int m )
 {
-	H->n = n;
-	H->m = m;
-	if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
-		return 0;
+    H->n = n;
+    H->m = m;
+    if( (H->start = (int*) malloc(sizeof(int) * n+1)) == NULL )
+        return 0;
 
-	if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
-		return 0;
+    if( (H->end = (int*) malloc(sizeof(int) * n+1)) == NULL )
+        return 0;
 
-	if( (H->entries = 
-				(sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
-		return 0;
+    if( (H->entries = 
+                (sparse_matrix_entry*) malloc(sizeof(sparse_matrix_entry)*m)) == NULL )
+        return 0;
 
-	return 1;
+    return 1;
 }
 
 int Cuda_Allocate_Matrix( sparse_matrix *H, int n, int m )
 {
-	H->n = n;
-	H->m = m;
+    H->n = n;
+    H->m = m;
 
-	cuda_malloc ((void **) &H->start, INT_SIZE * (n+1), 0, RES_SPARSE_MATRIX_INDEX );
-	cuda_malloc ((void **) &H->end, INT_SIZE *(n+1), 0, RES_SPARSE_MATRIX_INDEX );
-	cuda_malloc ((void **) &H->entries, SPARSE_MATRIX_ENTRY_SIZE * m, 0, RES_SPARSE_MATRIX_ENTRY );
+    cuda_malloc ((void **) &H->start, INT_SIZE * (n+1), 0, RES_SPARSE_MATRIX_INDEX );
+    cuda_malloc ((void **) &H->end, INT_SIZE *(n+1), 0, RES_SPARSE_MATRIX_INDEX );
+    cuda_malloc ((void **) &H->entries, SPARSE_MATRIX_ENTRY_SIZE * m, 0, RES_SPARSE_MATRIX_ENTRY );
 
-	return 1;
+    return 1;
 }
 
 
 void Deallocate_Matrix( sparse_matrix *H )
 {
-	free(H->start);
-	free(H->entries);
-	free(H->end);
+    free(H->start);
+    free(H->entries);
+    free(H->end);
 }
 
 void Cuda_Deallocate_Matrix( sparse_matrix *H )
 {
-	cuda_free(H->start, RES_SPARSE_MATRIX_INDEX);
-	cuda_free(H->end, RES_SPARSE_MATRIX_INDEX);
-	cuda_free(H->entries, RES_SPARSE_MATRIX_ENTRY);
+    cuda_free(H->start, RES_SPARSE_MATRIX_INDEX);
+    cuda_free(H->end, RES_SPARSE_MATRIX_INDEX);
+    cuda_free(H->entries, RES_SPARSE_MATRIX_ENTRY);
 
-	H->start = NULL;
-	H->end = NULL;
-	H->entries = NULL;
+    H->start = NULL;
+    H->end = NULL;
+    H->entries = NULL;
 }
 
 
 int Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
 {
-	Deallocate_Matrix( H );
-	if( !Allocate_Matrix( H, n, m ) ) {
-		fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
-		exit( 1 );
-	}
+    Deallocate_Matrix( H );
+    if( !Allocate_Matrix( H, n, m ) ) {
+        fprintf(stderr, "not enough space for %s matrix. terminating!\n", name);
+        exit( 1 );
+    }
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
-			name, n, m );
-	fprintf( stderr, "memory allocated: %s = %ldMB\n", 
-			name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
+    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
+            name, n, m );
+    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
+            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
 #endif
-	return 1;
+    return 1;
 }
 
 int Cuda_Reallocate_Matrix( sparse_matrix *H, int n, int m, char *name )
 {
-	Cuda_Deallocate_Matrix( H );
+    Cuda_Deallocate_Matrix( H );
 
-	if( !Cuda_Allocate_Matrix( H, n, m ) ) {
-		fprintf(stderr, "not enough space for %s matrix on GPU . terminating!\n", name);
-		exit( 1 );
-	}
+    if( !Cuda_Allocate_Matrix( H, n, m ) ) {
+        fprintf(stderr, "not enough space for %s matrix on GPU . terminating!\n", name);
+        exit( 1 );
+    }
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
-			name, n, m );
-	fprintf( stderr, "memory allocated: %s = %ldMB\n", 
-			name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
+    fprintf( stderr, "reallocating %s matrix, n = %d, m = %d\n",
+            name, n, m );
+    fprintf( stderr, "memory allocated: %s = %ldMB\n", 
+            name, m * sizeof(sparse_matrix_entry) / (1024*1024) );
 #endif
-	return 1;
+    return 1;
 }
 
 
 int Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, 
-		list *hbonds )
+        list *hbonds )
 {
-	int i, num_hbonds;
-
-	num_hbonds = 0;
-	/* find starting indexes for each H and the total number of hbonds */
-	for( i = 1; i < n; ++i )
-		hb_top[i] += hb_top[i-1];
-	num_hbonds = hb_top[n-1];
-
-	if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) {
-		fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-		exit( INIT_ERR );
-	}
-
-	for( i = 0; i < n; ++i )
-		if( h_index[i] == 0 ){
-			Set_Start_Index( 0, 0, hbonds ); 
-			Set_End_Index( 0, 0, hbonds ); 
-		}
-		else if( h_index[i] > 0 ){
-			Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
-			Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
-		}
+    int i, num_hbonds;
+
+    num_hbonds = 0;
+    /* find starting indexes for each H and the total number of hbonds */
+    for( i = 1; i < n; ++i )
+        hb_top[i] += hb_top[i-1];
+    num_hbonds = hb_top[n-1];
+
+    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds ) ) {
+        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    for( i = 0; i < n; ++i )
+        if( h_index[i] == 0 ){
+            Set_Start_Index( 0, 0, hbonds ); 
+            Set_End_Index( 0, 0, hbonds ); 
+        }
+        else if( h_index[i] > 0 ){
+            Set_Start_Index( h_index[i], hb_top[i-1], hbonds ); 
+            Set_End_Index( h_index[i], hb_top[i-1], hbonds ); 
+        }
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
-	fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-			num_hbonds * sizeof(hbond_data) / (1024*1024) );
+    fprintf( stderr, "allocating hbonds - num_hbonds: %d\n", num_hbonds );
+    fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
+            num_hbonds * sizeof(hbond_data) / (1024*1024) );
 #endif
-	return 1;
+    return 1;
 }
 
 GLOBAL void Init_HBond_Indexes ( int *h_index, int *hb_top, list hbonds, int N )
 {
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if (index >= N) return;
-
-	if( h_index[index] == 0 ){
-		Set_Start_Index( 0, 0, &hbonds ); 
-		Set_End_Index( 0, 0, &hbonds ); 
-	}
-	else if( h_index[index] > 0 ){
-		Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); 
-		Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); 
-	}
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (index >= N) return;
+
+    if( h_index[index] == 0 ){
+        Set_Start_Index( 0, 0, &hbonds ); 
+        Set_End_Index( 0, 0, &hbonds ); 
+    }
+    else if( h_index[index] > 0 ){
+        Set_Start_Index( h_index[index], hb_top[index-1], &hbonds ); 
+        Set_End_Index( h_index[index], hb_top[index-1], &hbonds ); 
+    }
 }
 
 int Cuda_Allocate_HBond_List( int n, int num_h, int *h_index, int *hb_top, list *hbonds )
 {
-	int i, num_hbonds;
-	int blocks, block_size;
-	int *d_hb_top;
-	num_hbonds = 0;
-
-	/* find starting indexes for each H and the total number of hbonds */
-	for( i = 1; i < n; ++i )
-		hb_top[i] += hb_top[i-1];
-	num_hbonds = hb_top[n-1];
-
-	if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds , TYP_DEVICE) ) {
-		fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
-		exit( INIT_ERR );
-	}
-
-	//cuda_malloc ((void **) &d_hb_top, INT_SIZE * (n), 1, __LINE__);
-	d_hb_top = (int *) scratch;
-	cuda_memset ( d_hb_top, 0, INT_SIZE * n, RES_SCRATCH );
-	copy_host_device (hb_top, (d_hb_top), INT_SIZE * n, cudaMemcpyHostToDevice, __LINE__);
-
-	Init_HBond_Indexes <<< BLOCKS, BLOCK_SIZE >>>
-		( h_index, d_hb_top, *hbonds, n);
-	cudaThreadSynchronize ();
+    int i, num_hbonds;
+    int blocks, block_size;
+    int *d_hb_top;
+    num_hbonds = 0;
+
+    /* find starting indexes for each H and the total number of hbonds */
+    for( i = 1; i < n; ++i )
+        hb_top[i] += hb_top[i-1];
+    num_hbonds = hb_top[n-1];
+
+    if( !Make_List(num_h, num_hbonds, TYP_HBOND, hbonds , TYP_DEVICE) ) {
+        fprintf( stderr, "not enough space for hbonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    //cuda_malloc ((void **) &d_hb_top, INT_SIZE * (n), 1, __LINE__);
+    d_hb_top = (int *) scratch;
+    cuda_memset ( d_hb_top, 0, INT_SIZE * n, RES_SCRATCH );
+    copy_host_device (hb_top, (d_hb_top), INT_SIZE * n, cudaMemcpyHostToDevice, __LINE__);
+
+    Init_HBond_Indexes <<< BLOCKS, BLOCK_SIZE >>>
+        ( h_index, d_hb_top, *hbonds, n);
+    cudaThreadSynchronize ();
 
 #ifdef __DEBUG_CUDA__
-	fprintf( stderr, "Done with allocating hbonds - num_hbonds: %d\n", num_hbonds );
+    fprintf( stderr, "Done with allocating hbonds - num_hbonds: %d\n", num_hbonds );
 #endif
 
-	return 1;
+    return 1;
 }
 
 int Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 {
-	int i;
-	int *hb_top;
+    int i;
+    int *hb_top;
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating hbonds\n" );
+    fprintf( stderr, "reallocating hbonds\n" );
 #endif
-	hb_top = (int *)calloc( n, sizeof(int) );
-	for( i = 0; i < n; ++i )
-		if( h_index[i] >= 0 )
-			hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
+    hb_top = (int *)calloc( n, sizeof(int) );
+    for( i = 0; i < n; ++i )
+        if( h_index[i] >= 0 )
+            hb_top[i] = MAX(Num_Entries(h_index[i],hbonds)*SAFE_HBONDS, MIN_HBONDS);
 
-	Delete_List( hbonds );
+    Delete_List( hbonds );
 
-	Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
+    Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
 
-	free( hb_top );
+    free( hb_top );
 
-	return 1;
+    return 1;
 }
 
 int Cuda_Reallocate_HBonds_List(  int n, int num_h, int *h_index, list *hbonds )
 {
-	int i;
-	int *hb_top;
-	int *hb_start;
-	int *hb_end;
+    int i;
+    int *hb_top;
+    int *hb_start;
+    int *hb_end;
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating hbonds\n" );
+    fprintf( stderr, "reallocating hbonds\n" );
 #endif
-	hb_top = (int *)calloc( n, sizeof(int) );
-	hb_start = (int *) calloc (hbonds->n, sizeof (int));
-	hb_end = (int *) calloc (hbonds->n, sizeof (int));
+    hb_top = (int *)calloc( n, sizeof(int) );
+    hb_start = (int *) calloc (hbonds->n, sizeof (int));
+    hb_end = (int *) calloc (hbonds->n, sizeof (int));
 
-	copy_host_device (hb_start, hbonds->index, sizeof (int) * hbonds->n, 
-			cudaMemcpyDeviceToHost, LIST_INDEX);
-	copy_host_device (hb_end , hbonds->end_index, sizeof (int) * hbonds->n, 
-			cudaMemcpyDeviceToHost, LIST_END_INDEX);
+    copy_host_device (hb_start, hbonds->index, sizeof (int) * hbonds->n, 
+            cudaMemcpyDeviceToHost, LIST_INDEX);
+    copy_host_device (hb_end , hbonds->end_index, sizeof (int) * hbonds->n, 
+            cudaMemcpyDeviceToHost, LIST_END_INDEX);
 
-	for( i = 0; i < n; ++i )
-		//if( h_index[i] >= 0 )
-		hb_top[i] = MAX((hb_end [i] - hb_start[i])*SAFE_HBONDS, MIN_HBONDS);
+    for( i = 0; i < n; ++i )
+        //if( h_index[i] >= 0 )
+        hb_top[i] = MAX((hb_end [i] - hb_start[i])*SAFE_HBONDS, MIN_HBONDS);
 
-	Delete_List( hbonds, TYP_DEVICE );
+    Delete_List( hbonds, TYP_DEVICE );
 
-	Cuda_Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
+    Cuda_Allocate_HBond_List( n, num_h, h_index, hb_top, hbonds );
 
-	free( hb_top );
-	free( hb_start );
-	free( hb_end );
+    free( hb_top );
+    free( hb_start );
+    free( hb_end );
 
-	return 1;
+    return 1;
 }
 
 GLOBAL void Init_Bond_Indexes ( int *b_top, list bonds, int N )
 {
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if (index >= N) return;
-
-	if( index == 0 ){
-		Set_Start_Index( 0, 0, &bonds ); 
-		Set_End_Index( 0, 0, &bonds ); 
-	}
-	else if( index > 0 ){
-		Set_Start_Index( index, b_top[index-1], &bonds ); 
-		Set_End_Index( index, b_top[index-1], &bonds ); 
-	}
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (index >= N) return;
+
+    if( index == 0 ){
+        Set_Start_Index( 0, 0, &bonds ); 
+        Set_End_Index( 0, 0, &bonds ); 
+    }
+    else if( index > 0 ){
+        Set_Start_Index( index, b_top[index-1], &bonds ); 
+        Set_End_Index( index, b_top[index-1], &bonds ); 
+    }
 }
 
 int Cuda_Allocate_Bond_List( int num_b, int *b_top, list *bonds )
 {
-	int i, num_bonds;
-	int *d_b_top = (int *) scratch;
-	num_bonds = 0;
-
-	/* find starting indexes for each H and the total number of hbonds */
-	for( i = 1; i < num_b; ++i )
-		b_top[i] += b_top[i-1];
-	num_bonds = b_top[num_b-1];
-
-	if( !Make_List(num_b, num_bonds, TYP_BOND, bonds, TYP_DEVICE) ) {
-		fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-		exit( INIT_ERR );
-	}
-
-	//cuda_malloc ((void **) &d_b_top, INT_SIZE * num_b, 1, __LINE__);
-	cuda_memset ( d_b_top, 0, INT_SIZE * num_b, RES_SCRATCH );
-	copy_host_device (b_top, d_b_top, INT_SIZE * num_b, cudaMemcpyHostToDevice, __LINE__);
-
-	Init_Bond_Indexes <<< BLOCKS, BLOCK_SIZE>>>
-		( d_b_top, *bonds, num_b);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	return 1;
+    int i, num_bonds;
+    int *d_b_top = (int *) scratch;
+    num_bonds = 0;
+
+    /* find starting indexes for each H and the total number of hbonds */
+    for( i = 1; i < num_b; ++i )
+        b_top[i] += b_top[i-1];
+    num_bonds = b_top[num_b-1];
+
+    if( !Make_List(num_b, num_bonds, TYP_BOND, bonds, TYP_DEVICE) ) {
+        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    //cuda_malloc ((void **) &d_b_top, INT_SIZE * num_b, 1, __LINE__);
+    cuda_memset ( d_b_top, 0, INT_SIZE * num_b, RES_SCRATCH );
+    copy_host_device (b_top, d_b_top, INT_SIZE * num_b, cudaMemcpyHostToDevice, __LINE__);
+
+    Init_Bond_Indexes <<< BLOCKS, BLOCK_SIZE>>>
+        ( d_b_top, *bonds, num_b);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    return 1;
 }
 
 
 int Allocate_Bond_List( int n, int *bond_top, list *bonds )
 {
-	int i, num_bonds;
-
-	num_bonds = 0;
-	/* find starting indexes for each atom and the total number of bonds */
-	for( i = 1; i < n; ++i )
-		bond_top[i] += bond_top[i-1];
-	num_bonds = bond_top[n-1];
-
-	if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) {
-		fprintf( stderr, "not enough space for bonds list. terminating!\n" );
-		exit( INIT_ERR );
-	}
-
-	Set_Start_Index( 0, 0, bonds ); 
-	Set_End_Index( 0, 0, bonds ); 
-	for( i = 1; i < n; ++i ) {
-		Set_Start_Index( i, bond_top[i-1], bonds ); 
-		Set_End_Index( i, bond_top[i-1], bonds ); 
-	}
+    int i, num_bonds;
+
+    num_bonds = 0;
+    /* find starting indexes for each atom and the total number of bonds */
+    for( i = 1; i < n; ++i )
+        bond_top[i] += bond_top[i-1];
+    num_bonds = bond_top[n-1];
+
+    if( !Make_List(n, num_bonds, TYP_BOND, bonds ) ) {
+        fprintf( stderr, "not enough space for bonds list. terminating!\n" );
+        exit( INIT_ERR );
+    }
+
+    Set_Start_Index( 0, 0, bonds ); 
+    Set_End_Index( 0, 0, bonds ); 
+    for( i = 1; i < n; ++i ) {
+        Set_Start_Index( i, bond_top[i-1], bonds ); 
+        Set_End_Index( i, bond_top[i-1], bonds ); 
+    }
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
-	fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-			num_bonds * sizeof(bond_data) / (1024*1024) );
+    fprintf( stderr, "allocating bonds - num_bonds: %d\n", num_bonds );
+    fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
+            num_bonds * sizeof(bond_data) / (1024*1024) );
 #endif
-	return 1;
+    return 1;
 }
 
 
 int Reallocate_Bonds_List( int n, list *bonds, int *num_bonds, int *est_3body )
 {
-	int i;
-	int *bond_top;
+    int i;
+    int *bond_top;
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating bonds\n" );
+    fprintf( stderr, "reallocating bonds\n" );
 #endif
-	bond_top = (int *)calloc( n, sizeof(int) );
-	*est_3body = 0;
-	for( i = 0; i < n; ++i ){
-		*est_3body += SQR( Num_Entries( i, bonds ) );
-		bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
-	}
+    bond_top = (int *)calloc( n, sizeof(int) );
+    *est_3body = 0;
+    for( i = 0; i < n; ++i ){
+        *est_3body += SQR( Num_Entries( i, bonds ) );
+        bond_top[i] = MAX( Num_Entries( i, bonds ) * 2, MIN_BONDS );
+    }
 
-	Delete_List( bonds );
+    Delete_List( bonds );
 
-	Allocate_Bond_List( n, bond_top, bonds );
-	*num_bonds = bond_top[n-1];
+    Allocate_Bond_List( n, bond_top, bonds );
+    *num_bonds = bond_top[n-1];
 
-	free( bond_top );
+    free( bond_top );
 
-	return 1;
+    return 1;
 }
 
 void GLOBAL Calculate_Bond_Indexes (int *bond_top, list bonds, int *per_block_results, int n)
 {
-	extern __shared__ int sh_input[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = SQR (Num_Entries( i, &bonds ) );
-		bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS );
-	}
-	sh_input[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{   
-			sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
-		}   
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sh_input[0];
-	}
+    extern __shared__ int sh_input[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = SQR (Num_Entries( i, &bonds ) );
+        bond_top[i] = MAX( Num_Entries( i, &bonds ) * 2, MIN_BONDS );
+    }
+    sh_input[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {   
+            sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
+        }   
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sh_input[0];
+    }
 }
 
 
 int Cuda_Reallocate_Bonds_List( int n, list *bonds, int *num_3body )
 {
-	int i;
-	int *b_top;
-	int *b_start;
-	int *b_end;
+    int i;
+    int *b_top;
+    int *b_start;
+    int *b_end;
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating bonds\n" );
+    fprintf( stderr, "reallocating bonds\n" );
 #endif
-	b_top = (int *)calloc( n, sizeof(int) );
-	b_start = (int *) calloc (bonds->n, sizeof (int));
-	b_end = (int *) calloc (bonds->n, sizeof (int));
+    b_top = (int *)calloc( n, sizeof(int) );
+    b_start = (int *) calloc (bonds->n, sizeof (int));
+    b_end = (int *) calloc (bonds->n, sizeof (int));
 
-	copy_host_device (b_start, bonds->index, sizeof (int) * bonds->n, 
-			cudaMemcpyDeviceToHost, LIST_INDEX);
-	copy_host_device (b_end , bonds->end_index, sizeof (int) * bonds->n, 
-			cudaMemcpyDeviceToHost, LIST_END_INDEX);
+    copy_host_device (b_start, bonds->index, sizeof (int) * bonds->n, 
+            cudaMemcpyDeviceToHost, LIST_INDEX);
+    copy_host_device (b_end , bonds->end_index, sizeof (int) * bonds->n, 
+            cudaMemcpyDeviceToHost, LIST_END_INDEX);
 
-	for( i = 0; i < n; ++i ) {
-		*num_3body += SQR (b_end[i] - b_start[i]);
-		b_top[i] = MAX((b_end [i] - b_start[i])*2, MIN_BONDS);
-	}
+    for( i = 0; i < n; ++i ) {
+        *num_3body += SQR (b_end[i] - b_start[i]);
+        b_top[i] = MAX((b_end [i] - b_start[i])*2, MIN_BONDS);
+    }
 
-	Delete_List( bonds, TYP_DEVICE );
+    Delete_List( bonds, TYP_DEVICE );
 
-	Cuda_Allocate_Bond_List(n, b_top, bonds );
+    Cuda_Allocate_Bond_List(n, b_top, bonds );
 
-	i = b_top[ n-1 ];
+    i = b_top[ n-1 ];
 
-	free( b_top );
-	free( b_start );
-	free( b_end );
+    free( b_top );
+    free( b_start );
+    free( b_end );
 
-	return i;
+    return i;
 }
 
 int Cuda_Reallocate_ThreeBody_List ( list *thblist, int count )
 {
-	int i;
-	int thb_total = 0;
-	int *thb_start;
-	int *thb_end;
+    int i;
+    int thb_total = 0;
+    int *thb_start;
+    int *thb_end;
 
-	int new_total, new_count;
+    int new_total, new_count;
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "reallocating bonds\n" );
+    fprintf( stderr, "reallocating bonds\n" );
 #endif
-	thb_start = (int *) calloc (thblist->n, sizeof (int));
-	thb_end = (int *) calloc (thblist->n, sizeof (int));
+    thb_start = (int *) calloc (thblist->n, sizeof (int));
+    thb_end = (int *) calloc (thblist->n, sizeof (int));
 
-	copy_host_device (thb_start, thblist->index, sizeof (int) * thblist->n, 
-			cudaMemcpyDeviceToHost, LIST_INDEX);
-	copy_host_device (thb_end , thblist->end_index, sizeof (int) * thblist->n, 
-			cudaMemcpyDeviceToHost, LIST_END_INDEX);
+    copy_host_device (thb_start, thblist->index, sizeof (int) * thblist->n, 
+            cudaMemcpyDeviceToHost, LIST_INDEX);
+    copy_host_device (thb_end , thblist->end_index, sizeof (int) * thblist->n, 
+            cudaMemcpyDeviceToHost, LIST_END_INDEX);
 
-	for( i = 0; i < thblist->n; ++i )
-		thb_total += (thb_end[i] - thb_start[i]) * SAFE_ZONE;
+    for( i = 0; i < thblist->n; ++i )
+        thb_total += (thb_end[i] - thb_start[i]) * SAFE_ZONE;
 
-	//new_total = MAX( thb_total, thblist->num_intrs );
-	//new_count = MAX( num_3body, thblist->n );
+    //new_total = MAX( thb_total, thblist->num_intrs );
+    //new_count = MAX( num_3body, thblist->n );
 
-	new_total = thb_total;
-	new_count = count;
+    new_total = thb_total;
+    new_count = count;
 
-	Delete_List( thblist, TYP_DEVICE );
+    Delete_List( thblist, TYP_DEVICE );
 
-	/*Allocate the list */
-	if(!Make_List( new_count, new_total, TYP_THREE_BODY, thblist, TYP_DEVICE )){
-		fprintf(stderr, "Problem in reallocating three-body list. Terminating!\n");
-		exit( INIT_ERR );
-	}
+    /*Allocate the list */
+    if(!Make_List( new_count, new_total, TYP_THREE_BODY, thblist, TYP_DEVICE )){
+        fprintf(stderr, "Problem in reallocating three-body list. Terminating!\n");
+        exit( INIT_ERR );
+    }
 
 #if defined(__CUDA_MEM__)
-	fprintf( stderr, "reallocating 3 bodies - \n" );
-	fprintf( stderr, "num_bonds: %d ", new_count);
-	fprintf( stderr, "num_3body: %d ", new_total);
-	fprintf( stderr, "3body memory: %ldMB\n", 
-			new_total * sizeof(three_body_interaction_data)/
-			(1024*1024) );
+    fprintf( stderr, "reallocating 3 bodies - \n" );
+    fprintf( stderr, "num_bonds: %d ", new_count);
+    fprintf( stderr, "num_3body: %d ", new_total);
+    fprintf( stderr, "3body memory: %ldMB\n", 
+            new_total * sizeof(three_body_interaction_data)/
+            (1024*1024) );
 #endif
 
-	free( thb_start );
-	free( thb_end );
+    free( thb_start );
+    free( thb_end );
 
-	return 1;
+    return 1;
 }
 
 
@@ -543,184 +543,184 @@ cuda_memset (d_bond_top, 0, (n+BLOCKS_POW_2+1) * INT_SIZE, RES_SCRATCH );
 
 
 void Reallocate( reax_system *system, static_storage *workspace, list **lists, 
-		int nbr_flag )
+        int nbr_flag )
 {
-	int num_bonds, est_3body;
-	reallocate_data *realloc;
-	grid *g;
-
-	realloc = &(workspace->realloc);
-	g = &(system->g);
-
-	if( realloc->num_far > 0 && nbr_flag ) {
-		fprintf (stderr, " Reallocating neighbors \n");
-		Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
-				system->N, realloc->num_far * SAFE_ZONE );
-		realloc->num_far = -1;
-	}
-
-	if( realloc->Htop > 0 ){
-		fprintf (stderr, " Reallocating Matrix \n");
-		Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
-		realloc->Htop = -1;
-
-		Deallocate_Matrix( &workspace->L );
-		Deallocate_Matrix( &workspace->U );
-	}
-
-	if( realloc->hbonds > 0 ){
-		fprintf (stderr, " Reallocating hbonds \n");
-		Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-				(*lists)+HBONDS );
-		realloc->hbonds = -1;
-	}
-
-	num_bonds = est_3body = -1;
-	if( realloc->bonds > 0 ){
-		fprintf (stderr, " Reallocating bonds \n");
-		Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
-		realloc->bonds = -1;
-		realloc->num_3body = MAX( realloc->num_3body, est_3body );
-	}
-
-	if( realloc->num_3body > 0 ) {
-		fprintf (stderr, " Reallocating 3Body \n");
-		Delete_List( (*lists)+THREE_BODIES );
-
-		if( num_bonds == -1 )
-			num_bonds = ((*lists)+BONDS)->num_intrs;
-		realloc->num_3body *= SAFE_ZONE;
-
-		if( !Make_List( num_bonds, realloc->num_3body,
-					TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) {
-			fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-			exit( INIT_ERR );
-		}
-		realloc->num_3body = -1;
+    int num_bonds, est_3body;
+    reallocate_data *realloc;
+    grid *g;
+
+    realloc = &(workspace->realloc);
+    g = &(system->g);
+
+    if( realloc->num_far > 0 && nbr_flag ) {
+        fprintf (stderr, " Reallocating neighbors \n");
+        Reallocate_Neighbor_List( (*lists)+FAR_NBRS, 
+                system->N, realloc->num_far * SAFE_ZONE );
+        realloc->num_far = -1;
+    }
+
+    if( realloc->Htop > 0 ){
+        fprintf (stderr, " Reallocating Matrix \n");
+        Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
+        realloc->Htop = -1;
+
+        Deallocate_Matrix( &workspace->L );
+        Deallocate_Matrix( &workspace->U );
+    }
+
+    if( realloc->hbonds > 0 ){
+        fprintf (stderr, " Reallocating hbonds \n");
+        Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
+                (*lists)+HBONDS );
+        realloc->hbonds = -1;
+    }
+
+    num_bonds = est_3body = -1;
+    if( realloc->bonds > 0 ){
+        fprintf (stderr, " Reallocating bonds \n");
+        Reallocate_Bonds_List( system->N, (*lists)+BONDS, &num_bonds, &est_3body );
+        realloc->bonds = -1;
+        realloc->num_3body = MAX( realloc->num_3body, est_3body );
+    }
+
+    if( realloc->num_3body > 0 ) {
+        fprintf (stderr, " Reallocating 3Body \n");
+        Delete_List( (*lists)+THREE_BODIES );
+
+        if( num_bonds == -1 )
+            num_bonds = ((*lists)+BONDS)->num_intrs;
+        realloc->num_3body *= SAFE_ZONE;
+
+        if( !Make_List( num_bonds, realloc->num_3body,
+                    TYP_THREE_BODY, (*lists)+THREE_BODIES ) ) {
+            fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+            exit( INIT_ERR );
+        }
+        realloc->num_3body = -1;
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "reallocating 3 bodies\n" );
-		fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
-		fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
-		fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
-				realloc->num_3body*sizeof(three_body_interaction_data)/
-				(1024*1024) );
+        fprintf( stderr, "reallocating 3 bodies\n" );
+        fprintf( stderr, "reallocated - num_bonds: %d\n", num_bonds );
+        fprintf( stderr, "reallocated - num_3body: %d\n", realloc->num_3body );
+        fprintf( stderr, "reallocated 3body memory: %ldMB\n", 
+                realloc->num_3body*sizeof(three_body_interaction_data)/
+                (1024*1024) );
 #endif
-	}
+    }
 
-	if( realloc->gcell_atoms > -1 ){
+    if( realloc->gcell_atoms > -1 ){
 #if defined(DEBUG_FOCUS)
-		fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
+        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
 #endif
 
-		free (g->atoms);
-		g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
-				sizeof (int) * workspace->realloc.gcell_atoms);
-		realloc->gcell_atoms = -1;
-	}
+        free (g->atoms);
+        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
+                sizeof (int) * workspace->realloc.gcell_atoms);
+        realloc->gcell_atoms = -1;
+    }
 }
 
 void Cuda_Reallocate( reax_system *system, static_storage *workspace, list *lists, 
-		int nbr_flag, int step )
+        int nbr_flag, int step )
 {
-	int num_bonds, est_3body;
-	int old_count = 0;
-	reallocate_data *realloc;
-	grid *g;
+    int num_bonds, est_3body;
+    int old_count = 0;
+    reallocate_data *realloc;
+    grid *g;
 
-	realloc = &(workspace->realloc);
-	g = &(system->d_g);
+    realloc = &(workspace->realloc);
+    g = &(system->d_g);
 
-	if( realloc->num_far > 0 && nbr_flag ) {
+    if( realloc->num_far > 0 && nbr_flag ) {
 
 #ifdef __CUDA_MEM__
-		fprintf (stderr, " Reallocating Neighbors: step: %d, old_count: %d new_count: %d size: %d (MB)\n", 
-				step, (dev_lists+FAR_NBRS)->num_intrs, (int)(realloc->num_far * SAFE_ZONE), 
-				(int)(sizeof (far_neighbor_data) * realloc->num_far * SAFE_ZONE)/(1024*1024));
+        fprintf (stderr, " Reallocating Neighbors: step: %d, old_count: %d new_count: %d size: %d (MB)\n", 
+                step, (dev_lists+FAR_NBRS)->num_intrs, (int)(realloc->num_far * SAFE_ZONE), 
+                (int)(sizeof (far_neighbor_data) * realloc->num_far * SAFE_ZONE)/(1024*1024));
 #endif
-		Cuda_Reallocate_Neighbor_List( lists+FAR_NBRS, 
-				system->N, realloc->num_far * SAFE_ZONE );
+        Cuda_Reallocate_Neighbor_List( lists+FAR_NBRS, 
+                system->N, realloc->num_far * SAFE_ZONE );
 
-		realloc->num_far = -1;
-		realloc->estimate_nbrs = 1;
-	}
+        realloc->num_far = -1;
+        realloc->estimate_nbrs = 1;
+    }
 
-	if( realloc->Htop > 0 ){
+    if( realloc->Htop > 0 ){
 
 #ifdef __CUDA_MEM__
-		fprintf (stderr, " Reallocating Matrix : step: %d, old_count: %d new_count: %d size: %d (MB)\n", 
-				step, dev_workspace->H.m, (int)(realloc->Htop * system->N * SAFE_ZONE), 
-				(int) (sizeof (sparse_matrix_entry) * (realloc->Htop * system->N * SAFE_ZONE))/(1024 * 1024));
+        fprintf (stderr, " Reallocating Matrix : step: %d, old_count: %d new_count: %d size: %d (MB)\n", 
+                step, dev_workspace->H.m, (int)(realloc->Htop * system->N * SAFE_ZONE), 
+                (int) (sizeof (sparse_matrix_entry) * (realloc->Htop * system->N * SAFE_ZONE))/(1024 * 1024));
 #endif
-		//Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
-		Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop * system->N * SAFE_ZONE,"H");
-		system->max_sparse_matrix_entries = realloc->Htop * SAFE_ZONE;
-		realloc->Htop = -1;
+        //Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop*SAFE_ZONE,"H");
+        Cuda_Reallocate_Matrix(&(workspace->H), system->N, realloc->Htop * system->N * SAFE_ZONE,"H");
+        system->max_sparse_matrix_entries = realloc->Htop * SAFE_ZONE;
+        realloc->Htop = -1;
 
-		/*
-		   Cuda_Deallocate_Matrix( &workspace->L );
-		   fprintf (stderr, "Done deallocating the L ower matrix \n");
-		   Cuda_Deallocate_Matrix( &workspace->U );
-		   fprintf (stderr, "Done deallocating the Upper  matrix \n");
-		 */
-	}
+        /*
+           Cuda_Deallocate_Matrix( &workspace->L );
+           fprintf (stderr, "Done deallocating the L ower matrix \n");
+           Cuda_Deallocate_Matrix( &workspace->U );
+           fprintf (stderr, "Done deallocating the Upper  matrix \n");
+         */
+    }
 
-	if( realloc->hbonds > 0 ){
+    if( realloc->hbonds > 0 ){
 
-		old_count = (dev_lists+HBONDS)->num_intrs;
+        old_count = (dev_lists+HBONDS)->num_intrs;
 
-		Cuda_Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
-				dev_lists+HBONDS );
+        Cuda_Reallocate_HBonds_List(system->N, workspace->num_H, workspace->hbond_index,
+                dev_lists+HBONDS );
 
 #ifdef __CUDA_MEM__
-		fprintf (stderr, " Reallocating HBonds: step: %d, old_count: %d, new_count: %d, size: %d (MB)\n", 
-				step, old_count,(dev_lists+HBONDS)->num_intrs, 
-				(int) sizeof (hbond_data) * (dev_lists+HBONDS)->num_intrs / (1024 * 1024));
+        fprintf (stderr, " Reallocating HBonds: step: %d, old_count: %d, new_count: %d, size: %d (MB)\n", 
+                step, old_count,(dev_lists+HBONDS)->num_intrs, 
+                (int) sizeof (hbond_data) * (dev_lists+HBONDS)->num_intrs / (1024 * 1024));
 #endif
-		realloc->hbonds = -1;
-	}
+        realloc->hbonds = -1;
+    }
 
-	num_bonds = est_3body = -1;
-	if( realloc->bonds > 0 ){
+    num_bonds = est_3body = -1;
+    if( realloc->bonds > 0 ){
 
-		old_count = (dev_lists+BONDS)->num_intrs;
-		num_bonds = Cuda_Reallocate_Bonds_List( system->N, dev_lists+BONDS, &est_3body );
+        old_count = (dev_lists+BONDS)->num_intrs;
+        num_bonds = Cuda_Reallocate_Bonds_List( system->N, dev_lists+BONDS, &est_3body );
 
 #ifdef __CUDA_MEM__
-		fprintf (stderr, " Reallocating Bonds: step: %d, old_count: %d, new_count: %d, size: %d (MB) \n", 
-				step, old_count,(dev_lists+BONDS)->num_intrs, 
-				(int) sizeof (bond_data) * (dev_lists+BONDS)->num_intrs / (1024 * 1024));
+        fprintf (stderr, " Reallocating Bonds: step: %d, old_count: %d, new_count: %d, size: %d (MB) \n", 
+                step, old_count,(dev_lists+BONDS)->num_intrs, 
+                (int) sizeof (bond_data) * (dev_lists+BONDS)->num_intrs / (1024 * 1024));
 #endif
 
-		realloc->bonds = -1;
-		realloc->num_3body = 1;//MAX( realloc->num_3body, est_3body );
-	}
+        realloc->bonds = -1;
+        realloc->num_3body = 1;//MAX( realloc->num_3body, est_3body );
+    }
 
-	/*
-	   if( realloc->num_3body > 0 ) {
+    /*
+       if( realloc->num_3body > 0 ) {
 
-	   if (num_bonds < 0)
-	   num_bonds = (dev_lists+BONDS)->num_intrs;
+       if (num_bonds < 0)
+       num_bonds = (dev_lists+BONDS)->num_intrs;
 
-	   Cuda_Reallocate_ThreeBody_List (dev_lists + THREE_BODIES, num_bonds);
-	   realloc->num_3body = -1;
-	   }
-	 */
+       Cuda_Reallocate_ThreeBody_List (dev_lists + THREE_BODIES, num_bonds);
+       realloc->num_3body = -1;
+       }
+     */
 
-	if( realloc->gcell_atoms > -1 ){
+    if( realloc->gcell_atoms > -1 ){
 #if defined(DEBUG_FOCUS)
-		fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
+        fprintf(stderr, "reallocating gcell: g->max_atoms: %d\n", g->max_atoms);
 #endif
 
 #ifdef __CUDA_MEM__
-		fprintf (stderr, "Reallocating the atoms in the grid ---> %d \n", workspace->realloc.gcell_atoms );
+        fprintf (stderr, "Reallocating the atoms in the grid ---> %d \n", workspace->realloc.gcell_atoms );
 #endif
 
-		free (g->atoms);
-		g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
-				sizeof (int) * workspace->realloc.gcell_atoms);
+        free (g->atoms);
+        g->atoms = (int *) calloc ( g->ncell[0]*g->ncell[1]*g->ncell[2],
+                sizeof (int) * workspace->realloc.gcell_atoms);
 
-		cuda_free (g->atoms, RES_GRID_ATOMS);
-		cuda_malloc ((void **) &g->atoms, INT_SIZE * workspace->realloc.gcell_atoms * g->ncell[0]*g->ncell[1]*g->ncell[2], 1, RES_GRID_ATOMS );
-		realloc->gcell_atoms = -1;
-	}
+        cuda_free (g->atoms, RES_GRID_ATOMS);
+        cuda_malloc ((void **) &g->atoms, INT_SIZE * workspace->realloc.gcell_atoms * g->ncell[0]*g->ncell[1]*g->ncell[2], 1, RES_GRID_ATOMS );
+        realloc->gcell_atoms = -1;
+    }
 }
diff --git a/PuReMD-GPU/src/bond_orders.cu b/PuReMD-GPU/src/bond_orders.cu
index e180e680..57f5baac 100644
--- a/PuReMD-GPU/src/bond_orders.cu
+++ b/PuReMD-GPU/src/bond_orders.cu
@@ -32,493 +32,493 @@
 
 inline real Cf45( real p1, real p2 )
 {
-	return  -EXP(-p2 / 2) / 
-		( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) );
+    return  -EXP(-p2 / 2) / 
+        ( SQR( EXP(-p1 / 2) + EXP(p1 / 2) ) * (EXP(-p2 / 2) + EXP(p2 / 2)) );
 }
 
 #ifdef TEST_FORCES
 void Get_dBO( reax_system *system, list **lists, 
-		int i, int pj, real C, rvec *v )
+        int i, int pj, real C, rvec *v )
 {
-	list *bonds = (*lists) + BONDS;
-	list *dBOs = (*lists) + DBO;
-	int start_pj, end_pj, k;
+    list *bonds = (*lists) + BONDS;
+    list *dBOs = (*lists) + DBO;
+    int start_pj, end_pj, k;
 
-	pj = bonds->select.bond_list[pj].dbond_index;
-	start_pj = Start_Index(pj, dBOs);
-	end_pj = End_Index(pj, dBOs);
+    pj = bonds->select.bond_list[pj].dbond_index;
+    start_pj = Start_Index(pj, dBOs);
+    end_pj = End_Index(pj, dBOs);
 
-	for( k = start_pj; k < end_pj; ++k )
-		rvec_Scale( v[dBOs->select.dbo_list[k].wrt], 
-				C, dBOs->select.dbo_list[k].dBO );  
+    for( k = start_pj; k < end_pj; ++k )
+        rvec_Scale( v[dBOs->select.dbo_list[k].wrt], 
+                C, dBOs->select.dbo_list[k].dBO );  
 }
 
 
 void Get_dBOpinpi2( reax_system *system, list **lists, 
-		int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
+        int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
 {
-	list *bonds = (*lists) + BONDS;
-	list *dBOs = (*lists) + DBO;
-	dbond_data *dbo_k;
-	int start_pj, end_pj, k;
-
-	pj = bonds->select.bond_list[pj].dbond_index;
-	start_pj = Start_Index(pj, dBOs);
-	end_pj = End_Index(pj, dBOs);
-
-	for( k = start_pj; k < end_pj; ++k ) {
-		dbo_k = &(dBOs->select.dbo_list[k]);
-		rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi );
-		rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 );
-	}
+    list *bonds = (*lists) + BONDS;
+    list *dBOs = (*lists) + DBO;
+    dbond_data *dbo_k;
+    int start_pj, end_pj, k;
+
+    pj = bonds->select.bond_list[pj].dbond_index;
+    start_pj = Start_Index(pj, dBOs);
+    end_pj = End_Index(pj, dBOs);
+
+    for( k = start_pj; k < end_pj; ++k ) {
+        dbo_k = &(dBOs->select.dbo_list[k]);
+        rvec_Scale( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi );
+        rvec_Scale( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 );
+    }
 }
 
 
 void Add_dBO( reax_system *system, list **lists, 
-		int i, int pj, real C, rvec *v )
+        int i, int pj, real C, rvec *v )
 {
-	list *bonds = (*lists) + BONDS;
-	list *dBOs = (*lists) + DBO;
-	int start_pj, end_pj, k;
+    list *bonds = (*lists) + BONDS;
+    list *dBOs = (*lists) + DBO;
+    int start_pj, end_pj, k;
 
-	pj = bonds->select.bond_list[pj].dbond_index;
-	start_pj = Start_Index(pj, dBOs);
-	end_pj = End_Index(pj, dBOs);
+    pj = bonds->select.bond_list[pj].dbond_index;
+    start_pj = Start_Index(pj, dBOs);
+    end_pj = End_Index(pj, dBOs);
 
-	//fprintf( stderr, "i=%d j=%d start=%d end=%d\n", i, pj, start_pj, end_pj );
+    //fprintf( stderr, "i=%d j=%d start=%d end=%d\n", i, pj, start_pj, end_pj );
 
-	for( k = start_pj; k < end_pj; ++k )
-		rvec_ScaledAdd( v[dBOs->select.dbo_list[k].wrt], 
-				C, dBOs->select.dbo_list[k].dBO );
+    for( k = start_pj; k < end_pj; ++k )
+        rvec_ScaledAdd( v[dBOs->select.dbo_list[k].wrt], 
+                C, dBOs->select.dbo_list[k].dBO );
 
 }
 
 
 void Add_dBOpinpi2( reax_system *system, list **lists, 
-		int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
+        int i, int pj, real Cpi, real Cpi2, rvec *vpi, rvec *vpi2 )
 {
-	list *bonds = (*lists) + BONDS;
-	list *dBOs = (*lists) + DBO;
-	dbond_data *dbo_k;
-	int start_pj, end_pj, k;
-
-	pj = bonds->select.bond_list[pj].dbond_index;
-	start_pj = Start_Index(pj, dBOs);
-	end_pj = End_Index(pj, dBOs);
-
-	for( k = start_pj; k < end_pj; ++k )
-	{
-		dbo_k = &(dBOs->select.dbo_list[k]);
-		rvec_ScaledAdd( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi );
-		rvec_ScaledAdd( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 );
-	}
+    list *bonds = (*lists) + BONDS;
+    list *dBOs = (*lists) + DBO;
+    dbond_data *dbo_k;
+    int start_pj, end_pj, k;
+
+    pj = bonds->select.bond_list[pj].dbond_index;
+    start_pj = Start_Index(pj, dBOs);
+    end_pj = End_Index(pj, dBOs);
+
+    for( k = start_pj; k < end_pj; ++k )
+    {
+        dbo_k = &(dBOs->select.dbo_list[k]);
+        rvec_ScaledAdd( vpi[dbo_k->wrt], Cpi, dbo_k->dBOpi );
+        rvec_ScaledAdd( vpi2[dbo_k->wrt], Cpi2, dbo_k->dBOpi2 );
+    }
 }
 
 
 void Add_dBO_to_Forces( reax_system *system, list **lists, 
-		int i, int pj, real C )
+        int i, int pj, real C )
 {
-	list *bonds = (*lists) + BONDS;
-	list *dBOs = (*lists) + DBO;
-	int start_pj, end_pj, k;
+    list *bonds = (*lists) + BONDS;
+    list *dBOs = (*lists) + DBO;
+    int start_pj, end_pj, k;
 
-	pj = bonds->select.bond_list[pj].dbond_index;
-	start_pj = Start_Index(pj, dBOs);
-	end_pj = End_Index(pj, dBOs);
+    pj = bonds->select.bond_list[pj].dbond_index;
+    start_pj = Start_Index(pj, dBOs);
+    end_pj = End_Index(pj, dBOs);
 
-	for( k = start_pj; k < end_pj; ++k )
-		rvec_ScaledAdd( system->atoms[dBOs->select.dbo_list[k].wrt].f, 
-				C, dBOs->select.dbo_list[k].dBO );
+    for( k = start_pj; k < end_pj; ++k )
+        rvec_ScaledAdd( system->atoms[dBOs->select.dbo_list[k].wrt].f, 
+                C, dBOs->select.dbo_list[k].dBO );
 }
 
 
 void Add_dBOpinpi2_to_Forces( reax_system *system, list **lists, 
-		int i, int pj, real Cpi, real Cpi2 )
+        int i, int pj, real Cpi, real Cpi2 )
 {
-	list *bonds = (*lists) + BONDS;
-	list *dBOs = (*lists) + DBO;
-	dbond_data *dbo_k;
-	int start_pj, end_pj, k;
-
-	pj = bonds->select.bond_list[pj].dbond_index;
-	start_pj = Start_Index(pj, dBOs);
-	end_pj = End_Index(pj, dBOs);
-
-	for( k = start_pj; k < end_pj; ++k )
-	{
-		dbo_k = &(dBOs->select.dbo_list[k]);
-		rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi, dbo_k->dBOpi );
-		rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi2, dbo_k->dBOpi2 );
-	}
+    list *bonds = (*lists) + BONDS;
+    list *dBOs = (*lists) + DBO;
+    dbond_data *dbo_k;
+    int start_pj, end_pj, k;
+
+    pj = bonds->select.bond_list[pj].dbond_index;
+    start_pj = Start_Index(pj, dBOs);
+    end_pj = End_Index(pj, dBOs);
+
+    for( k = start_pj; k < end_pj; ++k )
+    {
+        dbo_k = &(dBOs->select.dbo_list[k]);
+        rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi, dbo_k->dBOpi );
+        rvec_ScaledAdd( system->atoms[dbo_k->wrt].f, Cpi2, dbo_k->dBOpi2 );
+    }
 }
 
 
 void Add_dDelta( reax_system *system, list **lists, int i, real C, rvec *v )
 {
-	list *dDeltas = &((*lists)[DDELTA]);
-	int start = Start_Index(i, dDeltas);
-	int end = End_Index(i, dDeltas);
-	int k;
-
-	for( k = start; k < end; ++k )
-		rvec_ScaledAdd( v[dDeltas->select.dDelta_list[k].wrt], 
-				C, dDeltas->select.dDelta_list[k].dVal );
+    list *dDeltas = &((*lists)[DDELTA]);
+    int start = Start_Index(i, dDeltas);
+    int end = End_Index(i, dDeltas);
+    int k;
+
+    for( k = start; k < end; ++k )
+        rvec_ScaledAdd( v[dDeltas->select.dDelta_list[k].wrt], 
+                C, dDeltas->select.dDelta_list[k].dVal );
 }
 
 
 void Add_dDelta_to_Forces( reax_system *system, list **lists, int i, real C )
 {
-	list *dDeltas = &((*lists)[DDELTA]);
-	int start = Start_Index(i, dDeltas);
-	int end = End_Index(i, dDeltas);
-	int k;
-
-	for( k = start; k < end; ++k )
-		rvec_ScaledAdd( system->atoms[dDeltas->select.dDelta_list[k].wrt].f, 
-				C, dDeltas->select.dDelta_list[k].dVal );
+    list *dDeltas = &((*lists)[DDELTA]);
+    int start = Start_Index(i, dDeltas);
+    int end = End_Index(i, dDeltas);
+    int k;
+
+    for( k = start; k < end; ++k )
+        rvec_ScaledAdd( system->atoms[dDeltas->select.dDelta_list[k].wrt].f, 
+                C, dDeltas->select.dDelta_list[k].dVal );
 }
 
 
 
 HOST_DEVICE void Calculate_dBO( int i, int pj, static_storage p_workspace, 
-		list p_bonds, list p_dBOs, int *top )
+        list p_bonds, list p_dBOs, int *top )
 {
-	/* Initializations */
-	int j, k, l, start_i, end_i, end_j;
-	rvec dDeltap_self, dBOp;
-	list *bonds, *dBOs;
-	bond_data *nbr_l, *nbr_k;
-	bond_order_data *bo_ij;
-	dbond_data *top_dbo;
-
-	list *bonds = &p_bonds;
-	list *dBOs = &p_dBOs;
-	static_storage *workspace = &p_workspace;
-
-	j = bonds->select.bond_list[pj].nbr;
-	bo_ij = &(bonds->select.bond_list[pj].bo_data);
-
-	/*rvec due_j[1000], due_i[1000];
-	  rvec due_j_pi[1000], due_i_pi[1000];
-
-	  memset(due_j, 0, sizeof(rvec)*1000 );
-	  memset(due_i, 0, sizeof(rvec)*1000 );
-	  memset(due_j_pi, 0, sizeof(rvec)*1000 );
-	  memset(due_i_pi, 0, sizeof(rvec)*1000 );*/
-
-	//fprintf( stderr,"dbo %d-%d\n",workspace->orig_id[i],workspace->orig_id[j] );
-
-	start_i = Start_Index(i, bonds);
-	end_i = End_Index(i, bonds);
-
-	l = Start_Index(j, bonds);
-	end_j = End_Index(j, bonds);
-
-	top_dbo = &(dBOs->select.dbo_list[ (*top) ]);
-
-	for( k = start_i; k < end_i; ++k ) {
-		nbr_k = &(bonds->select.bond_list[k]);
-		//fprintf( stderr, "\tnbr_k = %d\n", workspace->orig_id[nbr_k->nbr] );
-
-		for( ; l < end_j && bonds->select.bond_list[l].nbr < nbr_k->nbr; ++l ) {
-			/* These are the neighbors of j which aren't in the neighbor_list of i
-			   Note that they might also include i! */
-			nbr_l = &(bonds->select.bond_list[l]);
-			top_dbo->wrt = nbr_l->nbr;
-			rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
-			//fprintf( stderr,"\t\tnbr_l = %d\n",workspace->orig_id[nbr_l->nbr] );
-
-			rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp ); 	// dBO, 3rd
-			rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  // dBOpi, 4th
-			rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );// dBOpipi, 4th
-			//rvec_ScaledAdd(due_j[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ji, dBOp);
-
-			if( nbr_l->nbr == i ) {
-				rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] );
-
-				/* dBO */
-				rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp ); //1st
-				rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd
-
-				/* dBOpi */
-				rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C1dbopi,bo_ij->dln_BOp_pi);//1
-				rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C2dbopi,bo_ij->dBOp);  //2nd
-				rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C3dbopi,dDeltap_self); //3rd
-
-				/* dBOpp, 1st */
-				rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2);
-				rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C2dbopi2,bo_ij->dBOp); //2nd
-				rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C3dbopi2,dDeltap_self);//3rd
-
-				/* do the adjustments on i */       
-				//rvec_ScaledAdd( due_i[i], 
-				//bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, bo_ij->dBOp );//1st,dBO
-				//rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, 
-				//dDeltap_self ); //2nd, dBO
-			}
-
-			//rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO );
-			++(*top), ++top_dbo;
-		}
-
-		/* Now we are processing neighbor k of i. */ 
-		top_dbo->wrt = nbr_k->nbr;
-		rvec_Copy( dBOp, nbr_k->bo_data.dBOp );
-
-		rvec_Scale( top_dbo->dBO, -bo_ij->C2dbo, dBOp );      //dBO-2
-		rvec_Scale( top_dbo->dBOpi, -bo_ij->C3dbopi, dBOp );  //dBOpi-3
-		rvec_Scale( top_dbo->dBOpi2, -bo_ij->C3dbopi2, dBOp );//dBOpp-3
-		//rvec_ScaledAdd(due_i[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ij,dBOp);//dBO-2
-
-		// fprintf( stderr, "\tnbr_k = %d, nbr_l = %d, l = %d, end_j = %d\n", 
-		//      workspace->orig_id[nbr_k->nbr], 
-		//       workspace->orig_id[bonds->select.bond_list[l].nbr], l, end_j );
-
-		if( l < end_j && bonds->select.bond_list[l].nbr == nbr_k->nbr ) {
-			/* This is a common neighbor of i and j. */
-			nbr_l = &(bonds->select.bond_list[l]);
-			rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
-
-			rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C3dbo, dBOp );      //dBO,3rd
-			rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  //dBOpi,4th
-			rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//dBOpp.4th
-			++l;
-
-			//rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, 
-			//nbr_l->bo_data.dBOp ); //3rd, dBO
-		}
-		else if( k == pj ) {  
-			/* This negihbor is j. */
-			rvec_Copy( dDeltap_self, workspace->dDeltap_self[j] );
-
-			rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C1dbo, bo_ij->dBOp );// 1st, dBO
-			rvec_ScaledAdd( top_dbo->dBO, bo_ij->C3dbo, dDeltap_self );// 3rd, dBO
-
-			/* dBOpi, 1st */
-			rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C1dbopi,bo_ij->dln_BOp_pi);
-			rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C2dbopi,bo_ij->dBOp);      //2nd
-			rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C4dbopi, dDeltap_self );  //4th
-
-			/* dBOpi2, 1st */
-			rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2 );
-			rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C2dbopi2,bo_ij->dBOp ); //2nd
-			rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C4dbopi2,dDeltap_self ); //4th
-
-			//rvec_ScaledAdd( due_j[j], -(bo_ij->A0_ij + bo_ij->BO*bo_ij->A1_ij),
-			//bo_ij->dBOp ); //1st, dBO
-			//rvec_ScaledAdd( due_j[j], bo_ij->BO * bo_ij->A2_ji, 
-			//workspace->dDeltap_self[j] ); //3rd, dBO
-		}
-
-		// rvec_Add( workspace->dDelta[nbr_k->nbr], top_dbo->dBO );
-		++(*top), ++top_dbo;
-	}
-
-	for( ; l < end_j; ++l ) {
-		/* These are the remaining neighbors of j which are not in the
-		   neighbor_list of i. Note that they might also include i!*/
-		nbr_l = &(bonds->select.bond_list[l]);
-		top_dbo->wrt = nbr_l->nbr;
-		rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
-		//fprintf( stderr,"\tl=%d, nbr_l=%d\n",l,workspace->orig_id[nbr_l->nbr] );
-
-		rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp );      //3rd, dBO
-		rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  //4th, dBOpi
-		rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//4th, dBOpp
-
-		// rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, 
-		// nbr_l->bo_data.dBOp );
-
-		if( nbr_l->nbr == i ) {
-			/* do the adjustments on i */
-			rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] );
-
-			/* dBO, 1st */
-			rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp );
-			rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd, dBO
-
-			/* dBOpi, 1st */
-			rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C1dbopi, bo_ij->dln_BOp_pi );
-			rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C2dbopi, bo_ij->dBOp );  //2nd
-			rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C3dbopi, dDeltap_self ); //3rd
-
-			/* dBOpipi, 1st */
-			rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2);
-			rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd
-			rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self );//3rd
-
-			//rvec_ScaledAdd( due_i[i], bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, 
-			//bo_ij->dBOp );  /*1st, dBO*/
-			//rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, 
-			//dDeltap_self ); /*2nd, dBO*/
-		}
-
-		// rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO );
-		++(*top), ++top_dbo;
-	}
-
-	/*for( k = 0; k < 21; ++k ){
-	  fprintf( stderr, "%d %d %d, due_i:[%g %g %g]\n", 
-	  i+1, j+1, k+1, due_i[k][0], due_i[k][1], due_i[k][2] );
-	  fprintf( stderr, "%d %d %d, due_j:[%g %g %g]\n", 
-	  i+1, j+1, k+1, due_j[k][0], due_j[k][1], due_j[k][2] );
-	  }*/
+    /* Initializations */
+    int j, k, l, start_i, end_i, end_j;
+    rvec dDeltap_self, dBOp;
+    list *bonds, *dBOs;
+    bond_data *nbr_l, *nbr_k;
+    bond_order_data *bo_ij;
+    dbond_data *top_dbo;
+
+    list *bonds = &p_bonds;
+    list *dBOs = &p_dBOs;
+    static_storage *workspace = &p_workspace;
+
+    j = bonds->select.bond_list[pj].nbr;
+    bo_ij = &(bonds->select.bond_list[pj].bo_data);
+
+    /*rvec due_j[1000], due_i[1000];
+      rvec due_j_pi[1000], due_i_pi[1000];
+
+      memset(due_j, 0, sizeof(rvec)*1000 );
+      memset(due_i, 0, sizeof(rvec)*1000 );
+      memset(due_j_pi, 0, sizeof(rvec)*1000 );
+      memset(due_i_pi, 0, sizeof(rvec)*1000 );*/
+
+    //fprintf( stderr,"dbo %d-%d\n",workspace->orig_id[i],workspace->orig_id[j] );
+
+    start_i = Start_Index(i, bonds);
+    end_i = End_Index(i, bonds);
+
+    l = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+
+    top_dbo = &(dBOs->select.dbo_list[ (*top) ]);
+
+    for( k = start_i; k < end_i; ++k ) {
+        nbr_k = &(bonds->select.bond_list[k]);
+        //fprintf( stderr, "\tnbr_k = %d\n", workspace->orig_id[nbr_k->nbr] );
+
+        for( ; l < end_j && bonds->select.bond_list[l].nbr < nbr_k->nbr; ++l ) {
+            /* These are the neighbors of j which aren't in the neighbor_list of i
+               Note that they might also include i! */
+            nbr_l = &(bonds->select.bond_list[l]);
+            top_dbo->wrt = nbr_l->nbr;
+            rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
+            //fprintf( stderr,"\t\tnbr_l = %d\n",workspace->orig_id[nbr_l->nbr] );
+
+            rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp );     // dBO, 3rd
+            rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  // dBOpi, 4th
+            rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );// dBOpipi, 4th
+            //rvec_ScaledAdd(due_j[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ji, dBOp);
+
+            if( nbr_l->nbr == i ) {
+                rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] );
+
+                /* dBO */
+                rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp ); //1st
+                rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd
+
+                /* dBOpi */
+                rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C1dbopi,bo_ij->dln_BOp_pi);//1
+                rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C2dbopi,bo_ij->dBOp);  //2nd
+                rvec_ScaledAdd(top_dbo->dBOpi,bo_ij->C3dbopi,dDeltap_self); //3rd
+
+                /* dBOpp, 1st */
+                rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2);
+                rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C2dbopi2,bo_ij->dBOp); //2nd
+                rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C3dbopi2,dDeltap_self);//3rd
+
+                /* do the adjustments on i */       
+                //rvec_ScaledAdd( due_i[i], 
+                //bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, bo_ij->dBOp );//1st,dBO
+                //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, 
+                //dDeltap_self ); //2nd, dBO
+            }
+
+            //rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO );
+            ++(*top), ++top_dbo;
+        }
+
+        /* Now we are processing neighbor k of i. */ 
+        top_dbo->wrt = nbr_k->nbr;
+        rvec_Copy( dBOp, nbr_k->bo_data.dBOp );
+
+        rvec_Scale( top_dbo->dBO, -bo_ij->C2dbo, dBOp );      //dBO-2
+        rvec_Scale( top_dbo->dBOpi, -bo_ij->C3dbopi, dBOp );  //dBOpi-3
+        rvec_Scale( top_dbo->dBOpi2, -bo_ij->C3dbopi2, dBOp );//dBOpp-3
+        //rvec_ScaledAdd(due_i[top_dbo->wrt],-bo_ij->BO*bo_ij->A2_ij,dBOp);//dBO-2
+
+        // fprintf( stderr, "\tnbr_k = %d, nbr_l = %d, l = %d, end_j = %d\n", 
+        //      workspace->orig_id[nbr_k->nbr], 
+        //       workspace->orig_id[bonds->select.bond_list[l].nbr], l, end_j );
+
+        if( l < end_j && bonds->select.bond_list[l].nbr == nbr_k->nbr ) {
+            /* This is a common neighbor of i and j. */
+            nbr_l = &(bonds->select.bond_list[l]);
+            rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
+
+            rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C3dbo, dBOp );      //dBO,3rd
+            rvec_ScaledAdd( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  //dBOpi,4th
+            rvec_ScaledAdd( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//dBOpp.4th
+            ++l;
+
+            //rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, 
+            //nbr_l->bo_data.dBOp ); //3rd, dBO
+        }
+        else if( k == pj ) {  
+            /* This neighbor is j. */
+            rvec_Copy( dDeltap_self, workspace->dDeltap_self[j] );
+
+            rvec_ScaledAdd( top_dbo->dBO, -bo_ij->C1dbo, bo_ij->dBOp );// 1st, dBO
+            rvec_ScaledAdd( top_dbo->dBO, bo_ij->C3dbo, dDeltap_self );// 3rd, dBO
+
+            /* dBOpi, 1st */
+            rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C1dbopi,bo_ij->dln_BOp_pi);
+            rvec_ScaledAdd(top_dbo->dBOpi,-bo_ij->C2dbopi,bo_ij->dBOp);      //2nd
+            rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C4dbopi, dDeltap_self );  //4th
+
+            /* dBOpi2, 1st */
+            rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C1dbopi2,bo_ij->dln_BOp_pi2 );
+            rvec_ScaledAdd(top_dbo->dBOpi2,-bo_ij->C2dbopi2,bo_ij->dBOp ); //2nd
+            rvec_ScaledAdd(top_dbo->dBOpi2,bo_ij->C4dbopi2,dDeltap_self ); //4th
+
+            //rvec_ScaledAdd( due_j[j], -(bo_ij->A0_ij + bo_ij->BO*bo_ij->A1_ij),
+            //bo_ij->dBOp ); //1st, dBO
+            //rvec_ScaledAdd( due_j[j], bo_ij->BO * bo_ij->A2_ji, 
+            //workspace->dDeltap_self[j] ); //3rd, dBO
+        }
+
+        // rvec_Add( workspace->dDelta[nbr_k->nbr], top_dbo->dBO );
+        ++(*top), ++top_dbo;
+    }
+
+    for( ; l < end_j; ++l ) {
+        /* These are the remaining neighbors of j which are not in the
+           neighbor_list of i. Note that they might also include i!*/
+        nbr_l = &(bonds->select.bond_list[l]);
+        top_dbo->wrt = nbr_l->nbr;
+        rvec_Copy( dBOp, nbr_l->bo_data.dBOp );
+        //fprintf( stderr,"\tl=%d, nbr_l=%d\n",l,workspace->orig_id[nbr_l->nbr] );
+
+        rvec_Scale( top_dbo->dBO, -bo_ij->C3dbo, dBOp );      //3rd, dBO
+        rvec_Scale( top_dbo->dBOpi, -bo_ij->C4dbopi, dBOp );  //4th, dBOpi
+        rvec_Scale( top_dbo->dBOpi2, -bo_ij->C4dbopi2, dBOp );//4th, dBOpp
+
+        // rvec_ScaledAdd( due_j[top_dbo->wrt], -bo_ij->BO * bo_ij->A2_ji, 
+        // nbr_l->bo_data.dBOp );
+
+        if( nbr_l->nbr == i ) {
+            /* do the adjustments on i */
+            rvec_Copy( dDeltap_self, workspace->dDeltap_self[i] );
+
+            /* dBO, 1st */
+            rvec_ScaledAdd( top_dbo->dBO, bo_ij->C1dbo, bo_ij->dBOp );
+            rvec_ScaledAdd( top_dbo->dBO, bo_ij->C2dbo, dDeltap_self ); //2nd, dBO
+
+            /* dBOpi, 1st */
+            rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C1dbopi, bo_ij->dln_BOp_pi );
+            rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C2dbopi, bo_ij->dBOp );  //2nd
+            rvec_ScaledAdd( top_dbo->dBOpi, bo_ij->C3dbopi, dDeltap_self ); //3rd
+
+            /* dBOpipi, 1st */
+            rvec_ScaledAdd(top_dbo->dBOpi2, bo_ij->C1dbopi2, bo_ij->dln_BOp_pi2);
+            rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C2dbopi2, bo_ij->dBOp ); //2nd
+            rvec_ScaledAdd( top_dbo->dBOpi2, bo_ij->C3dbopi2, dDeltap_self );//3rd
+
+            //rvec_ScaledAdd( due_i[i], bo_ij->A0_ij + bo_ij->BO * bo_ij->A1_ij, 
+            //bo_ij->dBOp );  /*1st, dBO*/
+            //rvec_ScaledAdd( due_i[i], bo_ij->BO * bo_ij->A2_ij, 
+            //dDeltap_self ); /*2nd, dBO*/
+        }
+
+        // rvec_Add( workspace->dDelta[nbr_l->nbr], top_dbo->dBO );
+        ++(*top), ++top_dbo;
+    }
+
+    /*for( k = 0; k < 21; ++k ){
+      fprintf( stderr, "%d %d %d, due_i:[%g %g %g]\n", 
+      i+1, j+1, k+1, due_i[k][0], due_i[k][1], due_i[k][2] );
+      fprintf( stderr, "%d %d %d, due_j:[%g %g %g]\n", 
+      i+1, j+1, k+1, due_j[k][0], due_j[k][1], due_j[k][2] );
+      }*/
 }
 #endif
 
 
 
 void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists )
+        simulation_data *data, static_storage *workspace, 
+        list **lists )
 {
-	list *bonds = (*lists) + BONDS;
-	bond_data *nbr_j, *nbr_k;
-	bond_order_data *bo_ij, *bo_ji; 
-	dbond_coefficients coef;
-	rvec temp, ext_press;
-	ivec rel_box;
-	int pk, k, j;
-
-	/* Initializations */
-	nbr_j = &(bonds->select.bond_list[pj]);
-	j = nbr_j->nbr;
-	bo_ij = &(nbr_j->bo_data);
-	bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-
-	coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-	coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-	coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-	coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-
-	/************************************
-	 * forces related to atom i          *
-	 * first neighbors of atom i         *
-	 ************************************/
-	for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		k = nbr_k->nbr;
-
-		rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
-		rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
-		rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/
-		rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
-
-		/* force */
-		rvec_Add( system->atoms[k].f, temp );
-		/* pressure */
-		rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
-		rvec_Add( data->ext_press, ext_press );
-
-		/* if( !ivec_isZero( nbr_k->rel_box ) )
-		   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] 
-		   ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-		   i+1, 
-		   system->atoms[i].x[0],system->atoms[i].x[1],system->atoms[i].x[2], 
-		   j+1, k+1,
-		   system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2],
-		   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
-		   nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2],
-		   temp[0], temp[1], temp[2] ); */
-	}
-
-	/* then atom i itself  */
-	rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );         /*1st,dBOpi*/
-	rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
-	rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
-
-	rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
-	rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
-	rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
-
-	/* force */
-	rvec_Add( system->atoms[i].f, temp );
-	/* ext pressure due to i dropped, counting force on j only will be enough */
-
-
-	/****************************************************************************
-	 * forces and pressure related to atom j                                    *
-	 * first neighbors of atom j                                                *
-	 ***************************************************************************/
-	for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		k = nbr_k->nbr;
-
-		rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
-		rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ 
-		rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/
-		rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
-
-		/* force */
-		rvec_Add( system->atoms[k].f, temp );
-		/* pressure */
-		if( k != i ) {
-			ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
-			rvec_iMultiply( ext_press, rel_box, temp );
-			rvec_Add( data->ext_press, ext_press );
-
-			/* if( !ivec_isZero( rel_box ) )
-			   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] 
-			   ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-			   i+1, j+1, 
-			   system->atoms[j].x[0],system->atoms[j].x[1],system->atoms[j].x[2],
-			   k+1, 
-			   system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2],
-			   nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
-			   rel_box[0], rel_box[1], rel_box[2],
-			   temp[0], temp[1], temp[2] ); */
-		}
-	}
-
-	/* then atom j itself */
-	rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                     /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );   /*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );              /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
-	rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
-	rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/
-
-	rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
-	rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
-	rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
-
-	/* force */
-	rvec_Add( system->atoms[j].f, temp );
-	/* pressure */
-	rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
-	rvec_Add( data->ext_press, ext_press );
-
-	/* if( !ivec_isZero( nbr_j->rel_box ) )
-	   fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] 
-	   ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
-	   i+1, system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-	   j+1, system->atoms[j].x[0], system->atoms[j].x[1], system->atoms[j].x[2],
-	   j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2],
-	   nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2],
-	   temp[0], temp[1], temp[2] ); */
+    list *bonds = (*lists) + BONDS;
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji; 
+    dbond_coefficients coef;
+    rvec temp, ext_press;
+    ivec rel_box;
+    int pk, k, j;
+
+    /* Initializations: pj indexes the bond i->j in the bond list; bo_ji is the bo_data of its sym_index entry; coef holds the per-term coefficients used below */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+    bo_ij = &(nbr_j->bo_data);
+    bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+
+    /************************************
+     * forces related to atom i          *
+     * first neighbors of atom i         *
+     ************************************/
+    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+
+        rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
+        rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
+        rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
+
+        /* force */
+        rvec_Add( system->atoms[k].f, temp );
+        /* pressure */
+        rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
+        rvec_Add( data->ext_press, ext_press );
+
+        /* if( !ivec_isZero( nbr_k->rel_box ) )
+           fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] 
+           ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+           i+1, 
+           system->atoms[i].x[0],system->atoms[i].x[1],system->atoms[i].x[2], 
+           j+1, k+1,
+           system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2],
+           nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
+           nbr_k->rel_box[0], nbr_k->rel_box[1], nbr_k->rel_box[2],
+           temp[0], temp[1], temp[2] ); */
+    }
+
+    /* then atom i itself  */
+    rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd, dBO*/
+
+    rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st, dDelta*/
+    rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dDelta*/
+
+    rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );         /*1st,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
+
+    rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
+    rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
+    rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
+
+    /* force */
+    rvec_Add( system->atoms[i].f, temp );
+    /* ext pressure due to i dropped, counting force on j only will be enough */
+
+
+    /****************************************************************************
+     * forces and pressure related to atom j                                    *
+     * first neighbors of atom j                                                *
+     ***************************************************************************/
+    for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+
+        rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
+        rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ 
+        rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
+
+        /* force */
+        rvec_Add( system->atoms[k].f, temp );
+        /* pressure */
+        if( k != i ) {
+            ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
+            rvec_iMultiply( ext_press, rel_box, temp );
+            rvec_Add( data->ext_press, ext_press );
+
+            /* if( !ivec_isZero( rel_box ) )
+               fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] 
+               ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+               i+1, j+1, 
+               system->atoms[j].x[0],system->atoms[j].x[1],system->atoms[j].x[2],
+               k+1, 
+               system->atoms[k].x[0], system->atoms[k].x[1], system->atoms[k].x[2],
+               nbr_k->dvec[0], nbr_k->dvec[1], nbr_k->dvec[2],
+               rel_box[0], rel_box[1], rel_box[2],
+               temp[0], temp[1], temp[2] ); */
+        }
+    }
+
+    /* then atom j itself */
+    rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                     /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );   /*3rd, dBO*/
+
+    rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );              /*1st, dDelta*/
+    rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*3rd, dDelta*/
+
+    rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
+    rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*4th,dBOpi*/
+
+    rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
+    rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
+    rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*4th,dBOpi2*/
+
+    /* force */
+    rvec_Add( system->atoms[j].f, temp );
+    /* pressure */
+    rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
+    rvec_Add( data->ext_press, ext_press );
+
+    /* if( !ivec_isZero( nbr_j->rel_box ) )
+       fprintf( stderr, "%3d %3d %3d: dvec[%10.6f %10.6f %10.6f] 
+       ext[%3d %3d %3d] f[%10.6f %10.6f %10.6f]\n",
+       i+1, system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
+       j+1, system->atoms[j].x[0], system->atoms[j].x[1], system->atoms[j].x[2],
+       j+1, nbr_j->dvec[0], nbr_j->dvec[1], nbr_j->dvec[2],
+       nbr_j->rel_box[0], nbr_j->rel_box[1], nbr_j->rel_box[2],
+       temp[0], temp[1], temp[2] ); */
 }
 
 /////////////////////////////////////////////////////////////
@@ -526,388 +526,388 @@ void Add_dBond_to_Forces_NPT( int i, int pj, reax_system *system,
 /////////////////////////////////////////////////////////////
 
 HOST_DEVICE void Cuda_Add_dBond_to_Forces_NPT( int i, int pj, reax_atom *atoms, 
-		simulation_data *data, static_storage *workspace, 
-		list *bonds )
+        simulation_data *data, static_storage *workspace, 
+        list *bonds )
 {
-	bond_data *nbr_j, *nbr_k;
-	bond_order_data *bo_ij, *bo_ji; 
-	dbond_coefficients coef;
-	rvec temp, ext_press;
-	ivec rel_box;
-	int pk, k, j;
-
-	/* Initializations */
-	nbr_j = &(bonds->select.bond_list[pj]);
-	j = nbr_j->nbr;
-	bo_ij = &(nbr_j->bo_data);
-	bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-
-	coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-	coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-	coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-	coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-
-	/************************************
-	 * forces related to atom i          *
-	 * first neighbors of atom i         *
-	 ************************************/
-	for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		k = nbr_k->nbr;
-
-		rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
-		rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
-		rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/
-		rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
-
-		/* force */
-		rvec_Add( atoms[k].f, temp );
-		/* pressure */
-		rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
-		rvec_Add( data->ext_press, ext_press );
-	}
-
-	/* then atom i itself  */
-	rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );         /*1st,dBOpi*/
-	rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
-	rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
-
-	rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
-	rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
-	rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
-
-	/* force */
-	rvec_Add( atoms[i].f, temp );
-	/* ext pressure due to i dropped, counting force on j only will be enough */
-
-
-	/****************************************************************************
-	 * forces and pressure related to atom j                                    *
-	 * first neighbors of atom j                                                *
-	 ***************************************************************************/
-	for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		k = nbr_k->nbr;
-
-		rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
-		rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ 
-		rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/
-		rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
-
-		/* force */
-		rvec_Add( atoms[k].f, temp );
-		/* pressure */
-		if( k != i ) {
-			ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
-			rvec_iMultiply( ext_press, rel_box, temp );
-			rvec_Add( data->ext_press, ext_press );
-		}
-	}
-
-	/* then atom j itself */
-	rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                     /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );   /*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );              /*1st, dBO*/
-	rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*2nd, dBO*/
-
-	rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
-	rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
-	rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*3rd,dBOpi*/
-
-	rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
-	rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
-	rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*3rd,dBOpi2*/
-
-	/* force */
-	rvec_Add( atoms[j].f, temp );
-	/* pressure */
-	rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
-	rvec_Add( data->ext_press, ext_press );
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji; 
+    dbond_coefficients coef;
+    rvec temp, ext_press;
+    ivec rel_box;
+    int pk, k, j;
+
+    /* Initializations: pj indexes the bond i->j in the bond list; bo_ji is the bo_data of its sym_index entry; coef holds the per-term coefficients used below */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+    bo_ij = &(nbr_j->bo_data);
+    bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+
+    /************************************
+     * forces related to atom i          *
+     * first neighbors of atom i         *
+     ************************************/
+    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+
+        rvec_Scale( temp, -coef.C2dbo, nbr_k->bo_data.dBOp );       /*2nd,dBO*/
+        rvec_ScaledAdd( temp, -coef.C2dDelta, nbr_k->bo_data.dBOp );/*dDelta*/
+        rvec_ScaledAdd( temp, -coef.C3dbopi, nbr_k->bo_data.dBOp ); /*3rd,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C3dbopi2, nbr_k->bo_data.dBOp );/*3rd,dBOpi2*/
+
+        /* force */
+        rvec_Add( atoms[k].f, temp );
+        /* pressure */
+        rvec_iMultiply( ext_press, nbr_k->rel_box, temp );
+        rvec_Add( data->ext_press, ext_press );
+    }
+
+    /* then atom i itself  */
+    rvec_Scale( temp, coef.C1dbo, bo_ij->dBOp );                      /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C2dbo, workspace->dDeltap_self[i] );   /*2nd, dBO*/
+
+    rvec_ScaledAdd( temp, coef.C1dDelta, bo_ij->dBOp );               /*1st, dDelta*/
+    rvec_ScaledAdd( temp, coef.C2dDelta, workspace->dDeltap_self[i] );/*2nd, dDelta*/
+
+    rvec_ScaledAdd( temp, coef.C1dbopi, bo_ij->dln_BOp_pi );         /*1st,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C2dbopi, bo_ij->dBOp );               /*2nd,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C3dbopi, workspace->dDeltap_self[i] );/*3rd,dBOpi*/
+
+    rvec_ScaledAdd(temp, coef.C1dbopi2, bo_ij->dln_BOp_pi2) ;      /*1st,dBO_pi2*/
+    rvec_ScaledAdd(temp, coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBO_pi2*/
+    rvec_ScaledAdd(temp, coef.C3dbopi2, workspace->dDeltap_self[i]);/*3rd,dBO_pi2*/
+
+    /* force */
+    rvec_Add( atoms[i].f, temp );
+    /* ext pressure due to i dropped, counting force on j only will be enough */
+
+
+    /****************************************************************************
+     * forces and pressure related to atom j                                    *
+     * first neighbors of atom j                                                *
+     ***************************************************************************/
+    for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+
+        rvec_Scale( temp, -coef.C3dbo, nbr_k->bo_data.dBOp );       /*3rd,dBO*/
+        rvec_ScaledAdd( temp, -coef.C3dDelta, nbr_k->bo_data.dBOp );/*dDelta*/ 
+        rvec_ScaledAdd( temp, -coef.C4dbopi, nbr_k->bo_data.dBOp ); /*4th,dBOpi*/
+        rvec_ScaledAdd( temp, -coef.C4dbopi2, nbr_k->bo_data.dBOp );/*4th,dBOpi2*/
+
+        /* force */
+        rvec_Add( atoms[k].f, temp );
+        /* pressure */
+        if( k != i ) {
+            ivec_Sum(rel_box, nbr_k->rel_box, nbr_j->rel_box);//k's rel_box  wrt i
+            rvec_iMultiply( ext_press, rel_box, temp );
+            rvec_Add( data->ext_press, ext_press );
+        }
+    }
+
+    /* then atom j itself */
+    rvec_Scale( temp, -coef.C1dbo, bo_ij->dBOp );                     /*1st, dBO*/
+    rvec_ScaledAdd( temp, coef.C3dbo, workspace->dDeltap_self[j] );   /*3rd, dBO*/
+
+    rvec_ScaledAdd( temp, -coef.C1dDelta, bo_ij->dBOp );              /*1st, dDelta*/
+    rvec_ScaledAdd( temp, coef.C3dDelta, workspace->dDeltap_self[j] );/*3rd, dDelta*/
+
+    rvec_ScaledAdd( temp, -coef.C1dbopi, bo_ij->dln_BOp_pi );        /*1st,dBOpi*/
+    rvec_ScaledAdd( temp, -coef.C2dbopi, bo_ij->dBOp );              /*2nd,dBOpi*/
+    rvec_ScaledAdd( temp, coef.C4dbopi, workspace->dDeltap_self[j] );/*4th,dBOpi*/
+
+    rvec_ScaledAdd(temp, -coef.C1dbopi2, bo_ij->dln_BOp_pi2);       /*1st,dBOpi2*/
+    rvec_ScaledAdd(temp, -coef.C2dbopi2, bo_ij->dBOp);              /*2nd,dBOpi2*/
+    rvec_ScaledAdd(temp, coef.C4dbopi2, workspace->dDeltap_self[j]);/*4th,dBOpi2*/
+
+    /* force */
+    rvec_Add( atoms[j].f, temp );
+    /* pressure */
+    rvec_iMultiply( ext_press, nbr_j->rel_box, temp );
+    rvec_Add( data->ext_press, ext_press );
 }
 
 /////////////////////////////////////////////////////////////
 //Cuda Functions
 /////////////////////////////////////////////////////////////
 void Add_dBond_to_Forces( int i, int pj, reax_system *system, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists )
+        simulation_data *data, static_storage *workspace, /* data is not referenced in this body */
+        list **lists ) /* adds the dBO/dDelta force terms of bond slot pj of atom i to system->atoms[].f */
 {
-	list *bonds = (*lists) + BONDS;
-	bond_data *nbr_j, *nbr_k;
-	bond_order_data *bo_ij, *bo_ji; 
-	dbond_coefficients coef;
-	int pk, k, j;
-
-	/* Initializations */ 
-	nbr_j = &(bonds->select.bond_list[pj]);
-	j = nbr_j->nbr;
-	bo_ij = &(nbr_j->bo_data);
-	bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-
-	coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-	coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-	coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-	coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-	for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		k = nbr_k->nbr;
-
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C2dbo, nbr_k->bo_data.dBOp ); 
-		/*2nd, dBO*/
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
-		/*dDelta*/
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
-		/*3rd, dBOpi*/
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
-		/*3rd, dBOpi2*/
-	}
-
-	rvec_ScaledAdd( system->atoms[i].f, coef.C1dbo, bo_ij->dBOp );
-	/*1st, dBO*/
-	rvec_ScaledAdd( system->atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
-	/*2nd, dBO*/
-
-	rvec_ScaledAdd(system->atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
-	/*1st, dBO*/
-	rvec_ScaledAdd(system->atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
-	/*2nd, dBO*/
-
-	rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
-	/*1st, dBOpi*/
-	rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
-	/*2nd, dBOpi*/
-	rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
-	/*3rd, dBOpi*/
-
-	rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-	/*1st, dBO_pi2*/
-	rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
-	/*2nd, dBO_pi2*/
-	rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
-	/*3rd, dBO_pi2*/
-
-
-	for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		k = nbr_k->nbr;
-
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbo, nbr_k->bo_data.dBOp );
-		/*3rd, dBO*/
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
-		/*dDelta*/ 
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
-		/*4th, dBOpi*/
-		rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
-		/*4th, dBOpi2*/
-	}
-
-	rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbo, bo_ij->dBOp );
-	/*1st, dBO*/
-	rvec_ScaledAdd( system->atoms[j].f, coef.C3dbo, workspace->dDeltap_self[j] );
-	/*2nd, dBO*/
-
-	rvec_ScaledAdd( system->atoms[j].f, -coef.C1dDelta, bo_ij->dBOp );
-	/*1st, dBO*/
-	rvec_ScaledAdd(system->atoms[j].f, coef.C3dDelta, workspace->dDeltap_self[j]);
-	/*2nd, dBO*/
-
-	rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
-	/*1st, dBOpi*/
-	rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi, bo_ij->dBOp );
-	/*2nd, dBOpi*/
-	rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi, workspace->dDeltap_self[j]);
-	/*3rd, dBOpi*/
-
-	rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-	/*1st, dBOpi2*/
-	rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi2, bo_ij->dBOp );
-	/*2nd, dBOpi2*/
-	rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi2, workspace->dDeltap_self[j]);
-	/*3rd, dBOpi2*/
+    list *bonds = (*lists) + BONDS; /* the BONDS entry of the lists array */
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji; 
+    dbond_coefficients coef;
+    int pk, k, j;
+
+    /* Initializations: j is the nbr stored in slot pj; bo_ji is the bo_data of the slot at nbr_j->sym_index */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+    bo_ij = &(nbr_j->bo_data);
+    bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); /* each coef is a bo_ij factor times the summed ij and ji contribution */
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]); /* dDelta coefs reuse the C?dbo factors, weighted by CdDelta of both atoms */
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { /* bond-list entries of atom i: contributions to each neighbor k */
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C2dbo, nbr_k->bo_data.dBOp ); 
+        /*2nd, dBO*/
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
+        /*dDelta*/
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
+        /*3rd, dBOpi*/
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
+        /*3rd, dBOpi2*/
+    }
+
+    rvec_ScaledAdd( system->atoms[i].f, coef.C1dbo, bo_ij->dBOp ); /* from here: contributions to atom i itself */
+    /*1st, dBO*/
+    rvec_ScaledAdd( system->atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
+    /*2nd, dBO*/
+
+    rvec_ScaledAdd(system->atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
+    /*1st, dBO, CdDelta-weighted*/
+    rvec_ScaledAdd(system->atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
+    /*2nd, dBO, CdDelta-weighted*/
+
+    rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
+    /*1st, dBOpi*/
+    rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
+    /*2nd, dBOpi*/
+    rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
+    /*3rd, dBOpi*/
+
+    rvec_ScaledAdd( system->atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+    /*1st, dBO_pi2*/
+    rvec_ScaledAdd( system->atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
+    /*2nd, dBO_pi2*/
+    rvec_ScaledAdd(system->atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
+    /*3rd, dBO_pi2*/
+
+
+    for( pk = Start_Index(j, bonds); pk < End_Index(j, bonds); ++pk ) { /* bond-list entries of atom j: contributions to each neighbor k */
+        nbr_k = &(bonds->select.bond_list[pk]);
+        k = nbr_k->nbr;
+
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dbo, nbr_k->bo_data.dBOp );
+        /*3rd, dBO*/
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
+        /*dDelta*/ 
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
+        /*4th, dBOpi*/
+        rvec_ScaledAdd( system->atoms[k].f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
+        /*4th, dBOpi2*/
+    }
+
+    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbo, bo_ij->dBOp ); /* from here: contributions to atom j itself; bo_ij terms enter with opposite sign */
+    /*1st, dBO*/
+    rvec_ScaledAdd( system->atoms[j].f, coef.C3dbo, workspace->dDeltap_self[j] );
+    /*2nd, dBO*/
+
+    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dDelta, bo_ij->dBOp );
+    /*1st, dBO, CdDelta-weighted*/
+    rvec_ScaledAdd(system->atoms[j].f, coef.C3dDelta, workspace->dDeltap_self[j]);
+    /*2nd, dBO, CdDelta-weighted*/
+
+    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
+    /*1st, dBOpi*/
+    rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi, bo_ij->dBOp );
+    /*2nd, dBOpi*/
+    rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi, workspace->dDeltap_self[j]);
+    /*3rd, dBOpi*/
+
+    rvec_ScaledAdd( system->atoms[j].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+    /*1st, dBOpi2*/
+    rvec_ScaledAdd( system->atoms[j].f, -coef.C2dbopi2, bo_ij->dBOp );
+    /*2nd, dBOpi2*/
+    rvec_ScaledAdd(system->atoms[j].f, coef.C4dbopi2, workspace->dDeltap_self[j]);
+    /*3rd, dBOpi2*/
 }
 
 HOST_DEVICE void Cuda_Add_dBond_to_Forces ( int i, int pj, reax_atom *atoms, 
-		static_storage *workspace, list *bonds )
+        static_storage *workspace, list *bonds ) /* writes only atoms[i].f and the t_f field of atom i's own bond-list entries */
 {
-	bond_data *nbr_j, *nbr_k;
-	bond_order_data *bo_ij, *bo_ji; 
-	dbond_coefficients coef;
-	int pk, k, j;
-	rvec t_f;
-
-	/* Initializations */ 
-	nbr_j = &(bonds->select.bond_list[pj]);
-	j = nbr_j->nbr;
-
-	if (i < j)
-	{
-		bo_ij = &(nbr_j->bo_data);
-		bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-	} else {
-		bo_ji = &(nbr_j->bo_data);
-		bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
-	}
-
-	coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-	coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
-
-	coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-	coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
-
-	coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-	coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
-
-	coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-	coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
-
-	if ( i < j) {
-		for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-			nbr_k = &(bonds->select.bond_list[pk]);
-			k = nbr_k->nbr;
-			rvec_MakeZero (t_f);
-
-			rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); 
-			/*2nd, dBO*/
-			rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
-			/*dDelta*/
-			rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
-			/*3rd, dBOpi*/
-			rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
-			/*3rd, dBOpi2*/
-
-			//Store in the temp place
-			rvec_Add (nbr_k->t_f, t_f);
-		}
-
-		rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp );
-		/*1st, dBO*/
-		rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
-		/*2nd, dBO*/
-
-		rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
-		/*1st, dBO*/
-		rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
-		/*2nd, dBO*/
-
-		rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
-		/*1st, dBOpi*/
-		rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
-		/*2nd, dBOpi*/
-		rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
-		/*3rd, dBOpi*/
-
-		rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-		/*1st, dBO_pi2*/
-		rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
-		/*2nd, dBO_pi2*/
-		rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
-		/*3rd, dBO_pi2*/
-	}
-	else 
-	{
-		for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-			nbr_k = &(bonds->select.bond_list[pk]);
-			k = nbr_k->nbr;
-			rvec_MakeZero (t_f);
-
-			rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
-			/*3rd, dBO*/
-			rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
-			/*dDelta*/ 
-			rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
-			/*4th, dBOpi*/
-			rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
-			/*4th, dBOpi2*/
-
-			//Store in the temp place
-			rvec_Add (nbr_k->t_f, t_f);
-		}
-
-		rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp );
-		/*1st, dBO*/
-		rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] );
-		/*2nd, dBO*/
-
-		rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp );
-		/*1st, dBO*/
-		rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]);
-		/*2nd, dBO*/
-
-		rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
-		/*1st, dBOpi*/
-		rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp );
-		/*2nd, dBOpi*/
-		rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]);
-		/*3rd, dBOpi*/
-
-		rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
-		/*1st, dBOpi2*/
-		rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp );
-		/*2nd, dBOpi2*/
-		rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]);
-		/*3rd, dBOpi2*/
-	}
+    bond_data *nbr_j, *nbr_k;
+    bond_order_data *bo_ij, *bo_ji; 
+    dbond_coefficients coef;
+    int pk, k, j;
+    rvec t_f; /* per-neighbor scratch vector, zeroed at the top of each loop iteration */
+
+    /* Initializations: j is the nbr stored in bond-list slot pj */
+    nbr_j = &(bonds->select.bond_list[pj]);
+    j = nbr_j->nbr;
+
+    if (i < j) /* i < j: bo_ij is this slot's bo_data; otherwise bo_ij is taken from the slot at sym_index */
+    {
+        bo_ij = &(nbr_j->bo_data);
+        bo_ji = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    } else {
+        bo_ji = &(nbr_j->bo_data);
+        bo_ij = &(bonds->select.bond_list[ nbr_j->sym_index ].bo_data);
+    }
+
+    coef.C1dbo = bo_ij->C1dbo * (bo_ij->Cdbo + bo_ji->Cdbo); /* same coefficient formulas as Add_dBond_to_Forces */
+    coef.C2dbo = bo_ij->C2dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+    coef.C3dbo = bo_ij->C3dbo * (bo_ij->Cdbo + bo_ji->Cdbo);
+
+    coef.C1dbopi = bo_ij->C1dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C2dbopi = bo_ij->C2dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C3dbopi = bo_ij->C3dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+    coef.C4dbopi = bo_ij->C4dbopi * (bo_ij->Cdbopi + bo_ji->Cdbopi);
+
+    coef.C1dbopi2 = bo_ij->C1dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C2dbopi2 = bo_ij->C2dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C3dbopi2 = bo_ij->C3dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+    coef.C4dbopi2 = bo_ij->C4dbopi2 * (bo_ij->Cdbopi2 + bo_ji->Cdbopi2);
+
+    coef.C1dDelta = bo_ij->C1dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C2dDelta = bo_ij->C2dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+    coef.C3dDelta = bo_ij->C3dbo * (workspace->CdDelta[i]+workspace->CdDelta[j]);
+
+    if ( i < j) { /* here i takes the "atom i" role of Add_dBond_to_Forces (C2dbo/C3dbopi terms, +C1 signs) */
+        for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            k = nbr_k->nbr; /* k is not read again in this function */
+            rvec_MakeZero (t_f);
+
+            rvec_ScaledAdd( t_f, -coef.C2dbo, nbr_k->bo_data.dBOp ); 
+            /*2nd, dBO*/
+            rvec_ScaledAdd( t_f, -coef.C2dDelta, nbr_k->bo_data.dBOp );
+            /*dDelta*/
+            rvec_ScaledAdd( t_f, -coef.C3dbopi, nbr_k->bo_data.dBOp );
+            /*3rd, dBOpi*/
+            rvec_ScaledAdd( t_f, -coef.C3dbopi2, nbr_k->bo_data.dBOp );
+            /*3rd, dBOpi2*/
+
+            //Store in the temp place (nbr_k->t_f) instead of adding to atoms[k].f directly
+            rvec_Add (nbr_k->t_f, t_f);
+        }
+
+        rvec_ScaledAdd( atoms[i].f, coef.C1dbo, bo_ij->dBOp );
+        /*1st, dBO*/
+        rvec_ScaledAdd( atoms[i].f, coef.C2dbo, workspace->dDeltap_self[i] );
+        /*2nd, dBO*/
+
+        rvec_ScaledAdd(atoms[i].f, coef.C1dDelta, bo_ij->dBOp);
+        /*1st, dBO, CdDelta-weighted*/
+        rvec_ScaledAdd(atoms[i].f, coef.C2dDelta, workspace->dDeltap_self[i]);
+        /*2nd, dBO, CdDelta-weighted*/
+
+        rvec_ScaledAdd( atoms[i].f, coef.C1dbopi, bo_ij->dln_BOp_pi );
+        /*1st, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, coef.C2dbopi, bo_ij->dBOp );
+        /*2nd, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, coef.C3dbopi, workspace->dDeltap_self[i]);
+        /*3rd, dBOpi*/
+
+        rvec_ScaledAdd( atoms[i].f, coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+        /*1st, dBO_pi2*/
+        rvec_ScaledAdd( atoms[i].f, coef.C2dbopi2, bo_ij->dBOp );
+        /*2nd, dBO_pi2*/
+        rvec_ScaledAdd( atoms[i].f, coef.C3dbopi2, workspace->dDeltap_self[i]);
+        /*3rd, dBO_pi2*/
+    }
+    else 
+    { /* i >= j: here i takes the "atom j" role of Add_dBond_to_Forces (C3dbo/C4dbopi terms, -C1 signs) */
+        for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+            nbr_k = &(bonds->select.bond_list[pk]);
+            k = nbr_k->nbr; /* k is not read again in this function */
+            rvec_MakeZero (t_f);
+
+            rvec_ScaledAdd( t_f, -coef.C3dbo, nbr_k->bo_data.dBOp );
+            /*3rd, dBO*/
+            rvec_ScaledAdd( t_f, -coef.C3dDelta, nbr_k->bo_data.dBOp );
+            /*dDelta*/ 
+            rvec_ScaledAdd( t_f, -coef.C4dbopi, nbr_k->bo_data.dBOp );
+            /*4th, dBOpi*/
+            rvec_ScaledAdd( t_f, -coef.C4dbopi2, nbr_k->bo_data.dBOp );
+            /*4th, dBOpi2*/
+
+            //Store in the temp place (nbr_k->t_f) instead of adding to atoms[k].f directly
+            rvec_Add (nbr_k->t_f, t_f);
+        }
+
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dbo, bo_ij->dBOp );
+        /*1st, dBO*/
+        rvec_ScaledAdd( atoms[i].f, coef.C3dbo, workspace->dDeltap_self[i] );
+        /*2nd, dBO*/
+
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dDelta, bo_ij->dBOp );
+        /*1st, dBO, CdDelta-weighted*/
+        rvec_ScaledAdd(atoms[i].f, coef.C3dDelta, workspace->dDeltap_self[i]);
+        /*2nd, dBO, CdDelta-weighted*/
+
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi, bo_ij->dln_BOp_pi );
+        /*1st, dBOpi*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi, bo_ij->dBOp );
+        /*2nd, dBOpi*/
+        rvec_ScaledAdd(atoms[i].f, coef.C4dbopi, workspace->dDeltap_self[i]);
+        /*3rd, dBOpi*/
+
+        rvec_ScaledAdd( atoms[i].f, -coef.C1dbopi2, bo_ij->dln_BOp_pi2 );
+        /*1st, dBOpi2*/
+        rvec_ScaledAdd( atoms[i].f, -coef.C2dbopi2, bo_ij->dBOp );
+        /*2nd, dBOpi2*/
+        rvec_ScaledAdd(atoms[i].f, coef.C4dbopi2, workspace->dDeltap_self[i]);
+        /*3rd, dBOpi2*/
+    }
 }
 
 HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list *bonds)
 {
-	int pk;
-	bond_data *nbr_k, *nbr_k_sym;
-
-	/*
-	   for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-	   nbr_k = &(bonds->select.bond_list[pk]);
-	   rvec_Add (atoms[i].f, nbr_k->t_f);
-	   }
-	 */
-
-	for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
-		nbr_k = &(bonds->select.bond_list[pk]);
-		nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] );
-
-		rvec_Add (atoms[i].f, nbr_k_sym->t_f);
-	}
+    int pk; /* index into the bond list, over atom i's range */
+    bond_data *nbr_k, *nbr_k_sym; /* entry pk of atom i, and the entry at that entry's sym_index */
+
+    /* disabled variant: would add t_f from atom i's own entries rather than from the sym_index entries
+       for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) {
+       nbr_k = &(bonds->select.bond_list[pk]);
+       rvec_Add (atoms[i].f, nbr_k->t_f);
+       }
+     */
+
+    for( pk = Start_Index(i, bonds); pk < End_Index(i, bonds); ++pk ) { /* fold the staged t_f vectors into atoms[i].f */
+        nbr_k = &(bonds->select.bond_list[pk]);
+        nbr_k_sym = &( bonds->select.bond_list [nbr_k->sym_index] );
+
+        rvec_Add (atoms[i].f, nbr_k_sym->t_f); /* t_f is read only; it is not reset here */
+    }
 }
 
 /* Locate j on i's list.
@@ -915,52 +915,52 @@ HOST_DEVICE void Cuda_dbond_to_Forces_postprocess (int i, reax_atom *atoms, list
    And this is the case given our method of neighbor generation*/
 int Locate_Symmetric_Bond( list *bonds, int i, int j )
 {
-	int start = Start_Index(i, bonds);
-	int end = End_Index(i, bonds);
-	int mid = (start + end) / 2;
-	int mid_nbr;
-
-	while( (mid_nbr = bonds->select.bond_list[mid].nbr) != j ) {
-		/*fprintf( stderr, "\tstart: %d   end: %d   mid: %d\n", 
-		  start, end, mid );*/
-		if( mid_nbr < j )
-			start = mid+1;
-		else end = mid - 1;
-
-		mid = (start + end) / 2;
-	}
-
-	return mid;
+    int start = Start_Index(i, bonds); /* lower bound of the search window in atom i's bond range */
+    int end = End_Index(i, bonds); /* upper bound of the search window */
+    int mid = (start + end) / 2; /* probe position, halfway between the bounds */
+    int mid_nbr; /* nbr stored at the probe position */
+
+    while( (mid_nbr = bonds->select.bond_list[mid].nbr) != j ) { /* NOTE(review): the only exit is nbr == j; there is no start > end check, so j must be present */
+        /*fprintf( stderr, "\tstart: %d   end: %d   mid: %d\n", 
+          start, end, mid );*/
+        if( mid_nbr < j ) /* halving by comparison: relies on nbr values increasing with index inside the window */
+            start = mid+1;
+        else end = mid - 1;
+
+        mid = (start + end) / 2;
+    }
+
+    return mid; /* bond-list index whose nbr equals j */
 }
 
 
 inline void Copy_Neighbor_Data( bond_data *dest, near_neighbor_data *src )
 {
-	dest->nbr = src->nbr;
-	dest->d = src->d;
-	rvec_Copy( dest->dvec, src->dvec );
-	ivec_Copy( dest->rel_box, src->rel_box );
-	/* rvec_Copy( dest->ext_factor, src->ext_factor );*/
+    dest->nbr = src->nbr; /* fills the nbr, d, dvec and rel_box fields of a bond entry from a near-neighbor record */
+    dest->d = src->d;
+    rvec_Copy( dest->dvec, src->dvec );
+    ivec_Copy( dest->rel_box, src->rel_box );
+    /* rvec_Copy( dest->ext_factor, src->ext_factor );*/
 }
 
 
 inline void Copy_Bond_Order_Data( bond_order_data *dest, bond_order_data *src )
 {
-	dest->BO = src->BO;
-	dest->BO_s = src->BO_s;
-	dest->BO_pi = src->BO_pi;
-	dest->BO_pi2 = src->BO_pi2;
-
-	rvec_Scale( dest->dBOp, -1.0, src->dBOp );
-	rvec_Scale( dest->dln_BOp_s, -1.0, src->dln_BOp_s );
-	rvec_Scale( dest->dln_BOp_pi, -1.0, src->dln_BOp_pi );
-	rvec_Scale( dest->dln_BOp_pi2, -1.0, src->dln_BOp_pi2 );
+    dest->BO = src->BO; /* the four bond-order values are copied unchanged */
+    dest->BO_s = src->BO_s;
+    dest->BO_pi = src->BO_pi;
+    dest->BO_pi2 = src->BO_pi2;
+
+    rvec_Scale( dest->dBOp, -1.0, src->dBOp ); /* derivative vectors are passed through rvec_Scale with factor -1.0; presumably dest = -src, confirm against rvec_Scale */
+    rvec_Scale( dest->dln_BOp_s, -1.0, src->dln_BOp_s );
+    rvec_Scale( dest->dln_BOp_pi, -1.0, src->dln_BOp_pi );
+    rvec_Scale( dest->dln_BOp_pi2, -1.0, src->dln_BOp_pi2 ); /* no other bo_data field (e.g. the C?dbo coefficients) is written here */
 }
 
 
 int compare_bonds( const void *p1, const void *p2 )
 {
-	return ((bond_data *)p1)->nbr - ((bond_data *)p2)->nbr;
+    return ((bond_data *)p1)->nbr - ((bond_data *)p2)->nbr; /* <0, 0 or >0 as p1's nbr is below, equal to or above p2's; presumably a qsort-style comparator, confirm at call site */
 }
 
 
@@ -968,257 +968,257 @@ int compare_bonds( const void *p1, const void *p2 )
    belonging to a different atom in nbrhoods->nbr_list is sorted in its own.
    This can either be done in the general coordinator function or here */
 void Calculate_Bond_Orders( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, j, pj, type_i, type_j;
-	int start_i, end_i;
-	int num_bonds, sym_index;
-	real p_boc1, p_boc2;
-	real val_i, Deltap_i, Deltap_boc_i;
-	real val_j, Deltap_j, Deltap_boc_j;
-	real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
-	real exp_p1i,	exp_p2i, exp_p1j, exp_p2j;
-	real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-	real Cf45_ij, Cf45_ji, p_lp1;
-	real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-	real explp1;
-	two_body_parameters *twbp;
-	bond_order_data *bo_ij, *bo_ji;
-	single_body_parameters *sbp_i, *sbp_j;
-	list *bonds = (*lists) + BONDS;
+    int i, j, pj, type_i, type_j;
+    int start_i, end_i;
+    int num_bonds, sym_index;
+    real p_boc1, p_boc2;
+    real val_i, Deltap_i, Deltap_boc_i;
+    real val_j, Deltap_j, Deltap_boc_j;
+    real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+    real exp_p1i,    exp_p2i, exp_p1j, exp_p2j;
+    real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+    real Cf45_ij, Cf45_ji, p_lp1;
+    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+    real explp1;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij, *bo_ji;
+    single_body_parameters *sbp_i, *sbp_j;
+    list *bonds = (*lists) + BONDS;
 #if defined(TEST_FORCES)
-	int  k, pk, start_j, end_j;
-	int  top_dbo=0, top_dDelta=0;
-	dbond_data *pdbo;
-	dDelta_data *ptop_dDelta;
-	list *dDeltas = (*lists) + DDELTA;
-	list *dBOs = (*lists) + DBO;
+    int  k, pk, start_j, end_j;
+    int  top_dbo=0, top_dDelta=0;
+    dbond_data *pdbo;
+    dDelta_data *ptop_dDelta;
+    list *dDeltas = (*lists) + DDELTA;
+    list *dBOs = (*lists) + DBO;
 #endif
 
-	num_bonds = 0;
-	p_boc1 = system->reaxprm.gp.l[0];
-	p_boc2 = system->reaxprm.gp.l[1];
-
-	/* Calculate Deltaprime, Deltaprime_boc values */
-	for( i = 0; i < system->N; ++i ) {
-		type_i = system->atoms[i].type;
-		sbp_i = &(system->reaxprm.sbp[type_i]);
-		workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
-		workspace->Deltap_boc[i] = 
-			workspace->total_bond_order[i] - sbp_i->valency_val;
-		workspace->total_bond_order[i] = 0;
-	}
-	// fprintf( stderr, "done with uncorrected bond orders\n" );
-
-
-	/* Corrected Bond Order calculations */
-	for( i = 0; i < system->N; ++i ) {
-		type_i = system->atoms[i].type;
-		sbp_i = &(system->reaxprm.sbp[type_i]);
-		val_i = sbp_i->valency;
-		Deltap_i = workspace->Deltap[i];
-		Deltap_boc_i = workspace->Deltap_boc[i];
-		start_i = Start_Index(i, bonds);
-		end_i = End_Index(i, bonds);
-		//fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
-		//       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
-
-		for( pj = start_i; pj < end_i; ++pj ) {
-			j = bonds->select.bond_list[pj].nbr;
-			type_j = system->atoms[j].type;
-			bo_ij = &( bonds->select.bond_list[pj].bo_data );
-			//fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
-
-			if( i < j ) {
-				twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] );	      
+    num_bonds = 0;
+    p_boc1 = system->reaxprm.gp.l[0];
+    p_boc2 = system->reaxprm.gp.l[1];
+
+    /* Calculate Deltaprime, Deltaprime_boc values */
+    for( i = 0; i < system->N; ++i ) {
+        type_i = system->atoms[i].type;
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        workspace->Deltap[i] = workspace->total_bond_order[i] - sbp_i->valency;
+        workspace->Deltap_boc[i] = 
+            workspace->total_bond_order[i] - sbp_i->valency_val;
+        workspace->total_bond_order[i] = 0;
+    }
+    // fprintf( stderr, "done with uncorrected bond orders\n" );
+
+
+    /* Corrected Bond Order calculations */
+    for( i = 0; i < system->N; ++i ) {
+        type_i = system->atoms[i].type;
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        val_i = sbp_i->valency;
+        Deltap_i = workspace->Deltap[i];
+        Deltap_boc_i = workspace->Deltap_boc[i];
+        start_i = Start_Index(i, bonds);
+        end_i = End_Index(i, bonds);
+        //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
+        //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
+
+        for( pj = start_i; pj < end_i; ++pj ) {
+            j = bonds->select.bond_list[pj].nbr;
+            type_j = system->atoms[j].type;
+            bo_ij = &( bonds->select.bond_list[pj].bo_data );
+            //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
+
+            if( i < j ) {
+                twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] );          
 #ifdef TEST_FORCES
-				Set_Start_Index( pj, top_dbo, dBOs );
-				/* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
-				   workspace->reverse_map[i], workspace->reverse_map[j], 
-				   twbp->ovc, twbp->v13cor, bo_ij->BO ); */
+                Set_Start_Index( pj, top_dbo, dBOs );
+                /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
+                   workspace->reverse_map[i], workspace->reverse_map[j], 
+                   twbp->ovc, twbp->v13cor, bo_ij->BO ); */
 #endif
-				if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
-					/* There is no correction to bond orders nor to derivatives of 
-					   bond order prime! So we leave bond orders unchanged and 
-					   set derivative of bond order coefficients s.t. 
-					   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
-					bo_ij->C1dbo = 1.000000;
-					bo_ij->C2dbo = 0.000000;
-					bo_ij->C3dbo = 0.000000; 
-
-					bo_ij->C1dbopi = bo_ij->BO_pi;
-					bo_ij->C2dbopi = 0.000000;
-					bo_ij->C3dbopi = 0.000000;
-					bo_ij->C4dbopi = 0.000000;
-
-					bo_ij->C1dbopi2 = bo_ij->BO_pi2; 
-					bo_ij->C2dbopi2 = 0.000000;
-					bo_ij->C3dbopi2 = 0.000000;
-					bo_ij->C4dbopi2 = 0.000000;
+                if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
+                    /* There is no correction to bond orders nor to derivatives of 
+                       bond order prime! So we leave bond orders unchanged and 
+                       set derivative of bond order coefficients s.t. 
+                       dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
+                    bo_ij->C1dbo = 1.000000;
+                    bo_ij->C2dbo = 0.000000;
+                    bo_ij->C3dbo = 0.000000; 
+
+                    bo_ij->C1dbopi = bo_ij->BO_pi;
+                    bo_ij->C2dbopi = 0.000000;
+                    bo_ij->C3dbopi = 0.000000;
+                    bo_ij->C4dbopi = 0.000000;
+
+                    bo_ij->C1dbopi2 = bo_ij->BO_pi2; 
+                    bo_ij->C2dbopi2 = 0.000000;
+                    bo_ij->C3dbopi2 = 0.000000;
+                    bo_ij->C4dbopi2 = 0.000000;
 
 #ifdef TEST_FORCES
-					pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
-
-					// compute dBO_ij/dr_i
-					pdbo->wrt = i;
-					rvec_Copy( pdbo->dBO, bo_ij->dBOp );
-					rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-					rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
-
-					// compute dBO_ij/dr_j
-					pdbo++;
-					pdbo->wrt = j;
-					rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
-					rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
-					rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
-
-					top_dbo += 2;
+                    pdbo = &(dBOs->select.dbo_list[ top_dbo ]);
+
+                    // compute dBO_ij/dr_i
+                    pdbo->wrt = i;
+                    rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                    rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                    rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
+
+                    // compute dBO_ij/dr_j
+                    pdbo++;
+                    pdbo->wrt = j;
+                    rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
+                    rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
+                    rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
+
+                    top_dbo += 2;
 #endif
-				}
-				else {
-					val_j = system->reaxprm.sbp[type_j].valency;
-					Deltap_j = workspace->Deltap[j];
-					Deltap_boc_j = workspace->Deltap_boc[j];
-
-					/* on page 1 */
-					if( twbp->ovc >= 0.001 ) {
-						/* Correction for overcoordination */		
-						exp_p1i = EXP( -p_boc1 * Deltap_i );
-						exp_p2i = EXP( -p_boc2 * Deltap_i );
-						exp_p1j = EXP( -p_boc1 * Deltap_j );
-						exp_p2j = EXP( -p_boc2 * Deltap_j );
-
-						f2 = exp_p1i + exp_p1j;			
-						f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
-						f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + 
-								( val_j + f2 )/( val_j + f2 + f3 ) );
-
-						/*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
-						  i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
-						  fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n", 
-						  f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
-
-						/* Now come the derivates */		
-						/* Bond Order pages 5-7, derivative of f1 */
-						temp = f2 + f3;
-						u1_ij = val_i + temp;
-						u1_ji = val_j + temp;
-						Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
-						Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + 
-								( u1_ji - f3 ) / SQR( u1_ji ));
-
-						//Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
-						//          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
-						Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - 
-								((val_i+f2) / SQR(u1_ij)) * 
-								( -p_boc1 * exp_p1i + 
-								  exp_p2i / ( exp_p2i + exp_p2j ) ) + 
-								-p_boc1 * exp_p1i / u1_ji - 
-								((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +  
-								exp_p2i / ( exp_p2i + exp_p2j ) ));
-
-						Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + 
-							Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); 
-						//fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
-					}
-					else {
-						/* No overcoordination correction! */
-						f1 = 1.0;
-						Cf1_ij = Cf1_ji = 0.0;		  
-					}
-
-					if( twbp->v13cor >= 0.001 ) {
-						/* Correction for 1-3 bond orders */
-						exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
-									Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
-						exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
-									Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
-
-						f4 = 1. / (1. + exp_f4);
-						f5 = 1. / (1. + exp_f5);
-						f4f5 = f4 * f5;
-
-						/* Bond Order pages 8-9, derivative of f4 and f5 */
-						/*temp = twbp->p_boc5 - 
-						  twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-						  u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-						  u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-						  Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-						  Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
-						Cf45_ij = -f4 * exp_f4;
-						Cf45_ji = -f5 * exp_f5;
-					}
-					else {
-						f4 = f5 = f4f5 = 1.0;
-						Cf45_ij = Cf45_ji = 0.0;
-					}
-
-					/* Bond Order page 10, derivative of total bond order */
-					A0_ij = f1 * f4f5;
-					A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * 
-						(Cf45_ij + Cf45_ji);
-					A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
-					A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
-					A3_ij = A2_ij + Cf1_ij / f1;
-					A3_ji = A2_ji + Cf1_ji / f1;
-
-					/*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f 
+                }
+                else {
+                    val_j = system->reaxprm.sbp[type_j].valency;
+                    Deltap_j = workspace->Deltap[j];
+                    Deltap_boc_j = workspace->Deltap_boc[j];
+
+                    /* on page 1 */
+                    if( twbp->ovc >= 0.001 ) {
+                        /* Correction for overcoordination */        
+                        exp_p1i = EXP( -p_boc1 * Deltap_i );
+                        exp_p2i = EXP( -p_boc2 * Deltap_i );
+                        exp_p1j = EXP( -p_boc1 * Deltap_j );
+                        exp_p2j = EXP( -p_boc2 * Deltap_j );
+
+                        f2 = exp_p1i + exp_p1j;            
+                        f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+                        f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + 
+                                ( val_j + f2 )/( val_j + f2 + f3 ) );
+
+                        /*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
+                          i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
+                          fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n", 
+                          f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
+
+                        /* Now come the derivates */        
+                        /* Bond Order pages 5-7, derivative of f1 */
+                        temp = f2 + f3;
+                        u1_ij = val_i + temp;
+                        u1_ji = val_j + temp;
+                        Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
+                        Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + 
+                                ( u1_ji - f3 ) / SQR( u1_ji ));
+
+                        //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
+                        //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                        Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - 
+                                ((val_i+f2) / SQR(u1_ij)) * 
+                                ( -p_boc1 * exp_p1i + 
+                                  exp_p2i / ( exp_p2i + exp_p2j ) ) + 
+                                -p_boc1 * exp_p1i / u1_ji - 
+                                ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +  
+                                exp_p2i / ( exp_p2i + exp_p2j ) ));
+
+                        Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + 
+                            Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); 
+                        //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
+                    }
+                    else {
+                        /* No overcoordination correction! */
+                        f1 = 1.0;
+                        Cf1_ij = Cf1_ji = 0.0;          
+                    }
+
+                    if( twbp->v13cor >= 0.001 ) {
+                        /* Correction for 1-3 bond orders */
+                        exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
+                                    Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                        exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
+                                    Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+
+                        f4 = 1. / (1. + exp_f4);
+                        f5 = 1. / (1. + exp_f5);
+                        f4f5 = f4 * f5;
+
+                        /* Bond Order pages 8-9, derivative of f4 and f5 */
+                        /*temp = twbp->p_boc5 - 
+                          twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
+                          u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
+                          u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
+                          Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
+                          Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
+                        Cf45_ij = -f4 * exp_f4;
+                        Cf45_ji = -f5 * exp_f5;
+                    }
+                    else {
+                        f4 = f5 = f4f5 = 1.0;
+                        Cf45_ij = Cf45_ji = 0.0;
+                    }
+
+                    /* Bond Order page 10, derivative of total bond order */
+                    A0_ij = f1 * f4f5;
+                    A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * 
+                        (Cf45_ij + Cf45_ji);
+                    A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                    A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                    A3_ij = A2_ij + Cf1_ij / f1;
+                    A3_ji = A2_ji + Cf1_ji / f1;
+
+                    /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f 
 A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
 bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
 
-					/* find corrected bond order values and their deriv coefs */
-					bo_ij->BO    = bo_ij->BO    * A0_ij;
-					bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
-					bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
-					bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+                    /* find corrected bond order values and their deriv coefs */
+                    bo_ij->BO    = bo_ij->BO    * A0_ij;
+                    bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
+                    bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
+                    bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
 
-					bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
-					bo_ij->C2dbo = bo_ij->BO * A2_ij;
-					bo_ij->C3dbo = bo_ij->BO * A2_ji; 
+                    bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                    bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                    bo_ij->C3dbo = bo_ij->BO * A2_ji; 
 
-					bo_ij->C1dbopi = f1*f1*f4*f5;
-					bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
-					bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
-					bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+                    bo_ij->C1dbopi = f1*f1*f4*f5;
+                    bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                    bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                    bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
 
-					bo_ij->C1dbopi2 = f1*f1*f4*f5;
-					bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
-					bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
-					bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
+                    bo_ij->C1dbopi2 = f1*f1*f4*f5;
+                    bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                    bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                    bo_ij->C4dbopi2 = bo_ij->BO_pi2 * A3_ji;
 
 #ifdef TEST_FORCES
-					/*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", 
-					  i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
-
-					/* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
-					//"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
-					workspace->orig_id[i], workspace->orig_id[j]
-					A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
-					bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
-					bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, 
-					bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
-					bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
-					); */
-
-					Calculate_dBO( i, pj, workspace, lists, &top_dbo );
+                    /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", 
+                      i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
+
+                    /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                    //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
+                    workspace->orig_id[i], workspace->orig_id[j]
+                    A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
+                    bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
+                    bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, 
+                    bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
+                    bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
+                    ); */
+
+                    Calculate_dBO( i, pj, workspace, lists, &top_dbo );
 #endif
-				}
+                }
 
-				/* neglect bonds that are < 1e-10 */
-				if( bo_ij->BO < 1e-10 )
-					bo_ij->BO = 0.0;
-				if( bo_ij->BO_s < 1e-10 )
-					bo_ij->BO_s = 0.0;
-				if( bo_ij->BO_pi < 1e-10 )
-					bo_ij->BO_pi = 0.0;
-				if( bo_ij->BO_pi2 < 1e-10 )
-					bo_ij->BO_pi2 = 0.0;
+                /* neglect bonds that are < 1e-10 */
+                if( bo_ij->BO < 1e-10 )
+                    bo_ij->BO = 0.0;
+                if( bo_ij->BO_s < 1e-10 )
+                    bo_ij->BO_s = 0.0;
+                if( bo_ij->BO_pi < 1e-10 )
+                    bo_ij->BO_pi = 0.0;
+                if( bo_ij->BO_pi2 < 1e-10 )
+                    bo_ij->BO_pi2 = 0.0;
 
-				workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+                workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
 
 
-				/* fprintf( stderr, "%d %d\t%g %g %g %g\n
+                /* fprintf( stderr, "%d %d\t%g %g %g %g\n
 Cdbo:\t%g %g %g\n
 Cdbopi:\t%g %g %g %g\n
 Cdbopi2:%g %g %g %g\n\n", 
@@ -1229,148 +1229,148 @@ bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi,
 bo_ij->C1dbopi2, bo_ij->C2dbopi2, 
 bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
 
-				/* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
-				   i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
+                /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
+                   i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
 
 #ifdef TEST_FORCES
-				Set_End_Index( pj, top_dbo, dBOs );
-				//Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+                Set_End_Index( pj, top_dbo, dBOs );
+                //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
 #endif
-			}
-			else {
-				/* We only need to update bond orders from bo_ji
-				   everything else is set in uncorrected_bo calculations */
-				sym_index = bonds->select.bond_list[pj].sym_index;
-				bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
-				bo_ij->BO = bo_ji->BO;
-				bo_ij->BO_s = bo_ji->BO_s;
-				bo_ij->BO_pi = bo_ji->BO_pi;
-				bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-				workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+            }
+            else {
+                /* We only need to update bond orders from bo_ji
+                   everything else is set in uncorrected_bo calculations */
+                sym_index = bonds->select.bond_list[pj].sym_index;
+                bo_ji = &(bonds->select.bond_list[ sym_index ].bo_data);
+                bo_ij->BO = bo_ji->BO;
+                bo_ij->BO_s = bo_ji->BO_s;
+                bo_ij->BO_pi = bo_ji->BO_pi;
+                bo_ij->BO_pi2 = bo_ji->BO_pi2;
+
+                workspace->total_bond_order[i] += bo_ij->BO; // now keeps total_BO
 
 #ifdef TEST_FORCES
-				//Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
+                //Add_dBO( system, lists, j, sym_index, 1.0, workspace->dDelta );
 #endif
-			}	  
-		}
+            }      
+        }
 
 #ifdef TEST_FORCES 
-		// fprintf( stderr, "dDelta computations\nj:" );
-		Set_Start_Index( i, top_dDelta, dDeltas );
-		ptop_dDelta = &( dDeltas->select.dDelta_list[top_dDelta] );
-
-		for( pj = start_i; pj < end_i; ++pj ) {
-			j = bonds->select.bond_list[pj].nbr;
-			// fprintf( stderr, "%d  ", j );
-
-			if( !rvec_isZero( workspace->dDelta[j] ) ) {
-				ptop_dDelta->wrt = j;
-				rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[j] );
-				rvec_MakeZero( workspace->dDelta[j] );
-				++top_dDelta, ++ptop_dDelta;
-			}
-
-			start_j = Start_Index(j, bonds);
-			end_j = End_Index(j, bonds);     
-			for( pk = start_j; pk < end_j; ++pk ) {
-				k = bonds->select.bond_list[pk].nbr;    
-				if( !rvec_isZero( workspace->dDelta[k] ) ) {
-					ptop_dDelta->wrt = k;
-					rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[k] );
-					rvec_MakeZero( workspace->dDelta[k] );
-					++top_dDelta, ++ptop_dDelta;
-				}
-			}
-		}
-
-		Set_End_Index( i, top_dDelta, dDeltas );
-
-		/*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj )
-		  fprintf( stdout, "dDel: %d %d [%g %g %g]\n",
-		  i+1, dDeltas->select.dDelta_list[pj].wrt+1,
-		  dDeltas->select.dDelta_list[pj].dVal[0], 
-		  dDeltas->select.dDelta_list[pj].dVal[1], 
-		  dDeltas->select.dDelta_list[pj].dVal[2] );*/
+        // fprintf( stderr, "dDelta computations\nj:" );
+        Set_Start_Index( i, top_dDelta, dDeltas );
+        ptop_dDelta = &( dDeltas->select.dDelta_list[top_dDelta] );
+
+        for( pj = start_i; pj < end_i; ++pj ) {
+            j = bonds->select.bond_list[pj].nbr;
+            // fprintf( stderr, "%d  ", j );
+
+            if( !rvec_isZero( workspace->dDelta[j] ) ) {
+                ptop_dDelta->wrt = j;
+                rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[j] );
+                rvec_MakeZero( workspace->dDelta[j] );
+                ++top_dDelta, ++ptop_dDelta;
+            }
+
+            start_j = Start_Index(j, bonds);
+            end_j = End_Index(j, bonds);     
+            for( pk = start_j; pk < end_j; ++pk ) {
+                k = bonds->select.bond_list[pk].nbr;    
+                if( !rvec_isZero( workspace->dDelta[k] ) ) {
+                    ptop_dDelta->wrt = k;
+                    rvec_Copy( ptop_dDelta->dVal, workspace->dDelta[k] );
+                    rvec_MakeZero( workspace->dDelta[k] );
+                    ++top_dDelta, ++ptop_dDelta;
+                }
+            }
+        }
+
+        Set_End_Index( i, top_dDelta, dDeltas );
+
+        /*for( pj=Start_Index(i,dDeltas); pj<End_Index(i,dDeltas); ++pj )
+          fprintf( stdout, "dDel: %d %d [%g %g %g]\n",
+          i+1, dDeltas->select.dDelta_list[pj].wrt+1,
+          dDeltas->select.dDelta_list[pj].dVal[0], 
+          dDeltas->select.dDelta_list[pj].dVal[1], 
+          dDeltas->select.dDelta_list[pj].dVal[2] );*/
 #endif
-	}
-
-	/*fprintf(stderr,"\tCalculated actual bond orders ...\n" );
-	  fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", 
-	  "atom", "Delta", "Delta_e", "Delta_boc", "nlp", 
-	  "Delta_lp", "Clp", "dDelta_lp" );*/
-
-	p_lp1 = system->reaxprm.gp.l[15];
-	/* Calculate some helper variables that are  used at many places 
-	   throughout force calculations */
-	for( j = 0; j < system->N; ++j ) {
-		type_j = system->atoms[j].type;
-		sbp_j = &(system->reaxprm.sbp[ type_j ]);
-
-		workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
-		workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
-		workspace->Delta_boc[j] = workspace->total_bond_order[j] - 
-			sbp_j->valency_boc;
-
-		workspace->vlpex[j] =  workspace->Delta_e[j] - 
-			2.0 * (int)(workspace->Delta_e[j]/2.0);
-		explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
-		workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
-		workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
-		workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
-		/* Adri uses different dDelta_lp values than the ones in notes... */
-		workspace->dDelta_lp[j] = workspace->Clp[j];
-		//workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-		//((fabs(workspace->Delta_e[j]/2.0 - 
-		//       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-
-		if( sbp_j->mass > 21.0 ) {
-			workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-			workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-			workspace->dDelta_lp_temp[j] = 0.;
-		}
-		else {
-			workspace->nlp_temp[j] = workspace->nlp[j];
-			workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
-			workspace->dDelta_lp_temp[j] = workspace->Clp[j];
-		}
-
-		//fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
-		//j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
-		//workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
-		//workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
-	}
-
-	//Print_Bonds( system, bonds, "sbonds.out" );
+    }
+
+    /*fprintf(stderr,"\tCalculated actual bond orders ...\n" );
+      fprintf(stderr,"%6s%8s%8s%8s%8s%8s%8s%8s\n", 
+      "atom", "Delta", "Delta_e", "Delta_boc", "nlp", 
+      "Delta_lp", "Clp", "dDelta_lp" );*/
+
+    p_lp1 = system->reaxprm.gp.l[15];
+    /* Calculate some helper variables that are  used at many places 
+       throughout force calculations */
+    for( j = 0; j < system->N; ++j ) {
+        type_j = system->atoms[j].type;
+        sbp_j = &(system->reaxprm.sbp[ type_j ]);
+
+        workspace->Delta[j] = workspace->total_bond_order[j] - sbp_j->valency;
+        workspace->Delta_e[j] = workspace->total_bond_order[j] - sbp_j->valency_e;
+        workspace->Delta_boc[j] = workspace->total_bond_order[j] - 
+            sbp_j->valency_boc;
+
+        workspace->vlpex[j] =  workspace->Delta_e[j] - 
+            2.0 * (int)(workspace->Delta_e[j]/2.0);
+        explp1 = EXP(-p_lp1 * SQR(2.0 + workspace->vlpex[j]));
+        workspace->nlp[j] = explp1 - (int)(workspace->Delta_e[j] / 2.0);
+        workspace->Delta_lp[j] = sbp_j->nlp_opt - workspace->nlp[j];
+        workspace->Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace->vlpex[j]);
+        /* Adri uses different dDelta_lp values than the ones in notes... */
+        workspace->dDelta_lp[j] = workspace->Clp[j];
+        //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
+        //((fabs(workspace->Delta_e[j]/2.0 - 
+        //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
+
+        if( sbp_j->mass > 21.0 ) {
+            workspace->nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+            workspace->dDelta_lp_temp[j] = 0.;
+        }
+        else {
+            workspace->nlp_temp[j] = workspace->nlp[j];
+            workspace->Delta_lp_temp[j] = sbp_j->nlp_opt - workspace->nlp_temp[j];
+            workspace->dDelta_lp_temp[j] = workspace->Clp[j];
+        }
+
+        //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
+        //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
+        //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
+        //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
+    }
+
+    //Print_Bonds( system, bonds, "sbonds.out" );
 
 #if defined(DEBUG)
-	fprintf( stderr, "Number of bonds: %d\n", num_bonds );
-	Print_Bond_Orders( system, control, data, workspace, lists, out_control );
+    fprintf( stderr, "Number of bonds: %d\n", num_bonds );
+    Print_Bond_Orders( system, control, data, workspace, lists, out_control );
 #endif
 }
 
 
 //Cuda Functions
 GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
-		static_storage workspace, int num_atom_types, int N )
+        static_storage workspace, int num_atom_types, int N )  // NOTE(review): num_atom_types is not read in this kernel body
 {
-	int i, type_i;
-	real p_boc1, p_boc2;
-	single_body_parameters *sbp_i;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	p_boc1 = g_params.l[0];
-	p_boc2 = g_params.l[1];
-
-	/* Calculate Deltaprime, Deltaprime_boc values */
-	type_i = atoms[i].type;
-	sbp_i = &(sbp[type_i]);
-	workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
-	workspace.Deltap_boc[i] = 
-		workspace.total_bond_order[i] - sbp_i->valency_val;
-	workspace.total_bond_order[i] = 0;
+    int i, type_i;
+    real p_boc1, p_boc2;
+    single_body_parameters *sbp_i;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;  // flat thread index along x; used as the atom index below
+    if (i >= N) return;  // threads whose index is >= N exit without touching any array
+
+    p_boc1 = g_params.l[0];  // NOTE(review): p_boc1 and p_boc2 are assigned here but never read in this kernel
+    p_boc2 = g_params.l[1];
+    /* Calculate Deltaprime (Deltap) and Deltaprime_boc (Deltap_boc) for atom i:
+       total_bond_order[i] minus the valency / valency_val of the atom's type; then zero the total */
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[type_i]);  // single-body parameter record selected by the atom's type
+    workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
+    workspace.Deltap_boc[i] = 
+        workspace.total_bond_order[i] - sbp_i->valency_val;
+    workspace.total_bond_order[i] = 0;  // cleared only after both values above were derived from it
 }
 
 
@@ -1379,267 +1379,267 @@ GLOBAL void Cuda_Calculate_Bond_Orders_Init (  reax_atom *atoms, global_paramete
    This can either be done in the general coordinator function or here */
 
 GLOBAL void Cuda_Calculate_Bond_Orders (  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
-		two_body_parameters *tbp, static_storage workspace, list bonds,
-		list dDeltas, list dBOs, int num_atom_types, int N )
+        two_body_parameters *tbp, static_storage workspace, list bonds,
+        list dDeltas, list dBOs, int num_atom_types, int N )
 {
-	int i, j, pj, type_i, type_j;
-	int start_i, end_i;
-	int num_bonds, sym_index;
-	real p_boc1, p_boc2;
-	real val_i, Deltap_i, Deltap_boc_i;
-	real val_j, Deltap_j, Deltap_boc_j;
-	real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
-	real exp_p1i,	exp_p2i, exp_p1j, exp_p2j;
-	real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
-	real Cf45_ij, Cf45_ji, p_lp1;
-	real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
-	real explp1;
-	two_body_parameters *twbp;
-	bond_order_data *bo_ij, *bo_ji;
-	single_body_parameters *sbp_i, *sbp_j;
+    int i, j, pj, type_i, type_j;
+    int start_i, end_i;
+    int num_bonds, sym_index;
+    real p_boc1, p_boc2;
+    real val_i, Deltap_i, Deltap_boc_i;
+    real val_j, Deltap_j, Deltap_boc_j;
+    real temp, f1, f2, f3, f4, f5, f4f5, exp_f4, exp_f5;
+    real exp_p1i,    exp_p2i, exp_p1j, exp_p2j;
+    real u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
+    real Cf45_ij, Cf45_ji, p_lp1;
+    real A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji;
+    real explp1;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij, *bo_ji;
+    single_body_parameters *sbp_i, *sbp_j;
 
 
 #if defined(TEST_FORCES)
-	int  k, pk, start_j, end_j;
-	int  top_dbo=0, top_dDelta=0;
-	dbond_data *pdbo;
-	dDelta_data *ptop_dDelta;
+    int  k, pk, start_j, end_j;
+    int  top_dbo=0, top_dDelta=0;
+    dbond_data *pdbo;
+    dDelta_data *ptop_dDelta;
 
 #endif
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	num_bonds = 0;
-	p_boc1 = g_params.l[0];
-	p_boc2 = g_params.l[1];
-
-	/* Calculate Deltaprime, Deltaprime_boc values */
-	//for( i = 0; i < system->N; ++i ) {
-	/*
-	   if (i < N) {
-	   type_i = atoms[i].type;
-	   sbp_i = &(sbp[type_i]);
-	   workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
-	   workspace.Deltap_boc[i] = 
-	   workspace.total_bond_order[i] - sbp_i->valency_val;
-	   workspace.total_bond_order[i] = 0;
-
-	   }
-
-	   __syncthreads ();
-	 */
-
-
-	// fprintf( stderr, "done with uncorrected bond orders\n" );
-
-
-	/* Corrected Bond Order calculations */
-	//for( i = 0; i < system->N; ++i ) {
-	type_i = atoms[i].type;
-	sbp_i = &(sbp[type_i]);
-	val_i = sbp_i->valency;
-	Deltap_i = workspace.Deltap[i];
-	Deltap_boc_i = workspace.Deltap_boc[i];
-	start_i = Start_Index(i, &bonds);
-	end_i = End_Index(i, &bonds);
-	//fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
-	//       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
-
-	for( pj = start_i; pj < end_i; ++pj ) {
-		j = bonds.select.bond_list[pj].nbr;
-		type_j = atoms[j].type;
-		bo_ij = &( bonds.select.bond_list[pj].bo_data );
-		//fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
-
-		if( i < j ) {
-			twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );	      
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    num_bonds = 0;
+    p_boc1 = g_params.l[0];
+    p_boc2 = g_params.l[1];
+
+    /* Calculate Deltaprime, Deltaprime_boc values */
+    //for( i = 0; i < system->N; ++i ) {
+    /*
+       if (i < N) {
+       type_i = atoms[i].type;
+       sbp_i = &(sbp[type_i]);
+       workspace.Deltap[i] = workspace.total_bond_order[i] - sbp_i->valency;
+       workspace.Deltap_boc[i] = 
+       workspace.total_bond_order[i] - sbp_i->valency_val;
+       workspace.total_bond_order[i] = 0;
+
+       }
+
+       __syncthreads ();
+     */
+
+
+    // fprintf( stderr, "done with uncorrected bond orders\n" );
+
+
+    /* Corrected Bond Order calculations */
+    //for( i = 0; i < system->N; ++i ) {
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[type_i]);
+    val_i = sbp_i->valency;
+    Deltap_i = workspace.Deltap[i];
+    Deltap_boc_i = workspace.Deltap_boc[i];
+    start_i = Start_Index(i, &bonds);
+    end_i = End_Index(i, &bonds);
+    //fprintf( stderr, "i:%d Dp:%g Dbocp:%g s:%d e:%d\n",
+    //       i+1, Deltap_i, Deltap_boc_i, start_i, end_i );
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+        j = bonds.select.bond_list[pj].nbr;
+        type_j = atoms[j].type;
+        bo_ij = &( bonds.select.bond_list[pj].bo_data );
+        //fprintf( stderr, "\tj:%d - ubo: %8.3f\n", j+1, bo_ij->BO );
+
+        if( i < j ) {
+            twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );          
 #ifdef TEST_FORCES
-			Set_Start_Index( pj, top_dbo, &dBOs );
-			/* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
-			   workspace->reverse_map[i], workspace->reverse_map[j], 
-			   twbp->ovc, twbp->v13cor, bo_ij->BO ); */
+            Set_Start_Index( pj, top_dbo, &dBOs );
+            /* fprintf( stderr, "%6d%6d%23.15e%23.15e%23.15e\n", 
+               workspace->reverse_map[i], workspace->reverse_map[j], 
+               twbp->ovc, twbp->v13cor, bo_ij->BO ); */
 #endif
-			if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
-				/* There is no correction to bond orders nor to derivatives of 
-				   bond order prime! So we leave bond orders unchanged and 
-				   set derivative of bond order coefficients s.t. 
-				   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
-				bo_ij->C1dbo = 1.000000;
-				bo_ij->C2dbo = 0.000000;
-				bo_ij->C3dbo = 0.000000; 
-
-				bo_ij->C1dbopi = bo_ij->BO_pi;
-				bo_ij->C2dbopi = 0.000000;
-				bo_ij->C3dbopi = 0.000000;
-				bo_ij->C4dbopi = 0.000000;
-
-				bo_ij->C1dbopi2 = bo_ij->BO_pi2; 
-				bo_ij->C2dbopi2 = 0.000000;
-				bo_ij->C3dbopi2 = 0.000000;
-				bo_ij->C4dbopi2 = 0.000000;
+            if( twbp->ovc < 0.001 && twbp->v13cor < 0.001 ) {
+                /* There is no correction to bond orders nor to derivatives of 
+                   bond order prime! So we leave bond orders unchanged and 
+                   set derivative of bond order coefficients s.t. 
+                   dBO = dBOp & dBOxx = dBOxxp in Add_dBO_to_Forces */
+                bo_ij->C1dbo = 1.000000;
+                bo_ij->C2dbo = 0.000000;
+                bo_ij->C3dbo = 0.000000; 
+
+                bo_ij->C1dbopi = bo_ij->BO_pi;
+                bo_ij->C2dbopi = 0.000000;
+                bo_ij->C3dbopi = 0.000000;
+                bo_ij->C4dbopi = 0.000000;
+
+                bo_ij->C1dbopi2 = bo_ij->BO_pi2; 
+                bo_ij->C2dbopi2 = 0.000000;
+                bo_ij->C3dbopi2 = 0.000000;
+                bo_ij->C4dbopi2 = 0.000000;
 
 #ifdef TEST_FORCES
-				pdbo = &(dBOs.select.dbo_list[ top_dbo ]);
-
-				// compute dBO_ij/dr_i
-				pdbo->wrt = i;
-				rvec_Copy( pdbo->dBO, bo_ij->dBOp );
-				rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
-				rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
-
-				// compute dBO_ij/dr_j
-				pdbo++;
-				pdbo->wrt = j;
-				rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
-				rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
-				rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
-
-				top_dbo += 2;
+                pdbo = &(dBOs.select.dbo_list[ top_dbo ]);
+
+                // compute dBO_ij/dr_i
+                pdbo->wrt = i;
+                rvec_Copy( pdbo->dBO, bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi, bo_ij->BO_pi, bo_ij->dln_BOp_pi );
+                rvec_Scale( pdbo->dBOpi2, bo_ij->BO_pi2, bo_ij->dln_BOp_pi2 );
+
+                // compute dBO_ij/dr_j
+                pdbo++;
+                pdbo->wrt = j;
+                rvec_Scale( pdbo->dBO,-1.0,bo_ij->dBOp );
+                rvec_Scale( pdbo->dBOpi,-bo_ij->BO_pi,bo_ij->dln_BOp_pi );
+                rvec_Scale( pdbo->dBOpi2,-bo_ij->BO_pi2,bo_ij->dln_BOp_pi2 );
+
+                top_dbo += 2;
 #endif
-			}
-			else {
-				val_j = sbp[type_j].valency;
-				Deltap_j = workspace.Deltap[j];
-				Deltap_boc_j = workspace.Deltap_boc[j];
-
-				/* on page 1 */
-				if( twbp->ovc >= 0.001 ) {
-					/* Correction for overcoordination */		
-					exp_p1i = EXP( -p_boc1 * Deltap_i );
-					exp_p2i = EXP( -p_boc2 * Deltap_i );
-					exp_p1j = EXP( -p_boc1 * Deltap_j );
-					exp_p2j = EXP( -p_boc2 * Deltap_j );
-
-					f2 = exp_p1i + exp_p1j;			
-					f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
-					f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + 
-							( val_j + f2 )/( val_j + f2 + f3 ) );
-
-					/*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
-					  i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
-					  fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n", 
-					  f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
-
-					/* Now come the derivates */		
-					/* Bond Order pages 5-7, derivative of f1 */
-					temp = f2 + f3;
-					u1_ij = val_i + temp;
-					u1_ji = val_j + temp;
-					Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
-					Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + 
-							( u1_ji - f3 ) / SQR( u1_ji ));
-
-					//Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
-					//          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
-					Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - 
-							((val_i+f2) / SQR(u1_ij)) * 
-							( -p_boc1 * exp_p1i + 
-							  exp_p2i / ( exp_p2i + exp_p2j ) ) + 
-							-p_boc1 * exp_p1i / u1_ji - 
-							((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +  
-							exp_p2i / ( exp_p2i + exp_p2j ) ));
-
-					Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + 
-						Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); 
-					//fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
-				}
-				else {
-					/* No overcoordination correction! */
-					f1 = 1.0;
-					Cf1_ij = Cf1_ji = 0.0;		  
-				}
-
-				if( twbp->v13cor >= 0.001 ) {
-					/* Correction for 1-3 bond orders */
-					exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
-								Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
-					exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
-								Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
-
-					f4 = 1. / (1. + exp_f4);
-					f5 = 1. / (1. + exp_f5);
-					f4f5 = f4 * f5;
-
-					/* Bond Order pages 8-9, derivative of f4 and f5 */
-					/*temp = twbp->p_boc5 - 
-					  twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
-					  u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
-					  u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
-					  Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
-					  Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
-					Cf45_ij = -f4 * exp_f4;
-					Cf45_ji = -f5 * exp_f5;
-				}
-				else {
-					f4 = f5 = f4f5 = 1.0;
-					Cf45_ij = Cf45_ji = 0.0;
-				}
-
-				/* Bond Order page 10, derivative of total bond order */
-				A0_ij = f1 * f4f5;
-				A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * 
-					(Cf45_ij + Cf45_ji);
-				A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
-				A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
-				A3_ij = A2_ij + Cf1_ij / f1;
-				A3_ji = A2_ji + Cf1_ji / f1;
-
-				/*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f 
+            }
+            else {
+                val_j = sbp[type_j].valency;
+                Deltap_j = workspace.Deltap[j];
+                Deltap_boc_j = workspace.Deltap_boc[j];
+
+                /* on page 1 */
+                if( twbp->ovc >= 0.001 ) {
+                    /* Correction for overcoordination */        
+                    exp_p1i = EXP( -p_boc1 * Deltap_i );
+                    exp_p2i = EXP( -p_boc2 * Deltap_i );
+                    exp_p1j = EXP( -p_boc1 * Deltap_j );
+                    exp_p2j = EXP( -p_boc2 * Deltap_j );
+
+                    f2 = exp_p1i + exp_p1j;            
+                    f3 = -1.0 / p_boc2 * log( 0.5 * ( exp_p2i  + exp_p2j ) );
+                    f1 = 0.5 * ( ( val_i + f2 )/( val_i + f2 + f3 ) + 
+                            ( val_j + f2 )/( val_j + f2 + f3 ) );
+
+                    /*fprintf( stderr,"%6d%6d\t%g %g   j:%g %g  p_boc:%g %g\n",
+                      i+1, j+1, val_i, Deltap_i, val_j, Deltap_j, p_boc1, p_boc2 );
+                      fprintf( stderr,"\tf:%g  %g  %g, exp:%g %g %g %g\n", 
+                      f1, f2, f3, exp_p1i, exp_p2i, exp_p1j, exp_p2j );*/
+
+                    /* Now come the derivates */        
+                    /* Bond Order pages 5-7, derivative of f1 */
+                    temp = f2 + f3;
+                    u1_ij = val_i + temp;
+                    u1_ji = val_j + temp;
+                    Cf1A_ij = 0.5 * f3 * (1.0 / SQR( u1_ij ) + 1.0 / SQR( u1_ji ));
+                    Cf1B_ij = -0.5 * (( u1_ij - f3 ) / SQR( u1_ij ) + 
+                            ( u1_ji - f3 ) / SQR( u1_ji ));
+
+                    //Cf1_ij = -Cf1A_ij * p_boc1 * exp_p1i + 
+                    //          Cf1B_ij * exp_p2i / ( exp_p2i + exp_p2j );
+                    Cf1_ij = 0.50 * ( -p_boc1 * exp_p1i / u1_ij - 
+                            ((val_i+f2) / SQR(u1_ij)) * 
+                            ( -p_boc1 * exp_p1i + 
+                              exp_p2i / ( exp_p2i + exp_p2j ) ) + 
+                            -p_boc1 * exp_p1i / u1_ji - 
+                            ((val_j+f2)/SQR(u1_ji)) * ( -p_boc1*exp_p1i +  
+                            exp_p2i / ( exp_p2i + exp_p2j ) ));
+
+                    Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + 
+                        Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j ); 
+                    //fprintf( stderr, "\tCf1:%g  %g\n", Cf1_ij, Cf1_ji );
+                }
+                else {
+                    /* No overcoordination correction! */
+                    f1 = 1.0;
+                    Cf1_ij = Cf1_ji = 0.0;          
+                }
+
+                if( twbp->v13cor >= 0.001 ) {
+                    /* Correction for 1-3 bond orders */
+                    exp_f4 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
+                                Deltap_boc_i) * twbp->p_boc3 + twbp->p_boc5);
+                    exp_f5 =EXP(-(twbp->p_boc4 * SQR( bo_ij->BO ) - 
+                                Deltap_boc_j) * twbp->p_boc3 + twbp->p_boc5);
+
+                    f4 = 1. / (1. + exp_f4);
+                    f5 = 1. / (1. + exp_f5);
+                    f4f5 = f4 * f5;
+
+                    /* Bond Order pages 8-9, derivative of f4 and f5 */
+                    /*temp = twbp->p_boc5 - 
+                      twbp->p_boc3 * twbp->p_boc4 * SQR( bo_ij->BO );
+                      u_ij = temp + twbp->p_boc3 * Deltap_boc_i;
+                      u_ji = temp + twbp->p_boc3 * Deltap_boc_j;
+                      Cf45_ij = Cf45( u_ij, u_ji ) / f4f5;
+                      Cf45_ji = Cf45( u_ji, u_ij ) / f4f5;*/
+                    Cf45_ij = -f4 * exp_f4;
+                    Cf45_ji = -f5 * exp_f5;
+                }
+                else {
+                    f4 = f5 = f4f5 = 1.0;
+                    Cf45_ij = Cf45_ji = 0.0;
+                }
+
+                /* Bond Order page 10, derivative of total bond order */
+                A0_ij = f1 * f4f5;
+                A1_ij = -2 * twbp->p_boc3 * twbp->p_boc4 * bo_ij->BO * 
+                    (Cf45_ij + Cf45_ji);
+                A2_ij = Cf1_ij / f1 + twbp->p_boc3 * Cf45_ij;
+                A2_ji = Cf1_ji / f1 + twbp->p_boc3 * Cf45_ji;
+                A3_ij = A2_ij + Cf1_ij / f1;
+                A3_ji = A2_ji + Cf1_ji / f1;
+
+                /*fprintf( stderr, "\tBO: %f, A0: %f, A1: %f, A2_ij: %f 
 A2_ji: %f, A3_ij: %f, A3_ji: %f\n",
 bo_ij->BO, A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji );*/
 
-				/* find corrected bond order values and their deriv coefs */
-				bo_ij->BO    = bo_ij->BO    * A0_ij;
-				bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
-				bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
-				bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+                /* find corrected bond order values and their deriv coefs */
+                bo_ij->BO    = bo_ij->BO    * A0_ij;
+                bo_ij->BO_pi = bo_ij->BO_pi * A0_ij *f1;
+                bo_ij->BO_pi2= bo_ij->BO_pi2* A0_ij *f1;
+                bo_ij->BO_s  = bo_ij->BO - ( bo_ij->BO_pi + bo_ij->BO_pi2 );
 
-				bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
-				bo_ij->C2dbo = bo_ij->BO * A2_ij;
-				bo_ij->C3dbo = bo_ij->BO * A2_ji; 
+                bo_ij->C1dbo = A0_ij + bo_ij->BO * A1_ij;
+                bo_ij->C2dbo = bo_ij->BO * A2_ij;
+                bo_ij->C3dbo = bo_ij->BO * A2_ji; 
 
-				bo_ij->C1dbopi = f1*f1*f4*f5;
-				bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
-				bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
-				bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
+                bo_ij->C1dbopi = f1*f1*f4*f5;
+                bo_ij->C2dbopi = bo_ij->BO_pi * A1_ij;
+                bo_ij->C3dbopi = bo_ij->BO_pi * A3_ij;
+                bo_ij->C4dbopi = bo_ij->BO_pi * A3_ji;
 
-				bo_ij->C1dbopi2 = f1*f1*f4*f5;
-				bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
-				bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
+                bo_ij->C1dbopi2 = f1*f1*f4*f5;
+                bo_ij->C2dbopi2 = bo_ij->BO_pi2 * A1_ij;
+                bo_ij->C3dbopi2 = bo_ij->BO_pi2 * A3_ij;
 
 #ifdef TEST_FORCES
-				/*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", 
-				  i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
-
-				/* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
-				//"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
-				workspace->orig_id[i], workspace->orig_id[j]
-				A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
-				bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
-				bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, 
-				bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
-				bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
-				); */
-
-				Calculate_dBO( i, pj, workspace, lists, &top_dbo );
+                /*fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n", 
+                  i+1, j+1, bo_ij->BO, bo_ij->C1dbo, Cf45_ij, Cf45_ji );*/
+
+                /* fprintf( stderr, "%6d%6d%13.6f%13.6f%13.6f%13.6f\n",
+                //"%6d%6d%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n%10.6f%10.6f%10.6f%10.6f\n\n",
+                workspace->orig_id[i], workspace->orig_id[j]
+                A0_ij, A1_ij, A2_ij, A2_ji, A3_ij, A3_ji
+                bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2, bo_ij->BO_s,
+                bo_ij->C1dbo, bo_ij->C2dbo, bo_ij->C3dbo, 
+                bo_ij->C1dbopi,bo_ij->C2dbopi,bo_ij->C3dbopi,bo_ij->C4dbopi,
+                bo_ij->C1dbopi2,bo_ij->C2dbopi2,bo_ij->C3dbopi2,bo_ij->C4dbopi2
+                ); */
+
+                Calculate_dBO( i, pj, workspace, lists, &top_dbo );
 #endif
-			}
+            }
 
-			/* neglect bonds that are < 1e-10 */
-			if( bo_ij->BO < 1e-10 )
-				bo_ij->BO = 0.0;
-			if( bo_ij->BO_s < 1e-10 )
-				bo_ij->BO_s = 0.0;
-			if( bo_ij->BO_pi < 1e-10 )
-				bo_ij->BO_pi = 0.0;
-			if( bo_ij->BO_pi2 < 1e-10 )
-				bo_ij->BO_pi2 = 0.0;
+            /* neglect bonds that are < 1e-10 */
+            if( bo_ij->BO < 1e-10 )
+                bo_ij->BO = 0.0;
+            if( bo_ij->BO_s < 1e-10 )
+                bo_ij->BO_s = 0.0;
+            if( bo_ij->BO_pi < 1e-10 )
+                bo_ij->BO_pi = 0.0;
+            if( bo_ij->BO_pi2 < 1e-10 )
+                bo_ij->BO_pi2 = 0.0;
 
-			workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+            workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
 
 
-			/* fprintf( stderr, "%d %d\t%g %g %g %g\n
+            /* fprintf( stderr, "%d %d\t%g %g %g %g\n
 Cdbo:\t%g %g %g\n
 Cdbopi:\t%g %g %g %g\n
 Cdbopi2:%g %g %g %g\n\n", 
@@ -1650,32 +1650,32 @@ bo_ij->C1dbopi, bo_ij->C2dbopi, bo_ij->C3dbopi, bo_ij->C4dbopi,
 bo_ij->C1dbopi2, bo_ij->C2dbopi2, 
 bo_ij->C3dbopi2, bo_ij->C4dbopi2 ); */
 
-			/* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
-			   i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
+            /* fprintf( stderr, "%d %d, BO:%f BO_s:%f BO_pi:%f BO_pi2:%f\n",
+               i+1,j+1,bo_ij->BO,bo_ij->BO_s,bo_ij->BO_pi,bo_ij->BO_pi2 ); */
 
 #ifdef TEST_FORCES
-			Set_End_Index( pj, top_dbo, &dBOs );
-			//Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
+            Set_End_Index( pj, top_dbo, &dBOs );
+            //Add_dBO( system, lists, i, pj, 1.0, workspace->dDelta );
 #endif
-		}
-		/*
-		   else {
-		// We only need to update bond orders from bo_ji
-		//   everything else is set in uncorrected_bo calculations 
-		sym_index = bonds.select.bond_list[pj].sym_index;
-		bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
-		bo_ij->BO = bo_ji->BO;
-		bo_ij->BO_s = bo_ji->BO_s;
-		bo_ij->BO_pi = bo_ji->BO_pi;
-		bo_ij->BO_pi2 = bo_ji->BO_pi2;
-
-		workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+        }
+        /*
+           else {
+        // We only need to update bond orders from bo_ji
+        //   everything else is set in uncorrected_bo calculations 
+        sym_index = bonds.select.bond_list[pj].sym_index;
+        bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
+        bo_ij->BO = bo_ji->BO;
+        bo_ij->BO_s = bo_ji->BO_s;
+        bo_ij->BO_pi = bo_ji->BO_pi;
+        bo_ij->BO_pi2 = bo_ji->BO_pi2;
+
+        workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
 
 #ifdef TEST_FORCES
-		//Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta );
+        //Add_dBO( system, lists, j, sym_index, 1.0, workspace.dDelta );
 #endif
-}	  
-		 */
+}      
+         */
 }
 
 #ifdef TEST_FORCES 
@@ -1684,27 +1684,27 @@ Set_Start_Index( i, top_dDelta, &dDeltas );
 ptop_dDelta = &( dDeltas.select.dDelta_list[top_dDelta] );
 
 for( pj = start_i; pj < end_i; ++pj ) {
-	j = bonds.select.bond_list[pj].nbr;
-	// fprintf( stderr, "%d  ", j );
-
-	if( !rvec_isZero( workspace.dDelta[j] ) ) {
-		ptop_dDelta->wrt = j;
-		rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] );
-		rvec_MakeZero( workspace.dDelta[j] );
-		++top_dDelta, ++ptop_dDelta;
-	}
-
-	start_j = Start_Index(j, &bonds);
-	end_j = End_Index(j, &bonds);     
-	for( pk = start_j; pk < end_j; ++pk ) {
-		k = bonds.select.bond_list[pk].nbr;    
-		if( !rvec_isZero( workspace.dDelta[k] ) ) {
-			ptop_dDelta->wrt = k;
-			rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] );
-			rvec_MakeZero( workspace.dDelta[k] );
-			++top_dDelta, ++ptop_dDelta;
-		}
-	}
+    j = bonds.select.bond_list[pj].nbr;
+    // fprintf( stderr, "%d  ", j );
+
+    if( !rvec_isZero( workspace.dDelta[j] ) ) {
+        ptop_dDelta->wrt = j;
+        rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[j] );
+        rvec_MakeZero( workspace.dDelta[j] );
+        ++top_dDelta, ++ptop_dDelta;
+    }
+
+    start_j = Start_Index(j, &bonds);
+    end_j = End_Index(j, &bonds);     
+    for( pk = start_j; pk < end_j; ++pk ) {
+        k = bonds.select.bond_list[pk].nbr;    
+        if( !rvec_isZero( workspace.dDelta[k] ) ) {
+            ptop_dDelta->wrt = k;
+            rvec_Copy( ptop_dDelta->dVal, workspace.dDelta[k] );
+            rvec_MakeZero( workspace.dDelta[k] );
+            ++top_dDelta, ++ptop_dDelta;
+        }
+    }
 }
 
 Set_End_Index( i, top_dDelta, &dDeltas );
@@ -1780,125 +1780,125 @@ workspace.dDelta_lp_temp[j] = workspace.Clp[j];
 
 GLOBAL void Cuda_Update_Uncorrected_BO (  static_storage workspace, list bonds, int N )
 {
-	int i, j, pj;
-	int start_i, end_i;
-	int sym_index;
+    int i, j, pj;
+    int start_i, end_i;
+    int sym_index;
 
-	bond_order_data *bo_ij, *bo_ji;
+    bond_order_data *bo_ij, *bo_ji;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
 
-	start_i = Start_Index(i, &bonds);
-	end_i = End_Index(i, &bonds);
+    start_i = Start_Index(i, &bonds);
+    end_i = End_Index(i, &bonds);
 
-	for( pj = start_i; pj < end_i; ++pj ) {
+    for( pj = start_i; pj < end_i; ++pj ) {
 
-		j = bonds.select.bond_list[pj].nbr;
-		bo_ij = &( bonds.select.bond_list[pj].bo_data );
+        j = bonds.select.bond_list[pj].nbr;
+        bo_ij = &( bonds.select.bond_list[pj].bo_data );
 
-		if( i >= j ) {
-			// We only need to update bond orders from bo_ji
-			//   everything else is set in uncorrected_bo calculations 
-			sym_index = bonds.select.bond_list[pj].sym_index;
-			bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
-			bo_ij->BO = bo_ji->BO;
-			bo_ij->BO_s = bo_ji->BO_s;
-			bo_ij->BO_pi = bo_ji->BO_pi;
-			bo_ij->BO_pi2 = bo_ji->BO_pi2;
+        if( i >= j ) {
+            // We only need to update bond orders from bo_ji
+            //   everything else is set in uncorrected_bo calculations 
+            sym_index = bonds.select.bond_list[pj].sym_index;
+            bo_ji = &(bonds.select.bond_list[ sym_index ].bo_data);
+            bo_ij->BO = bo_ji->BO;
+            bo_ij->BO_s = bo_ji->BO_s;
+            bo_ij->BO_pi = bo_ji->BO_pi;
+            bo_ij->BO_pi2 = bo_ji->BO_pi2;
 
-			workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
-		}
-	}
+            workspace.total_bond_order[i] += bo_ij->BO; // now keeps total_BO
+        }
+    }
 }
 
 GLOBAL void Cuda_Update_Workspace_After_Bond_Orders(  reax_atom *atoms, global_parameters g_params, single_body_parameters *sbp,
-		static_storage workspace, int N )
+        static_storage workspace, int N )
 {
-	int j, type_j;
-	real explp1;
-	real p_lp1;
-	single_body_parameters *sbp_i, *sbp_j;
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= N) return;
-
-	p_lp1 = g_params.l[15];
-
-	/* Calculate some helper variables that are  used at many places 
-	   throughout force calculations */
-	//for( j = 0; j < system->N; ++j ) {
-	type_j = atoms[j].type;
-	sbp_j = &(sbp[ type_j ]);
-
-	workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency;
-	workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e;
-	workspace.Delta_boc[j] = workspace.total_bond_order[j] - 
-		sbp_j->valency_boc;
-
-	workspace.vlpex[j] =  workspace.Delta_e[j] - 
-		2.0 * (int)(workspace.Delta_e[j]/2.0);
-	explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j]));
-	workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0);
-	workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j];
-	workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]);
-	/* Adri uses different dDelta_lp values than the ones in notes... */
-	workspace.dDelta_lp[j] = workspace.Clp[j];
-	//workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
-	//((fabs(workspace->Delta_e[j]/2.0 - 
-	//       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
-
-	if( sbp_j->mass > 21.0 ) {
-		workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
-		workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
-		workspace.dDelta_lp_temp[j] = 0.;
-	}
-	else {
-		workspace.nlp_temp[j] = workspace.nlp[j];
-		workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
-		workspace.dDelta_lp_temp[j] = workspace.Clp[j];
-	}
-
-	//fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
-	//j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
-	//workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
-	//workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
-	//}
+    int j, type_j;
+    real explp1;
+    real p_lp1;
+    single_body_parameters *sbp_i, *sbp_j;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+
+    p_lp1 = g_params.l[15];
+
+    /* Calculate some helper variables that are  used at many places 
+       throughout force calculations */
+    //for( j = 0; j < system->N; ++j ) {
+    type_j = atoms[j].type;
+    sbp_j = &(sbp[ type_j ]);
+
+    workspace.Delta[j] = workspace.total_bond_order[j] - sbp_j->valency;
+    workspace.Delta_e[j] = workspace.total_bond_order[j] - sbp_j->valency_e;
+    workspace.Delta_boc[j] = workspace.total_bond_order[j] - 
+        sbp_j->valency_boc;
+
+    workspace.vlpex[j] =  workspace.Delta_e[j] - 
+        2.0 * (int)(workspace.Delta_e[j]/2.0);
+    explp1 = EXP(-p_lp1 * SQR(2.0 + workspace.vlpex[j]));
+    workspace.nlp[j] = explp1 - (int)(workspace.Delta_e[j] / 2.0);
+    workspace.Delta_lp[j] = sbp_j->nlp_opt - workspace.nlp[j];
+    workspace.Clp[j] = 2.0 * p_lp1 * explp1 * (2.0 + workspace.vlpex[j]);
+    /* Adri uses different dDelta_lp values than the ones in notes... */
+    workspace.dDelta_lp[j] = workspace.Clp[j];
+    //workspace->dDelta_lp[j] = workspace->Clp[j] + (0.5-workspace->Clp[j]) *
+    //((fabs(workspace->Delta_e[j]/2.0 - 
+    //       (int)(workspace->Delta_e[j]/2.0)) < 0.1) ? 1 : 0 );
+
+    if( sbp_j->mass > 21.0 ) {
+        workspace.nlp_temp[j] = 0.5 * (sbp_j->valency_e - sbp_j->valency);
+        workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
+        workspace.dDelta_lp_temp[j] = 0.;
+    }
+    else {
+        workspace.nlp_temp[j] = workspace.nlp[j];
+        workspace.Delta_lp_temp[j] = sbp_j->nlp_opt - workspace.nlp_temp[j];
+        workspace.dDelta_lp_temp[j] = workspace.Clp[j];
+    }
+
+    //fprintf( stderr, "%d\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\t%lf\n",
+    //j, workspace->Delta[j], workspace->Delta_e[j], workspace->Delta_boc[j], 
+    //workspace->nlp[j], system->reaxprm.sbp[type_j].nlp_opt,
+    //workspace->Delta_lp[j], workspace->Clp[j], workspace->dDelta_lp[j] );
+    //}
 
 }
 
 //Import from the forces file. 
 
 GLOBAL void Cuda_Compute_Total_Force (reax_atom *atoms, simulation_data *data, 
-		static_storage workspace, list p_bonds, int ensemble, int N)
+        static_storage workspace, list p_bonds, int ensemble, int N)
 {
-	int i, pj;
-	list *bonds = &p_bonds;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i < N) 
-	{
-		for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj)
-		{
-			//int j = bonds->select.bond_list[pj].nbr;
-			if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
-				Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds );
-			else 
-				Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds );
-		}
-	}
+    int i, pj;
+    list *bonds = &p_bonds;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) 
+    {
+        for (pj = Start_Index (i, bonds); pj < End_Index (i, bonds); ++pj)
+        {
+            //int j = bonds->select.bond_list[pj].nbr;
+            if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
+                Cuda_Add_dBond_to_Forces (i, pj, atoms, &workspace, bonds );
+            else 
+                Cuda_Add_dBond_to_Forces_NPT (i, pj, atoms, data, &workspace, bonds );
+        }
+    }
 }
 
 GLOBAL void Cuda_Compute_Total_Force_PostProcess (reax_atom *atoms, simulation_data *data, 
-		static_storage workspace, list p_bonds, int ensemble, int N)
+        static_storage workspace, list p_bonds, int ensemble, int N)
 {
-	int i, pj;
-	list *bonds = &p_bonds;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i < N) 
-	{
-		if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
-			Cuda_dbond_to_Forces_postprocess (i, atoms, bonds );
-	}
+    int i, pj;
+    list *bonds = &p_bonds;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < N) 
+    {
+        if (ensemble == NVE || ensemble == NVT || ensemble == bNVT) 
+            Cuda_dbond_to_Forces_postprocess (i, atoms, bonds );
+    }
 }
diff --git a/PuReMD-GPU/src/box.cu b/PuReMD-GPU/src/box.cu
index ae9a07ed..e42395c5 100644
--- a/PuReMD-GPU/src/box.cu
+++ b/PuReMD-GPU/src/box.cu
@@ -23,295 +23,295 @@
 
 
 void Init_Box_From_CRYST(real a, real b, real c, 
-		real alpha, real beta, real gamma, 
-		simulation_box* box )
+        real alpha, real beta, real gamma, 
+        simulation_box* box )
 {
-	double c_alpha, c_beta, c_gamma, s_gamma, zi;
+    double c_alpha, c_beta, c_gamma, s_gamma, zi;
 
-	c_alpha = cos(DEG2RAD(alpha));
-	c_beta  = cos(DEG2RAD(beta));
-	c_gamma = cos(DEG2RAD(gamma));
-	s_gamma = sin(DEG2RAD(gamma));
+    c_alpha = cos(DEG2RAD(alpha));
+    c_beta  = cos(DEG2RAD(beta));
+    c_gamma = cos(DEG2RAD(gamma));
+    s_gamma = sin(DEG2RAD(gamma));
 
-	zi = (c_alpha - c_beta * c_gamma)/s_gamma; 
+    zi = (c_alpha - c_beta * c_gamma)/s_gamma; 
 
-	box->box[0][0] = a; 
-	box->box[0][1] = 0.0; 
-	box->box[0][2] = 0.0;
+    box->box[0][0] = a; 
+    box->box[0][1] = 0.0; 
+    box->box[0][2] = 0.0;
 
-	box->box[1][0] = b * c_gamma; 
-	box->box[1][1] = b * s_gamma; 
-	box->box[1][2] = 0.0; 
+    box->box[1][0] = b * c_gamma; 
+    box->box[1][1] = b * s_gamma; 
+    box->box[1][2] = 0.0; 
 
-	box->box[2][0] = c * c_beta;
-	box->box[2][1] = c * zi;
-	box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi));
+    box->box[2][0] = c * c_beta;
+    box->box[2][1] = c * zi;
+    box->box[2][2] = c * SQRT(1.0 - SQR(c_beta) - SQR(zi));
 
-	Make_Consistent( box );
+    Make_Consistent( box );
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n", 
-			box->box[0][0], box->box[1][1], box->box[2][2] );
+    fprintf( stderr, "box is %8.2f x %8.2f x %8.2f\n", 
+            box->box[0][0], box->box[1][1], box->box[2][2] );
 #endif
 }
 
 
 void Update_Box( rtensor box_tensor, simulation_box* box )
 {
-	int i, j;
+    int i, j;
 
-	for (i=0; i < 3; i++)
-		for (j=0; j < 3; j++)
-			box->box[i][j] = box_tensor[i][j];
+    for (i=0; i < 3; i++)
+        for (j=0; j < 3; j++)
+            box->box[i][j] = box_tensor[i][j];
 
-	Make_Consistent( box );
+    Make_Consistent( box );
 }
 
 
 void Update_Box_Isotropic( simulation_box *box, real mu )
 {
-	/*box->box[0][0] = 
-	  POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
-	  box->box[1][1] = box->box[0][0] * box->side_prop[1];
-	  box->box[2][2] = box->box[0][0] * box->side_prop[2]; 
-	 */
-	rtensor_Copy( box->old_box, box->box );
-	box->box[0][0] *= mu;
-	box->box[1][1] *= mu;
-	box->box[2][2] *= mu;
-
-	box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
-	Make_Consistent(box/*, periodic*/);
+    /*box->box[0][0] = 
+      POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
+      box->box[1][1] = box->box[0][0] * box->side_prop[1];
+      box->box[2][2] = box->box[0][0] * box->side_prop[2]; 
+     */
+    rtensor_Copy( box->old_box, box->box );
+    box->box[0][0] *= mu;
+    box->box[1][1] *= mu;
+    box->box[2][2] *= mu;
+
+    box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
+    Make_Consistent(box/*, periodic*/);
 }
 
 
 void Update_Box_SemiIsotropic( simulation_box *box, rvec mu )
 {
-	/*box->box[0][0] = 
-	  POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
-	  box->box[1][1] = box->box[0][0] * box->side_prop[1];
-	  box->box[2][2] = box->box[0][0] * box->side_prop[2]; */
-	rtensor_Copy( box->old_box, box->box );
-	box->box[0][0] *= mu[0];
-	box->box[1][1] *= mu[1];
-	box->box[2][2] *= mu[2];
-
-	box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
-	Make_Consistent(box);
+    /*box->box[0][0] = 
+      POW( V_new / ( box->side_prop[1] * box->side_prop[2] ), 1.0/3.0 );
+      box->box[1][1] = box->box[0][0] * box->side_prop[1];
+      box->box[2][2] = box->box[0][0] * box->side_prop[2]; */
+    rtensor_Copy( box->old_box, box->box );
+    box->box[0][0] *= mu[0];
+    box->box[1][1] *= mu[1];
+    box->box[2][2] *= mu[2];
+
+    box->volume = box->box[0][0]*box->box[1][1]*box->box[2][2];
+    Make_Consistent(box);
 }
 
 
 void Make_Consistent(simulation_box* box)
 {
-	real one_vol;
-
-	box->volume = 
-		box->box[0][0] * (box->box[1][1]*box->box[2][2] - 
-				box->box[2][1]*box->box[2][1]) +
-		box->box[0][1] * (box->box[2][0]*box->box[1][2] -
-				box->box[1][0]*box->box[2][2]) +
-		box->box[0][2] * (box->box[1][0]*box->box[2][1] -
-				box->box[2][0]*box->box[1][1]);
-
-	one_vol = 1.0/box->volume;
-
-	box->box_inv[0][0] = (box->box[1][1]*box->box[2][2] -
-			box->box[1][2]*box->box[2][1]) * one_vol;
-	box->box_inv[0][1] = (box->box[0][2]*box->box[2][1] -
-			box->box[0][1]*box->box[2][2]) * one_vol;
-	box->box_inv[0][2] = (box->box[0][1]*box->box[1][2] -
-			box->box[0][2]*box->box[1][1]) * one_vol;
-
-	box->box_inv[1][0] = (box->box[1][2]*box->box[2][0] -
-			box->box[1][0]*box->box[2][2]) * one_vol;
-	box->box_inv[1][1] = (box->box[0][0]*box->box[2][2] -
-			box->box[0][2]*box->box[2][0]) * one_vol;
-	box->box_inv[1][2] = (box->box[0][2]*box->box[1][0] -
-			box->box[0][0]*box->box[1][2]) * one_vol;
-
-	box->box_inv[2][0] = (box->box[1][0]*box->box[2][1] -
-			box->box[1][1]*box->box[2][0]) * one_vol;
-	box->box_inv[2][1] = (box->box[0][1]*box->box[2][0] -
-			box->box[0][0]*box->box[2][1]) * one_vol;
-	box->box_inv[2][2] = (box->box[0][0]*box->box[1][1] -
-			box->box[0][1]*box->box[1][0]) * one_vol;
-
-	box->box_norms[0] = SQRT( SQR(box->box[0][0]) +
-			SQR(box->box[0][1]) +
-			SQR(box->box[0][2]) );
-	box->box_norms[1] = SQRT( SQR(box->box[1][0]) +
-			SQR(box->box[1][1]) +
-			SQR(box->box[1][2]) );
-	box->box_norms[2] = SQRT( SQR(box->box[2][0]) +
-			SQR(box->box[2][1]) +
-			SQR(box->box[2][2]) );
-
-	box->trans[0][0] = box->box[0][0]/box->box_norms[0]; 
-	box->trans[0][1] = box->box[1][0]/box->box_norms[0];
-	box->trans[0][2] = box->box[2][0]/box->box_norms[0];
-
-	box->trans[1][0] = box->box[0][1]/box->box_norms[1]; 
-	box->trans[1][1] = box->box[1][1]/box->box_norms[1];
-	box->trans[1][2] = box->box[2][1]/box->box_norms[1];
-
-	box->trans[2][0] = box->box[0][2]/box->box_norms[2]; 
-	box->trans[2][1] = box->box[1][2]/box->box_norms[2];
-	box->trans[2][2] = box->box[2][2]/box->box_norms[2];
-
-	one_vol = box->box_norms[0]*box->box_norms[1]*box->box_norms[2]*one_vol;
-
-	box->trans_inv[0][0] = (box->trans[1][1]*box->trans[2][2] -
-			box->trans[1][2]*box->trans[2][1]) * one_vol;
-	box->trans_inv[0][1] = (box->trans[0][2]*box->trans[2][1] -
-			box->trans[0][1]*box->trans[2][2]) * one_vol;
-	box->trans_inv[0][2] = (box->trans[0][1]*box->trans[1][2] -
-			box->trans[0][2]*box->trans[1][1]) * one_vol;
-
-	box->trans_inv[1][0] = (box->trans[1][2]*box->trans[2][0] -
-			box->trans[1][0]*box->trans[2][2]) * one_vol;
-	box->trans_inv[1][1] = (box->trans[0][0]*box->trans[2][2] -
-			box->trans[0][2]*box->trans[2][0]) * one_vol;
-	box->trans_inv[1][2] = (box->trans[0][2]*box->trans[1][0] -
-			box->trans[0][0]*box->trans[1][2]) * one_vol;
-
-	box->trans_inv[2][0] = (box->trans[1][0]*box->trans[2][1] -
-			box->trans[1][1]*box->trans[2][0]) * one_vol;
-	box->trans_inv[2][1] = (box->trans[0][1]*box->trans[2][0] -
-			box->trans[0][0]*box->trans[2][1]) * one_vol;
-	box->trans_inv[2][2] = (box->trans[0][0]*box->trans[1][1] -
-			box->trans[0][1]*box->trans[1][0]) * one_vol;
-
-	//   for (i=0; i < 3; i++)
-	//     {
-	//       for (j=0; j < 3; j++)
-	// 	fprintf(stderr,"%lf\t",box->trans[i][j]);
-	//       fprintf(stderr,"\n");
-	//     }
-	//   fprintf(stderr,"\n");
-	//   for (i=0; i < 3; i++)
-	//     {
-	//       for (j=0; j < 3; j++)
-	// 	fprintf(stderr,"%lf\t",box->trans_inv[i][j]);
-	//       fprintf(stderr,"\n");
-	//     }
-
-
-	box->g[0][0] = box->box[0][0] * box->box[0][0] +
-		box->box[0][1] * box->box[0][1] +
-		box->box[0][2] * box->box[0][2];
-	box->g[1][0] = 
-		box->g[0][1] = box->box[0][0] * box->box[1][0] +
-		box->box[0][1] * box->box[1][1] +
-		box->box[0][2] * box->box[1][2];
-	box->g[2][0] =
-		box->g[0][2] = box->box[0][0] * box->box[2][0] +
-		box->box[0][1] * box->box[2][1] +
-		box->box[0][2] * box->box[2][2];
-
-	box->g[1][1] = box->box[1][0] * box->box[1][0] +
-		box->box[1][1] * box->box[1][1] +
-		box->box[1][2] * box->box[1][2];
-	box->g[1][2] =
-		box->g[2][1] = box->box[1][0] * box->box[2][0] +
-		box->box[1][1] * box->box[2][1] +
-		box->box[1][2] * box->box[2][2];
-
-	box->g[2][2] = box->box[2][0] * box->box[2][0] +
-		box->box[2][1] * box->box[2][1] +
-		box->box[2][2] * box->box[2][2];
-
-	// These proportions are only used for isotropic_NPT!
-	box->side_prop[0] = box->box[0][0] / box->box[0][0];
-	box->side_prop[1] = box->box[1][1] / box->box[0][0];
-	box->side_prop[2] = box->box[2][2] / box->box[0][0];
+    real one_vol;
+
+    box->volume = 
+        box->box[0][0] * (box->box[1][1]*box->box[2][2] - 
+                box->box[2][1]*box->box[2][1]) +
+        box->box[0][1] * (box->box[2][0]*box->box[1][2] -
+                box->box[1][0]*box->box[2][2]) +
+        box->box[0][2] * (box->box[1][0]*box->box[2][1] -
+                box->box[2][0]*box->box[1][1]);
+
+    one_vol = 1.0/box->volume;
+
+    box->box_inv[0][0] = (box->box[1][1]*box->box[2][2] -
+            box->box[1][2]*box->box[2][1]) * one_vol;
+    box->box_inv[0][1] = (box->box[0][2]*box->box[2][1] -
+            box->box[0][1]*box->box[2][2]) * one_vol;
+    box->box_inv[0][2] = (box->box[0][1]*box->box[1][2] -
+            box->box[0][2]*box->box[1][1]) * one_vol;
+
+    box->box_inv[1][0] = (box->box[1][2]*box->box[2][0] -
+            box->box[1][0]*box->box[2][2]) * one_vol;
+    box->box_inv[1][1] = (box->box[0][0]*box->box[2][2] -
+            box->box[0][2]*box->box[2][0]) * one_vol;
+    box->box_inv[1][2] = (box->box[0][2]*box->box[1][0] -
+            box->box[0][0]*box->box[1][2]) * one_vol;
+
+    box->box_inv[2][0] = (box->box[1][0]*box->box[2][1] -
+            box->box[1][1]*box->box[2][0]) * one_vol;
+    box->box_inv[2][1] = (box->box[0][1]*box->box[2][0] -
+            box->box[0][0]*box->box[2][1]) * one_vol;
+    box->box_inv[2][2] = (box->box[0][0]*box->box[1][1] -
+            box->box[0][1]*box->box[1][0]) * one_vol;
+
+    box->box_norms[0] = SQRT( SQR(box->box[0][0]) +
+            SQR(box->box[0][1]) +
+            SQR(box->box[0][2]) );
+    box->box_norms[1] = SQRT( SQR(box->box[1][0]) +
+            SQR(box->box[1][1]) +
+            SQR(box->box[1][2]) );
+    box->box_norms[2] = SQRT( SQR(box->box[2][0]) +
+            SQR(box->box[2][1]) +
+            SQR(box->box[2][2]) );
+
+    box->trans[0][0] = box->box[0][0]/box->box_norms[0]; 
+    box->trans[0][1] = box->box[1][0]/box->box_norms[0];
+    box->trans[0][2] = box->box[2][0]/box->box_norms[0];
+
+    box->trans[1][0] = box->box[0][1]/box->box_norms[1]; 
+    box->trans[1][1] = box->box[1][1]/box->box_norms[1];
+    box->trans[1][2] = box->box[2][1]/box->box_norms[1];
+
+    box->trans[2][0] = box->box[0][2]/box->box_norms[2]; 
+    box->trans[2][1] = box->box[1][2]/box->box_norms[2];
+    box->trans[2][2] = box->box[2][2]/box->box_norms[2];
+
+    one_vol = box->box_norms[0]*box->box_norms[1]*box->box_norms[2]*one_vol;
+
+    box->trans_inv[0][0] = (box->trans[1][1]*box->trans[2][2] -
+            box->trans[1][2]*box->trans[2][1]) * one_vol;
+    box->trans_inv[0][1] = (box->trans[0][2]*box->trans[2][1] -
+            box->trans[0][1]*box->trans[2][2]) * one_vol;
+    box->trans_inv[0][2] = (box->trans[0][1]*box->trans[1][2] -
+            box->trans[0][2]*box->trans[1][1]) * one_vol;
+
+    box->trans_inv[1][0] = (box->trans[1][2]*box->trans[2][0] -
+            box->trans[1][0]*box->trans[2][2]) * one_vol;
+    box->trans_inv[1][1] = (box->trans[0][0]*box->trans[2][2] -
+            box->trans[0][2]*box->trans[2][0]) * one_vol;
+    box->trans_inv[1][2] = (box->trans[0][2]*box->trans[1][0] -
+            box->trans[0][0]*box->trans[1][2]) * one_vol;
+
+    box->trans_inv[2][0] = (box->trans[1][0]*box->trans[2][1] -
+            box->trans[1][1]*box->trans[2][0]) * one_vol;
+    box->trans_inv[2][1] = (box->trans[0][1]*box->trans[2][0] -
+            box->trans[0][0]*box->trans[2][1]) * one_vol;
+    box->trans_inv[2][2] = (box->trans[0][0]*box->trans[1][1] -
+            box->trans[0][1]*box->trans[1][0]) * one_vol;
+
+    //   for (i=0; i < 3; i++)
+    //     {
+    //       for (j=0; j < 3; j++)
+    //     fprintf(stderr,"%lf\t",box->trans[i][j]);
+    //       fprintf(stderr,"\n");
+    //     }
+    //   fprintf(stderr,"\n");
+    //   for (i=0; i < 3; i++)
+    //     {
+    //       for (j=0; j < 3; j++)
+    //     fprintf(stderr,"%lf\t",box->trans_inv[i][j]);
+    //       fprintf(stderr,"\n");
+    //     }
+
+
+    box->g[0][0] = box->box[0][0] * box->box[0][0] +
+        box->box[0][1] * box->box[0][1] +
+        box->box[0][2] * box->box[0][2];
+    box->g[1][0] = 
+        box->g[0][1] = box->box[0][0] * box->box[1][0] +
+        box->box[0][1] * box->box[1][1] +
+        box->box[0][2] * box->box[1][2];
+    box->g[2][0] =
+        box->g[0][2] = box->box[0][0] * box->box[2][0] +
+        box->box[0][1] * box->box[2][1] +
+        box->box[0][2] * box->box[2][2];
+
+    box->g[1][1] = box->box[1][0] * box->box[1][0] +
+        box->box[1][1] * box->box[1][1] +
+        box->box[1][2] * box->box[1][2];
+    box->g[1][2] =
+        box->g[2][1] = box->box[1][0] * box->box[2][0] +
+        box->box[1][1] * box->box[2][1] +
+        box->box[1][2] * box->box[2][2];
+
+    box->g[2][2] = box->box[2][0] * box->box[2][0] +
+        box->box[2][1] * box->box[2][1] +
+        box->box[2][2] * box->box[2][2];
+
+    // These proportions are only used for isotropic_NPT!
+    box->side_prop[0] = box->box[0][0] / box->box[0][0];
+    box->side_prop[1] = box->box[1][1] / box->box[0][0];
+    box->side_prop[2] = box->box[2][2] / box->box[0][0];
 }
 
 
 void Transform( rvec x1, simulation_box *box, char flag, rvec x2 )
 {
-	int i, j;
-	real tmp;
-
-	//  printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]);
-
-	if (flag > 0) {
-		for (i=0; i < 3; i++) {
-			tmp = 0.0;
-			for (j=0; j < 3; j++)
-				tmp += box->trans[i][j]*x1[j]; 
-			x2[i] = tmp;
-		}
-	}
-	else {
-		for (i=0; i < 3; i++) {
-			tmp = 0.0;
-			for (j=0; j < 3; j++)
-				tmp += box->trans_inv[i][j]*x1[j]; 
-			x2[i] = tmp;
-		}
-	}
-	//  printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]);  
+    int i, j;
+    real tmp;
+
+    //  printf(">x1: (%lf, %lf, %lf)\n",x1[0],x1[1],x1[2]);
+
+    if (flag > 0) {
+        for (i=0; i < 3; i++) {
+            tmp = 0.0;
+            for (j=0; j < 3; j++)
+                tmp += box->trans[i][j]*x1[j]; 
+            x2[i] = tmp;
+        }
+    }
+    else {
+        for (i=0; i < 3; i++) {
+            tmp = 0.0;
+            for (j=0; j < 3; j++)
+                tmp += box->trans_inv[i][j]*x1[j]; 
+            x2[i] = tmp;
+        }
+    }
+    //  printf(">x2: (%lf, %lf, %lf)\n", x2[0], x2[1], x2[2]);  
 }
 
 
 void Transform_to_UnitBox( rvec x1, simulation_box *box, char flag, rvec x2 )
 {
-	Transform( x1, box, flag, x2 );
+    Transform( x1, box, flag, x2 );
 
-	x2[0] /= box->box_norms[0];
-	x2[1] /= box->box_norms[1];
-	x2[2] /= box->box_norms[2];
+    x2[0] /= box->box_norms[0];
+    x2[1] /= box->box_norms[1];
+    x2[2] /= box->box_norms[2];
 }
 
 
 void Distance_on_T3_Gen( rvec x1, rvec x2, simulation_box* box, rvec r )
 {
-	rvec xa, xb, ra;
+    rvec xa, xb, ra;
 
-	Transform( x1, box, -1, xa );
-	Transform( x2, box, -1, xb );
+    Transform( x1, box, -1, xa );
+    Transform( x2, box, -1, xb );
 
-	//printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]);
-	//printf(">xb: (%lf, %lf, %lf)\n",xb[0],xb[1],xb[2]);
+    //printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]);
+    //printf(">xb: (%lf, %lf, %lf)\n",xb[0],xb[1],xb[2]);
 
-	Sq_Distance_on_T3( xa, xb, box, ra );
+    Sq_Distance_on_T3( xa, xb, box, ra );
 
-	Transform( ra, box, 1, r );
+    Transform( ra, box, 1, r );
 }
 
 
 void Inc_on_T3_Gen( rvec x, rvec dx, simulation_box* box )
 {
-	rvec xa, dxa;
+    rvec xa, dxa;
 
-	Transform( x, box, -1, xa );
-	Transform( dx, box, -1, dxa );
+    Transform( x, box, -1, xa );
+    Transform( dx, box, -1, dxa );
 
-	//printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]);
-	//printf(">dxa: (%lf, %lf, %lf)\n",dxa[0],dxa[1],dxa[2]);
+    //printf(">xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]);
+    //printf(">dxa: (%lf, %lf, %lf)\n",dxa[0],dxa[1],dxa[2]);
 
-	Inc_on_T3( xa, dxa, box );
+    Inc_on_T3( xa, dxa, box );
 
-	//printf(">new_xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]);
+    //printf(">new_xa: (%lf, %lf, %lf)\n",xa[0],xa[1],xa[2]);
 
-	Transform( xa, box, 1, x );
+    Transform( xa, box, 1, x );
 }
 
 
 real Metric_Product( rvec x1, rvec x2, simulation_box* box )
 {
-	int i, j;
-	real dist=0.0, tmp;
-
-	for( i = 0; i < 3; i++ )
-	{
-		tmp = 0.0;
-		for( j = 0; j < 3; j++ )
-			tmp += box->g[i][j] * x2[j];
-		dist += x1[i] * tmp;
-	}
-
-	return dist;
+    int i, j;
+    real dist=0.0, tmp;
+
+    for( i = 0; i < 3; i++ )
+    {
+        tmp = 0.0;
+        for( j = 0; j < 3; j++ )
+            tmp += box->g[i][j] * x2[j];
+        dist += x1[i] * tmp;
+    }
+
+    return dist;
 }
 
 
@@ -319,23 +319,23 @@ real Metric_Product( rvec x1, rvec x2, simulation_box* box )
    If so, this neighborhood is added to the list of far neighbors.
    Periodic boundary conditions do not apply. */
 void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-		control_params *control, 
-		far_neighbor_data *new_nbrs, int *count )
+        control_params *control, 
+        far_neighbor_data *new_nbrs, int *count )
 {
-	real norm_sqr;
+    real norm_sqr;
 
-	rvec_ScaledSum( new_nbrs[0].dvec, 1.0, x2, -1.0, x1 );
+    rvec_ScaledSum( new_nbrs[0].dvec, 1.0, x2, -1.0, x1 );
 
-	norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec );
+    norm_sqr = rvec_Norm_Sqr( new_nbrs[0].dvec );
 
-	if( norm_sqr <= SQR( control->vlist_cut ) ) {
-		*count = 1;
-		new_nbrs[0].d = SQRT( norm_sqr );
+    if( norm_sqr <= SQR( control->vlist_cut ) ) {
+        *count = 1;
+        new_nbrs[0].d = SQRT( norm_sqr );
 
-		ivec_MakeZero( new_nbrs[0].rel_box );
-		// rvec_MakeZero( new_nbrs[0].ext_factor );
-	}
-	else *count = 0;
+        ivec_MakeZero( new_nbrs[0].rel_box );
+        // rvec_MakeZero( new_nbrs[0].ext_factor );
+    }
+    else *count = 0;
 }
 
 
@@ -344,49 +344,49 @@ void Get_NonPeriodic_Far_Neighbors( rvec x1, rvec x2, simulation_box *box,
    If the periodic distance between x1 and x2 is than vlist_cut, this 
    neighborhood is added to the list of far neighbors. */
 void Get_Periodic_Far_Neighbors_Big_Box( rvec x1, rvec x2, simulation_box *box, 
-		control_params *control, 
-		far_neighbor_data *periodic_nbrs, 
-		int *count )
+        control_params *control, 
+        far_neighbor_data *periodic_nbrs, 
+        int *count )
 {
-	real norm_sqr, d, tmp;
-	int i;
-
-	norm_sqr = 0;
-
-	for( i = 0; i < 3; i++ ) {
-		d = x2[i] - x1[i];
-		tmp = SQR(d);
-		// fprintf(out,"Inside Sq_Distance_on_T3, %d, %lf, %lf\n",
-		// i,tmp,SQR(box->box_norms[i]/2.0));
-
-		if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {	
-			if( x2[i] > x1[i] ) {
-				d -= box->box_norms[i];
-				periodic_nbrs[0].rel_box[i] = -1;
-				// periodic_nbrs[0].ext_factor[i] = +1;
-			}
-			else {
-				d += box->box_norms[i];
-				periodic_nbrs[0].rel_box[i] = +1;
-				// periodic_nbrs[0].ext_factor[i] = -1;
-			}
-
-			periodic_nbrs[0].dvec[i] = d;
-			norm_sqr += SQR(d);
-		}
-		else {
-			periodic_nbrs[0].dvec[i] = d;
-			norm_sqr += tmp;
-			periodic_nbrs[0].rel_box[i]   = 0;
-			// periodic_nbrs[0].ext_factor[i] = 0;
-		} 
-	}
-
-	if( norm_sqr <= SQR( control->vlist_cut ) ) {
-		*count = 1;
-		periodic_nbrs[0].d = SQRT( norm_sqr );
-	}
-	else *count = 0;
+    real norm_sqr, d, tmp;
+    int i;
+
+    norm_sqr = 0;
+
+    for( i = 0; i < 3; i++ ) {
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+        // fprintf(out,"Inside Sq_Distance_on_T3, %d, %lf, %lf\n",
+        // i,tmp,SQR(box->box_norms[i]/2.0));
+
+        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
+            if( x2[i] > x1[i] ) {
+                d -= box->box_norms[i];
+                periodic_nbrs[0].rel_box[i] = -1;
+                // periodic_nbrs[0].ext_factor[i] = +1;
+            }
+            else {
+                d += box->box_norms[i];
+                periodic_nbrs[0].rel_box[i] = +1;
+                // periodic_nbrs[0].ext_factor[i] = -1;
+            }
+
+            periodic_nbrs[0].dvec[i] = d;
+            norm_sqr += SQR(d);
+        }
+        else {
+            periodic_nbrs[0].dvec[i] = d;
+            norm_sqr += tmp;
+            periodic_nbrs[0].rel_box[i]   = 0;
+            // periodic_nbrs[0].ext_factor[i] = 0;
+        } 
+    }
+
+    if( norm_sqr <= SQR( control->vlist_cut ) ) {
+        *count = 1;
+        periodic_nbrs[0].d = SQRT( norm_sqr );
+    }
+    else *count = 0;
 }
 
 
@@ -398,69 +398,69 @@ might get too small (such as <5 A!). In this case we have to consider the
 periodic images of x2 that are two boxs away!!!
  */
 void Get_Periodic_Far_Neighbors_Small_Box( rvec x1, rvec x2, simulation_box *box,
-		control_params *control, 
-		far_neighbor_data *periodic_nbrs, 
-		int *count )
+        control_params *control, 
+        far_neighbor_data *periodic_nbrs, 
+        int *count )
 {
-	int i, j, k;
-	int imax, jmax, kmax;
-	real sqr_norm, d_i, d_j, d_k;
-
-	*count = 0;
-	/* determine the max stretch of imaginary boxs in each direction
-	   to handle periodic boundary conditions correctly. */
-	imax = (int)(control->vlist_cut / box->box_norms[0] + 1);
-	jmax = (int)(control->vlist_cut / box->box_norms[1] + 1);
-	kmax = (int)(control->vlist_cut / box->box_norms[2] + 1);
-	/*if( imax > 1 || jmax > 1 || kmax > 1 )
-	  fprintf( stderr, "box %8.3f x %8.3f x %8.3f --> %2d %2d %2d\n",
-	  box->box_norms[0], box->box_norms[1], box->box_norms[2],
-	  imax, jmax, kmax ); */
-
-
-	for( i = -imax; i <= imax; ++i )
-		if(fabs(d_i=((x2[0]+i*box->box_norms[0])-x1[0]))<=control->vlist_cut) {
-			for( j = -jmax; j <= jmax; ++j )
-				if(fabs(d_j=((x2[1]+j*box->box_norms[1])-x1[1]))<=control->vlist_cut) {
-					for( k = -kmax; k <= kmax; ++k )
-						if(fabs(d_k=((x2[2]+k*box->box_norms[2])-x1[2]))<=control->vlist_cut) {
-							sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k);
-							if( sqr_norm <= SQR(control->vlist_cut) ) {
-								periodic_nbrs[ *count ].d = SQRT( sqr_norm );
-
-								periodic_nbrs[ *count ].dvec[0] = d_i;
-								periodic_nbrs[ *count ].dvec[1] = d_j;
-								periodic_nbrs[ *count ].dvec[2] = d_k;
-
-								periodic_nbrs[ *count ].rel_box[0] = i;
-								periodic_nbrs[ *count ].rel_box[1] = j;
-								periodic_nbrs[ *count ].rel_box[2] = k;
-
-								/* if( i || j || k ) {
-								   fprintf(stderr, "x1: %.2f %.2f %.2f\n", x1[0], x1[1], x1[2]);
-								   fprintf(stderr, "x2: %.2f %.2f %.2f\n", x2[0], x2[1], x2[2]);
-								   fprintf( stderr, "d : %8.2f%8.2f%8.2f\n\n", d_i, d_j, d_k );
-								   } */
-
-								/* if(i) periodic_nbrs[*count].ext_factor[0] = (real)i/-abs(i);
-								   else  periodic_nbrs[*count].ext_factor[0] = 0;
-
-								   if(j) periodic_nbrs[*count].ext_factor[1] = (real)j/-abs(j);
-								   else  periodic_nbrs[*count].ext_factor[1] = 0;
-
-								   if(k) periodic_nbrs[*count].ext_factor[2] = (real)k/-abs(k);
-								   else  periodic_nbrs[*count].ext_factor[2] = 0; */
-
-
-								/* if( i == 0 && j == 0 && k == 0 )
-								 *  periodic_nbrs[ *count ].imaginary = 0;
-								 *  else periodic_nbrs[ *count ].imaginary = 1;
-								 */
-								++(*count);
-							}
-						}
-				}
-		}
+    int i, j, k;
+    int imax, jmax, kmax;
+    real sqr_norm, d_i, d_j, d_k;
+
+    *count = 0;
+    /* determine the max stretch of imaginary boxs in each direction
+       to handle periodic boundary conditions correctly. */
+    imax = (int)(control->vlist_cut / box->box_norms[0] + 1);
+    jmax = (int)(control->vlist_cut / box->box_norms[1] + 1);
+    kmax = (int)(control->vlist_cut / box->box_norms[2] + 1);
+    /*if( imax > 1 || jmax > 1 || kmax > 1 )
+      fprintf( stderr, "box %8.3f x %8.3f x %8.3f --> %2d %2d %2d\n",
+      box->box_norms[0], box->box_norms[1], box->box_norms[2],
+      imax, jmax, kmax ); */
+
+
+    for( i = -imax; i <= imax; ++i )
+        if(fabs(d_i=((x2[0]+i*box->box_norms[0])-x1[0]))<=control->vlist_cut) {
+            for( j = -jmax; j <= jmax; ++j )
+                if(fabs(d_j=((x2[1]+j*box->box_norms[1])-x1[1]))<=control->vlist_cut) {
+                    for( k = -kmax; k <= kmax; ++k )
+                        if(fabs(d_k=((x2[2]+k*box->box_norms[2])-x1[2]))<=control->vlist_cut) {
+                            sqr_norm = SQR(d_i) + SQR(d_j) + SQR(d_k);
+                            if( sqr_norm <= SQR(control->vlist_cut) ) {
+                                periodic_nbrs[ *count ].d = SQRT( sqr_norm );
+
+                                periodic_nbrs[ *count ].dvec[0] = d_i;
+                                periodic_nbrs[ *count ].dvec[1] = d_j;
+                                periodic_nbrs[ *count ].dvec[2] = d_k;
+
+                                periodic_nbrs[ *count ].rel_box[0] = i;
+                                periodic_nbrs[ *count ].rel_box[1] = j;
+                                periodic_nbrs[ *count ].rel_box[2] = k;
+
+                                /* if( i || j || k ) {
+                                   fprintf(stderr, "x1: %.2f %.2f %.2f\n", x1[0], x1[1], x1[2]);
+                                   fprintf(stderr, "x2: %.2f %.2f %.2f\n", x2[0], x2[1], x2[2]);
+                                   fprintf( stderr, "d : %8.2f%8.2f%8.2f\n\n", d_i, d_j, d_k );
+                                   } */
+
+                                /* if(i) periodic_nbrs[*count].ext_factor[0] = (real)i/-abs(i);
+                                   else  periodic_nbrs[*count].ext_factor[0] = 0;
+
+                                   if(j) periodic_nbrs[*count].ext_factor[1] = (real)j/-abs(j);
+                                   else  periodic_nbrs[*count].ext_factor[1] = 0;
+
+                                   if(k) periodic_nbrs[*count].ext_factor[2] = (real)k/-abs(k);
+                                   else  periodic_nbrs[*count].ext_factor[2] = 0; */
+
+
+                                /* if( i == 0 && j == 0 && k == 0 )
+                                 *  periodic_nbrs[ *count ].imaginary = 0;
+                                 *  else periodic_nbrs[ *count ].imaginary = 1;
+                                 */
+                                ++(*count);
+                            }
+                        }
+                }
+        }
 }
 
 
@@ -505,39 +505,39 @@ rvec_Add( box->nbr_box_press[map], v );
 
 void Print_Box_Information( simulation_box* box, FILE *out )
 {
-	int i, j;
-
-	fprintf( out, "box: {" );
-	for( i = 0; i < 3; ++i )
-	{
-		fprintf( out, "{" );
-		for( j = 0; j < 3; ++j )
-			fprintf( out, "%8.3f ", box->box[i][j] );
-		fprintf( out, "}" );
-	}
-	fprintf( out, "}\n" );
-
-	fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n", 
-			box->volume, 
-			box->box_norms[0], box->box_norms[1], box->box_norms[2] );
-
-	fprintf( out, "box_trans: {" );
-	for( i = 0; i < 3; ++i )
-	{
-		fprintf( out, "{" );
-		for( j = 0; j < 3; ++j )
-			fprintf( out, "%8.3f ", box->trans[i][j] );
-		fprintf( out, "}" );
-	}
-	fprintf( out, "}\n" );
-
-	fprintf( out, "box_trinv: {" );
-	for( i = 0; i < 3; ++i )
-	{
-		fprintf( out, "{" );
-		for( j = 0; j < 3; ++j )
-			fprintf( out, "%8.3f ", box->trans_inv[i][j] );
-		fprintf( out, "}" );
-	}
-	fprintf( out, "}\n" );
+    int i, j;
+
+    fprintf( out, "box: {" );
+    for( i = 0; i < 3; ++i )
+    {
+        fprintf( out, "{" );
+        for( j = 0; j < 3; ++j )
+            fprintf( out, "%8.3f ", box->box[i][j] );
+        fprintf( out, "}" );
+    }
+    fprintf( out, "}\n" );
+
+    fprintf( out, "V: %8.3f\tdims: {%8.3f, %8.3f, %8.3f}\n", 
+            box->volume, 
+            box->box_norms[0], box->box_norms[1], box->box_norms[2] );
+
+    fprintf( out, "box_trans: {" );
+    for( i = 0; i < 3; ++i )
+    {
+        fprintf( out, "{" );
+        for( j = 0; j < 3; ++j )
+            fprintf( out, "%8.3f ", box->trans[i][j] );
+        fprintf( out, "}" );
+    }
+    fprintf( out, "}\n" );
+
+    fprintf( out, "box_trinv: {" );
+    for( i = 0; i < 3; ++i )
+    {
+        fprintf( out, "{" );
+        for( j = 0; j < 3; ++j )
+            fprintf( out, "%8.3f ", box->trans_inv[i][j] );
+        fprintf( out, "}" );
+    }
+    fprintf( out, "}\n" );
 }
diff --git a/PuReMD-GPU/src/center_mass.cu b/PuReMD-GPU/src/center_mass.cu
index d81e769e..ea8f7998 100644
--- a/PuReMD-GPU/src/center_mass.cu
+++ b/PuReMD-GPU/src/center_mass.cu
@@ -25,235 +25,235 @@
 #include "vector.h"
 
 GLOBAL void center_of_mass_blocks (single_body_parameters *sbp, reax_atom *atoms,
-		rvec *res_xcm, 
-		rvec *res_vcm, 
-		rvec *res_amcm, 
-		size_t n)
+        rvec *res_xcm, 
+        rvec *res_vcm, 
+        rvec *res_amcm, 
+        size_t n)
 {
-	extern __shared__ rvec xcm[];
-	extern __shared__ rvec vcm[];
-	extern __shared__ rvec amcm[];
-
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	unsigned int xcm_id = threadIdx.x;
-	unsigned int vcm_id = blockDim.x;
-	unsigned int amcm_id = 2 *(blockDim.x);
-
-	unsigned int index = 0;
-	rvec tmp;
-	real m;
-
-	rvec_MakeZero (xcm [threadIdx.x]);
-	rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
-	rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
-	rvec_MakeZero (tmp);
-
-	if (i < n){
-		m = sbp [ atoms[i].type ].mass;
-		rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
-		rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
-		rvec_Cross (tmp, atoms[i].x, atoms [i].v);
-		rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
-	}
-	__syncthreads ();
-
-	for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-		if ((threadIdx.x < offset)) {
-			index = threadIdx.x + offset;
-			rvec_Add (xcm [threadIdx.x], xcm[index]);
-			rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
-			rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
-		} 
-		__syncthreads ();
-	}
-
-	if ((threadIdx.x == 0)){
-		rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
-		rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
-		rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
-	}
+    extern __shared__ rvec xcm[];
+    extern __shared__ rvec vcm[];
+    extern __shared__ rvec amcm[];
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 *(blockDim.x);
+
+    unsigned int index = 0;
+    rvec tmp;
+    real m;
+
+    rvec_MakeZero (xcm [threadIdx.x]);
+    rvec_MakeZero (vcm [vcm_id + threadIdx.x]);
+    rvec_MakeZero (amcm[amcm_id + threadIdx.x]);
+    rvec_MakeZero (tmp);
+
+    if (i < n){
+        m = sbp [ atoms[i].type ].mass;
+        rvec_ScaledAdd (xcm [threadIdx.x], m, atoms [i].x);
+        rvec_ScaledAdd (vcm [vcm_id + threadIdx.x], m, atoms [i].v);
+        rvec_Cross (tmp, atoms[i].x, atoms [i].v);
+        rvec_ScaledAdd (amcm[amcm_id + threadIdx.x], m, tmp);
+    }
+    __syncthreads ();
+
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if ((threadIdx.x < offset)) {
+            index = threadIdx.x + offset;
+            rvec_Add (xcm [threadIdx.x], xcm[index]);
+            rvec_Add (vcm [vcm_id  + threadIdx.x], vcm[vcm_id + index]);
+            rvec_Add (amcm[amcm_id + threadIdx.x], amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+
+    if ((threadIdx.x == 0)){
+        rvec_Copy (res_xcm[blockIdx.x], xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], amcm[amcm_id]);
+    }
 }
 
 GLOBAL void center_of_mass (rvec *xcm, 
-		rvec *vcm, 
-		rvec *amcm, 
-		rvec *res_xcm,
-		rvec *res_vcm,
-		rvec *res_amcm,
-		size_t n)
+        rvec *vcm, 
+        rvec *amcm, 
+        rvec *res_xcm,
+        rvec *res_vcm,
+        rvec *res_amcm,
+        size_t n)
 {
-	extern __shared__ rvec sh_xcm[];
-	extern __shared__ rvec sh_vcm[];
-	extern __shared__ rvec sh_amcm[];
-
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	unsigned int xcm_id = threadIdx.x;
-	unsigned int vcm_id = blockDim.x;
-	unsigned int amcm_id = 2 * (blockDim.x);
-
-	unsigned int index = 0;
-	rvec t_xcm, t_vcm, t_amcm;
-
-	rvec_MakeZero (t_xcm);
-	rvec_MakeZero (t_vcm);
-	rvec_MakeZero (t_amcm);
-
-	if (i < n){
-		rvec_Copy ( t_xcm, xcm[threadIdx.x]);
-		rvec_Copy ( t_vcm, vcm[threadIdx.x]);
-		rvec_Copy ( t_amcm, amcm[threadIdx.x]);
-	}
-
-	rvec_Copy (sh_xcm[xcm_id], t_xcm);
-	rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
-	rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
-
-	__syncthreads ();
-
-	for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
-
-		if (threadIdx.x < offset) {
-			index = threadIdx.x + offset;
-			rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
-			rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
-			rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
-		} 
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0){
-		rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
-		rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
-		rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
-	}
+    extern __shared__ rvec sh_xcm[];
+    extern __shared__ rvec sh_vcm[];
+    extern __shared__ rvec sh_amcm[];
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    unsigned int xcm_id = threadIdx.x;
+    unsigned int vcm_id = blockDim.x;
+    unsigned int amcm_id = 2 * (blockDim.x);
+
+    unsigned int index = 0;
+    rvec t_xcm, t_vcm, t_amcm;
+
+    rvec_MakeZero (t_xcm);
+    rvec_MakeZero (t_vcm);
+    rvec_MakeZero (t_amcm);
+
+    if (i < n){
+        rvec_Copy ( t_xcm, xcm[threadIdx.x]);
+        rvec_Copy ( t_vcm, vcm[threadIdx.x]);
+        rvec_Copy ( t_amcm, amcm[threadIdx.x]);
+    }
+
+    rvec_Copy (sh_xcm[xcm_id], t_xcm);
+    rvec_Copy (sh_vcm[vcm_id + threadIdx.x], t_vcm);
+    rvec_Copy (sh_amcm[amcm_id + threadIdx.x], t_amcm);
+
+    __syncthreads ();
+
+    for( int offset = blockDim.x / 2; offset > 0; offset >>= 1 ) { 
+
+        if (threadIdx.x < offset) {
+            index = threadIdx.x + offset;
+            rvec_Add (sh_xcm [threadIdx.x], sh_xcm[index]);
+            rvec_Add (sh_vcm [vcm_id + threadIdx.x], sh_vcm[vcm_id + index]);
+            rvec_Add (sh_amcm [amcm_id + threadIdx.x], sh_amcm[amcm_id + index]);
+        } 
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0){
+        rvec_Copy (res_xcm[blockIdx.x], sh_xcm[0]);
+        rvec_Copy (res_vcm[blockIdx.x], sh_vcm[vcm_id]);
+        rvec_Copy (res_amcm[blockIdx.x], sh_amcm[amcm_id]);
+    }
 }
 
 GLOBAL void compute_center_mass (single_body_parameters *sbp, 
-		reax_atom *atoms,
-		real *results, 
-		real xcm0, real xcm1, real xcm2,
-		size_t n)
+        reax_atom *atoms,
+        real *results, 
+        real xcm0, real xcm1, real xcm2,
+        size_t n)
 {
-	extern __shared__ real xx[];
-	extern __shared__ real xy[];
-	extern __shared__ real xz[];
-	extern __shared__ real yy[];
-	extern __shared__ real yz[];
-	extern __shared__ real zz[];
-
-	unsigned int xx_i = threadIdx.x;
-	unsigned int xy_i = blockDim.x;
-	unsigned int xz_i = 2 * blockDim.x;
-	unsigned int yy_i = 3 * blockDim.x;
-	unsigned int yz_i = 4 * blockDim.x;
-	unsigned int zz_i = 5 * blockDim.x;
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-
-	rvec diff, xcm;
-	real m = 0;
-	rvec_MakeZero (diff);
-	xcm[0] = xcm0;
-	xcm[1] = xcm1;
-	xcm[2] = xcm2;
-
-
-	xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-		yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-	if (i < n){
-		m = sbp[ atoms[i].type ].mass;
-		rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
-		xx[ xx_i ] = diff[0] * diff[0] * m;
-		xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
-		xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
-		yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
-		yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
-		zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
-		if (threadIdx.x < offset){
-			index = threadIdx.x + offset;
-			xx[ threadIdx.x ] += xx[ index ];
-			xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-			xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-			yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-			yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-			zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0) {
-		results [ blockIdx.x*6 ] = xx [ 0 ];
-		results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
-		results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
-		results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
-		results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
-		results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
-	}
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    rvec diff, xcm;
+    real m = 0;
+    rvec_MakeZero (diff);
+    xcm[0] = xcm0;
+    xcm[1] = xcm1;
+    xcm[2] = xcm2;
+
+
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n){
+        m = sbp[ atoms[i].type ].mass;
+        rvec_ScaledSum( diff, 1., atoms[i].x, -1., xcm );
+        xx[ xx_i ] = diff[0] * diff[0] * m;
+        xy[ xy_i + threadIdx.x ] = diff[0] * diff[1] * m;
+        xz[ xz_i + threadIdx.x ] = diff[0] * diff[2] * m;
+        yy[ yy_i + threadIdx.x ] = diff[1] * diff[1] * m;
+        yz[ yz_i + threadIdx.x ] = diff[1] * diff[2] * m;
+        zz[ zz_i + threadIdx.x ] = diff[2] * diff[2] * m;    
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1){
+        if (threadIdx.x < offset){
+            index = threadIdx.x + offset;
+            xx[ threadIdx.x ] += xx[ index ];
+            xy[ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz[ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy[ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz[ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz[ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        results [ blockIdx.x*6 ] = xx [ 0 ];
+        results [ blockIdx.x*6 + 1 ] = xy [ xy_i + 0 ];
+        results [ blockIdx.x*6 + 2 ] = xz [ xz_i + 0 ];
+        results [ blockIdx.x*6 + 3 ] = yy [ yy_i + 0 ];
+        results [ blockIdx.x*6 + 4 ] = yz [ yz_i + 0 ];
+        results [ blockIdx.x*6 + 5 ] = zz [ zz_i + 0 ];
+    }
 }
 
 GLOBAL void compute_center_mass (real *input, real *output, size_t n)
 {
-	extern __shared__ real xx[];
-	extern __shared__ real xy[];
-	extern __shared__ real xz[];
-	extern __shared__ real yy[];
-	extern __shared__ real yz[];
-	extern __shared__ real zz[];
-
-	unsigned int xx_i = threadIdx.x;
-	unsigned int xy_i = blockDim.x;
-	unsigned int xz_i = 2 * blockDim.x;
-	unsigned int yy_i = 3 * blockDim.x;
-	unsigned int yz_i = 4 * blockDim.x;
-	unsigned int zz_i = 5 * blockDim.x;
-
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	unsigned int index = 0;
-
-	xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
-		yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
-
-	if (i < n)
-	{
-		xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
-		xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
-		xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
-		yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
-		yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
-		zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
-	}
-	__syncthreads ();
-
-	for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if (threadIdx.x < offset )
-		{
-			index = threadIdx.x + offset;
-			xx [ threadIdx.x ] += xx [ index ];
-			xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
-			xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
-			yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
-			yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
-			zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0)
-	{
-		output[0] = xx[0];
-		output[1] = xy[xy_i];
-		output[2] = xz[xz_i];
-		output[3] = xz[yy_i];
-		output[4] = xz[yz_i];
-		output[5] = xz[zz_i];
-	}
+    extern __shared__ real xx[];
+    extern __shared__ real xy[];
+    extern __shared__ real xz[];
+    extern __shared__ real yy[];
+    extern __shared__ real yz[];
+    extern __shared__ real zz[];
+
+    unsigned int xx_i = threadIdx.x;
+    unsigned int xy_i = blockDim.x;
+    unsigned int xz_i = 2 * blockDim.x;
+    unsigned int yy_i = 3 * blockDim.x;
+    unsigned int yz_i = 4 * blockDim.x;
+    unsigned int zz_i = 5 * blockDim.x;
+
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int index = 0;
+
+    xx[xx_i] = xy [xy_i + threadIdx.x] = xz[xz_i + threadIdx.x] = 
+        yy[yy_i + threadIdx.x] = yz[yz_i + threadIdx.x] = zz[zz_i + threadIdx.x] = 0;
+
+    if (i < n)
+    {
+        xx [ xx_i ] = input [ threadIdx.x*6 + 0 ];
+        xy [ xy_i + threadIdx.x ] = input [ threadIdx.x*6 + 1 ];
+        xz [ xz_i + threadIdx.x ] = input [ threadIdx.x*6 + 2 ];
+        yy [ yy_i + threadIdx.x ] = input [ threadIdx.x*6 + 3 ];
+        yz [ yz_i + threadIdx.x ] = input [ threadIdx.x*6 + 4 ];
+        zz [ zz_i + threadIdx.x ] = input [ threadIdx.x*6 + 5 ];
+    }
+    __syncthreads ();
+
+    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if (threadIdx.x < offset )
+        {
+            index = threadIdx.x + offset;
+            xx [ threadIdx.x ] += xx [ index ];
+            xy [ xy_i + threadIdx.x ] += xy [ xy_i + index ];
+            xz [ xz_i + threadIdx.x ] += xz [ xz_i + index ];
+            yy [ yy_i + threadIdx.x ] += yy [ yy_i + index ];
+            yz [ yz_i + threadIdx.x ] += yz [ yz_i + index ];
+            zz [ zz_i + threadIdx.x ] += zz [ zz_i + index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0)
+    {
+        output[0] = xx[0];
+        output[1] = xy[xy_i];
+        output[2] = xz[xz_i];
+        output[3] = xz[yy_i];
+        output[4] = xz[yz_i];
+        output[5] = xz[zz_i];
+    }
 }
diff --git a/PuReMD-GPU/src/cuda_copy.cu b/PuReMD-GPU/src/cuda_copy.cu
index 6ce94690..2db79e37 100644
--- a/PuReMD-GPU/src/cuda_copy.cu
+++ b/PuReMD-GPU/src/cuda_copy.cu
@@ -26,70 +26,70 @@
 
 void Sync_Host_Device (grid *host, grid *dev, enum cudaMemcpyKind dir)
 {
-	copy_host_device (host->top, dev->top, 
-			INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP);
+    copy_host_device (host->top, dev->top, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_TOP);
 
-	copy_host_device (host->mark, dev->mark, 
-			INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK);
+    copy_host_device (host->mark, dev->mark, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_MARK);
 
-	copy_host_device (host->start, dev->start, 
-			INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START);
+    copy_host_device (host->start, dev->start, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_START);
 
-	copy_host_device (host->end, dev->end, 
-			INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END);
+    copy_host_device (host->end, dev->end, 
+            INT_SIZE * host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_END);
 
-	copy_host_device (host->atoms, dev->atoms, 
-			INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS);
+    copy_host_device (host->atoms, dev->atoms, 
+            INT_SIZE * host->max_atoms*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_ATOMS);
 
-	copy_host_device (host->nbrs, dev->nbrs, 
-			IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS);
+    copy_host_device (host->nbrs, dev->nbrs, 
+            IVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS);
 
-	copy_host_device (host->nbrs_cp, dev->nbrs_cp, 
-			RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP);
+    copy_host_device (host->nbrs_cp, dev->nbrs_cp, 
+            RVEC_SIZE * host->max_nbrs*host->ncell[0]*host->ncell[1]*host->ncell[2], dir, RES_GRID_NBRS_CP);
 }
 
 
 void Sync_Host_Device (reax_system *sys, enum cudaMemcpyKind dir)
 {
 
-	copy_host_device (sys->atoms, sys->d_atoms, 
-			REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS);
+    copy_host_device (sys->atoms, sys->d_atoms, 
+            REAX_ATOM_SIZE * sys->N, dir, RES_SYSTEM_ATOMS);
 
-	copy_host_device (&(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX );
+    copy_host_device (&(sys->box), sys->d_box, SIMULATION_BOX_SIZE, dir, RES_SYSTEM_SIMULATION_BOX );
 
-	//synch bonds here.
-	copy_host_device (sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, 
-			dir, RES_REAX_INT_SBP );
-	copy_host_device (sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), 
-			dir, RES_REAX_INT_TBP );
-	copy_host_device (sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
-			dir, RES_REAX_INT_THBP );
-	copy_host_device (sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
-			dir, RES_REAX_INT_HBP );
-	copy_host_device (sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4),
-			dir, RES_REAX_INT_FBP );
+    //synch bonds here.
+    copy_host_device (sys->reaxprm.sbp, sys->reaxprm.d_sbp, SBP_SIZE * sys->reaxprm.num_atom_types, 
+            dir, RES_REAX_INT_SBP );
+    copy_host_device (sys->reaxprm.tbp, sys->reaxprm.d_tbp, TBP_SIZE * pow (sys->reaxprm.num_atom_types, 2), 
+            dir, RES_REAX_INT_TBP );
+    copy_host_device (sys->reaxprm.thbp, sys->reaxprm.d_thbp, THBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
+            dir, RES_REAX_INT_THBP );
+    copy_host_device (sys->reaxprm.hbp, sys->reaxprm.d_hbp, HBP_SIZE * pow (sys->reaxprm.num_atom_types, 3), 
+            dir, RES_REAX_INT_HBP );
+    copy_host_device (sys->reaxprm.fbp, sys->reaxprm.d_fbp, FBP_SIZE * pow (sys->reaxprm.num_atom_types, 4),
+            dir, RES_REAX_INT_FBP );
 
-	copy_host_device (sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, 
-			dir, RES_GLOBAL_PARAMS );
+    copy_host_device (sys->reaxprm.gp.l, sys->reaxprm.d_gp.l, REAL_SIZE * sys->reaxprm.gp.n_global, 
+            dir, RES_GLOBAL_PARAMS );
 
-	sys->reaxprm.d_gp.n_global = sys->reaxprm.gp.n_global; 
-	sys->reaxprm.d_gp.vdw_type = sys->reaxprm.gp.vdw_type; 
+    sys->reaxprm.d_gp.n_global = sys->reaxprm.gp.n_global; 
+    sys->reaxprm.d_gp.vdw_type = sys->reaxprm.gp.vdw_type; 
 }
 
 void Sync_Host_Device (simulation_data *host, simulation_data *dev, enum cudaMemcpyKind dir)
 {
-	copy_host_device (host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA );
+    copy_host_device (host, dev, SIMULATION_DATA_SIZE, dir, RES_SIMULATION_DATA );
 }
 
 void Sync_Host_Device (sparse_matrix *L, sparse_matrix *U, enum cudaMemcpyKind dir )
 {
-	copy_host_device ( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-	copy_host_device ( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-	copy_host_device ( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY );
+    copy_host_device ( L->start, dev_workspace->L.start, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device ( L->end, dev_workspace->L.end, INT_SIZE * (L->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device ( L->entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * L->m, dir, RES_SPARSE_MATRIX_ENTRY );
 
-	copy_host_device ( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-	copy_host_device ( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
-	copy_host_device ( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY );
+    copy_host_device ( U->start, dev_workspace->U.start, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device ( U->end, dev_workspace->U.end, INT_SIZE * (U->n + 1), dir, RES_SPARSE_MATRIX_INDEX );
+    copy_host_device ( U->entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * U->m, dir, RES_SPARSE_MATRIX_ENTRY );
 }
 
 void Sync_Host_Device (output_controls *, control_params *, enum cudaMemcpyKind)
@@ -98,86 +98,86 @@ void Sync_Host_Device (output_controls *, control_params *, enum cudaMemcpyKind)
 
 void Sync_Host_Device (control_params *host, control_params *device, enum cudaMemcpyKind)
 {
-	copy_host_device (host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
+    copy_host_device (host, device, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
 }
 
 
 void Prep_Device_For_Output (reax_system *system, simulation_data *data )
 {
-	//int size = sizeof (simulation_data) - (2*sizeof (reax_timing) + sizeof (void *));
-	//unsigned long start_address = (unsigned long)data->d_simulation_data + (unsigned long) (2 * INT_SIZE + REAL_SIZE);
-
-	//fprintf (stderr, "Address of Simulation data (address) --> %ld \n", data->d_simulation_data );
-	//fprintf (stderr, "Size of simulation_data --> %d \n", sizeof (simulation_data));
-	//fprintf (stderr, "size to copy --> %d \n", size );
-	//copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-
-	//Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
-	/*
-	   copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, 
-	   REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-	   copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
-	   REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-	   copy_host_device (&data->int_press, &((simulation_data *)data->d_simulation_data)->int_press, 
-	   3*(RVEC_SIZE) + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-
-	   copy_host_device (&data->therm.T, &((simulation_data *)data->d_simulation_data)->therm.T, 
-	   REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-	 */
-
-	simulation_data local_data;
-	copy_host_device (&local_data, (simulation_data *)data->d_simulation_data, 
-			SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-	data->E_BE = local_data.E_BE;
-	data->E_Ov = local_data.E_Ov;
-	data->E_Un = local_data.E_Un;
-	data->E_Lp = local_data.E_Lp;
-	data->E_Ang = local_data.E_Ang;
-	data->E_Pen = local_data.E_Pen;
-	data->E_Coa = local_data.E_Coa;
-	data->E_HB = local_data.E_HB;
-	data->E_Tor = local_data.E_Tor;
-	data->E_Con = local_data.E_Con;
-	data->E_vdW = local_data.E_vdW;
-	data->E_Ele = local_data.E_Ele;
-	data->E_Kin = local_data.E_Kin;
-	rvec_Copy (data->int_press, local_data.int_press);
-	rvec_Copy (data->ext_press, local_data.ext_press);
-	data->kin_press =  local_data.kin_press;
-	data->therm.T = local_data.therm.T;
-
-	//Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost );
-	Sync_Host_Device (system, cudaMemcpyDeviceToHost );
+    //int size = sizeof (simulation_data) - (2*sizeof (reax_timing) + sizeof (void *));
+    //unsigned long start_address = (unsigned long)data->d_simulation_data + (unsigned long) (2 * INT_SIZE + REAL_SIZE);
+
+    //fprintf (stderr, "Address of Simulation data (address) --> %ld \n", data->d_simulation_data );
+    //fprintf (stderr, "Size of simulation_data --> %d \n", sizeof (simulation_data));
+    //fprintf (stderr, "size to copy --> %d \n", size );
+    //copy_host_device (data, (simulation_data *)data->d_simulation_data, size, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+
+    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyDeviceToHost );
+    /*
+       copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, 
+       REAL_SIZE * 13, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+       copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
+       REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+       copy_host_device (&data->int_press, &((simulation_data *)data->d_simulation_data)->int_press, 
+       3*(RVEC_SIZE) + REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+
+       copy_host_device (&data->therm.T, &((simulation_data *)data->d_simulation_data)->therm.T, 
+       REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+     */
+
+    simulation_data local_data;
+    copy_host_device (&local_data, (simulation_data *)data->d_simulation_data, 
+            SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+    data->E_BE = local_data.E_BE;
+    data->E_Ov = local_data.E_Ov;
+    data->E_Un = local_data.E_Un;
+    data->E_Lp = local_data.E_Lp;
+    data->E_Ang = local_data.E_Ang;
+    data->E_Pen = local_data.E_Pen;
+    data->E_Coa = local_data.E_Coa;
+    data->E_HB = local_data.E_HB;
+    data->E_Tor = local_data.E_Tor;
+    data->E_Con = local_data.E_Con;
+    data->E_vdW = local_data.E_vdW;
+    data->E_Ele = local_data.E_Ele;
+    data->E_Kin = local_data.E_Kin;
+    rvec_Copy (data->int_press, local_data.int_press);
+    rvec_Copy (data->ext_press, local_data.ext_press);
+    data->kin_press =  local_data.kin_press;
+    data->therm.T = local_data.therm.T;
+
+    //Sync_Host_Device (&system.g, &system.d_g, cudaMemcpyDeviceToHost );
+    Sync_Host_Device (system, cudaMemcpyDeviceToHost );
 }
 
 void Sync_Host_Device (list *host, list *device, int type)
 {
-	//list is already allocated -- discard it first
-	if (host->n > 0)
-		Delete_List (host, TYP_HOST);
-
-	//memory is allocated on the host
-	Make_List(device->n, device->num_intrs, type, host, TYP_HOST );
-
-	//memcpy the entries from device to host
-	copy_host_device (host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX );
-	copy_host_device (host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX );
-
-	switch (type)
-	{
-		case TYP_BOND:
-			copy_host_device (host->select.bond_list, device->select.bond_list, 
-					BOND_DATA_SIZE * device->num_intrs, cudaMemcpyDeviceToHost, LIST_BOND_DATA );
-			break;
-
-		case TYP_THREE_BODY:
-			copy_host_device (host->select.three_body_list, device->select.three_body_list, 
-					sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA );
-			break;
-
-		default:
-			fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type );
-			exit (1);
-			break;
-	}
+    //list is already allocated -- discard it first
+    if (host->n > 0)
+        Delete_List (host, TYP_HOST);
+
+    //memory is allocated on the host
+    Make_List(device->n, device->num_intrs, type, host, TYP_HOST );
+
+    //memcpy the entries from device to host
+    copy_host_device (host->index, device->index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_INDEX );
+    copy_host_device (host->end_index, device->end_index, INT_SIZE * device->n, cudaMemcpyDeviceToHost, LIST_END_INDEX );
+
+    switch (type)
+    {
+        case TYP_BOND:
+            copy_host_device (host->select.bond_list, device->select.bond_list, 
+                    BOND_DATA_SIZE * device->num_intrs, cudaMemcpyDeviceToHost, LIST_BOND_DATA );
+            break;
+
+        case TYP_THREE_BODY:
+            copy_host_device (host->select.three_body_list, device->select.three_body_list, 
+                    sizeof (three_body_interaction_data )* device->num_intrs, cudaMemcpyDeviceToHost, LIST_THREE_BODY_DATA );
+            break;
+
+        default:
+            fprintf (stderr, "Unknown list synching from device to host ---- > %d \n", type );
+            exit (1);
+            break;
+    }
 }
diff --git a/PuReMD-GPU/src/cuda_init.cu b/PuReMD-GPU/src/cuda_init.cu
index 9574a275..09515038 100644
--- a/PuReMD-GPU/src/cuda_init.cu
+++ b/PuReMD-GPU/src/cuda_init.cu
@@ -29,274 +29,274 @@
 
 void Cuda_Init_System ( reax_system *system)
 {
-	cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS );	
+    cuda_malloc ( (void **) &system->d_atoms, system->N * REAX_ATOM_SIZE, 1, RES_SYSTEM_ATOMS );    
 
-	cuda_malloc ( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX );
+    cuda_malloc ( (void **) &system->d_box, sizeof (simulation_box), 1, RES_SYSTEM_SIMULATION_BOX );
 
-	//interaction parameters
-	cuda_malloc ((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE,
-			1, RES_REAX_INT_SBP );
+    //interaction parameters
+    cuda_malloc ((void **) &system->reaxprm.d_sbp, system->reaxprm.num_atom_types * SBP_SIZE,
+            1, RES_REAX_INT_SBP );
 
-	cuda_malloc ((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, 
-			1, RES_REAX_INT_TBP );
+    cuda_malloc ((void **) &system->reaxprm.d_tbp, pow (system->reaxprm.num_atom_types, 2) * TBP_SIZE, 
+            1, RES_REAX_INT_TBP );
 
-	cuda_malloc ((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE,
-			1, RES_REAX_INT_THBP );
+    cuda_malloc ((void **) &system->reaxprm.d_thbp, pow (system->reaxprm.num_atom_types, 3) * THBP_SIZE,
+            1, RES_REAX_INT_THBP );
 
-	cuda_malloc ((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE,
-			1, RES_REAX_INT_HBP );
+    cuda_malloc ((void **) &system->reaxprm.d_hbp, pow (system->reaxprm.num_atom_types, 3) * HBP_SIZE,
+            1, RES_REAX_INT_HBP );
 
-	cuda_malloc ((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE,
-			1, RES_REAX_INT_FBP );
+    cuda_malloc ((void **) &system->reaxprm.d_fbp, pow (system->reaxprm.num_atom_types, 4) * FBP_SIZE,
+            1, RES_REAX_INT_FBP );
 
-	cuda_malloc ((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS );
+    cuda_malloc ((void **) &system->reaxprm.d_gp.l, REAL_SIZE * system->reaxprm.gp.n_global, 1, RES_GLOBAL_PARAMS );
 
-	system->reaxprm.d_gp.n_global = 0;
-	system->reaxprm.d_gp.vdw_type = 0;
+    system->reaxprm.d_gp.n_global = 0;
+    system->reaxprm.d_gp.vdw_type = 0;
 }
 
 void Cuda_Init_Control (control_params *control)
 {
-	cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS );
-	copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
+    cuda_malloc ((void **)&control->d_control, CONTROL_PARAMS_SIZE, 1, RES_CONTROL_PARAMS );
+    copy_host_device (control, control->d_control, CONTROL_PARAMS_SIZE, cudaMemcpyHostToDevice, RES_CONTROL_PARAMS );
 }
 
 void Cuda_Init_Simulation_Data (simulation_data *data)
 {
-	cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA );
+    cuda_malloc ((void **) &(data->d_simulation_data), SIMULATION_DATA_SIZE, 1, RES_SIMULATION_DATA );
 }
 
 GLOBAL void Initialize_Grid (ivec *nbrs, rvec *nbrs_cp, int N)
 {
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (index >= N) return;
+    if (index >= N) return;
 
-	nbrs[index][0] = -1;
-	nbrs[index][1] = -1;
-	nbrs[index][2] = -1;
-	nbrs_cp[index][0] = -1;
-	nbrs_cp[index][1] = -1;
-	nbrs_cp[index][2] = -1;
+    nbrs[index][0] = -1;
+    nbrs[index][1] = -1;
+    nbrs[index][2] = -1;
+    nbrs_cp[index][0] = -1;
+    nbrs_cp[index][1] = -1;
+    nbrs_cp[index][2] = -1;
 }
 
 void Cuda_Init_Grid (grid *host, grid *dev)
 {
-	int total = host->ncell[0] * host->ncell[1] * host->ncell[2];
-	dev->max_atoms = host->max_atoms;
-	dev->max_nbrs = host->max_nbrs;
-	dev->total = host->total;
-	dev->max_cuda_nbrs = host->max_cuda_nbrs;
-	dev->cell_size = host->cell_size;
-
-	ivec_Copy (dev->spread, host->spread);
-	ivec_Copy (dev->ncell, host->ncell);
-	rvec_Copy (dev->len, host->len);
-	rvec_Copy (dev->inv_len, host->inv_len);
-
-	cuda_malloc ((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP );
-	cuda_malloc ((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK );
-	cuda_malloc ((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START );
-	cuda_malloc ((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END );
-
-	cuda_malloc ((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS );
-	cuda_malloc ((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS );
-	cuda_malloc ((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP );
-
-	int block_size = 512;
-	int blocks = (total*dev->max_nbrs) / block_size + ((total*dev->max_nbrs) % block_size == 0 ? 0 : 1);
-
-	Initialize_Grid <<<blocks, block_size>>>
-		(dev->nbrs, dev->nbrs_cp, total * host->max_nbrs );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    int total = host->ncell[0] * host->ncell[1] * host->ncell[2];
+    dev->max_atoms = host->max_atoms;
+    dev->max_nbrs = host->max_nbrs;
+    dev->total = host->total;
+    dev->max_cuda_nbrs = host->max_cuda_nbrs;
+    dev->cell_size = host->cell_size;
+
+    ivec_Copy (dev->spread, host->spread);
+    ivec_Copy (dev->ncell, host->ncell);
+    rvec_Copy (dev->len, host->len);
+    rvec_Copy (dev->inv_len, host->inv_len);
+
+    cuda_malloc ((void **) &dev->top, INT_SIZE * total , 1, RES_GRID_TOP );
+    cuda_malloc ((void **) &dev->mark, INT_SIZE * total , 1, RES_GRID_MARK );
+    cuda_malloc ((void **) &dev->start, INT_SIZE * total , 1, RES_GRID_START );
+    cuda_malloc ((void **) &dev->end, INT_SIZE * total , 1, RES_GRID_END );
+
+    cuda_malloc ((void **) &dev->atoms, INT_SIZE * total * host->max_atoms, 1, RES_GRID_ATOMS );
+    cuda_malloc ((void **) &dev->nbrs, IVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS );
+    cuda_malloc ((void **) &dev->nbrs_cp, RVEC_SIZE * total * host->max_nbrs, 0, RES_GRID_NBRS_CP );
+
+    int block_size = 512;
+    int blocks = (total*dev->max_nbrs) / block_size + ((total*dev->max_nbrs) % block_size == 0 ? 0 : 1);
+
+    Initialize_Grid <<<blocks, block_size>>>
+        (dev->nbrs, dev->nbrs_cp, total * host->max_nbrs );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 GLOBAL void Init_Workspace_Arrays (single_body_parameters *sbp, reax_atom *atoms, 
-		static_storage workspace, int N)
+        static_storage workspace, int N)
 {
 
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if(i >= N) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= N) return;
 
-	workspace.Hdia_inv[i] = 1./sbp[atoms[i].type].eta;
-	workspace.b_s[i] = -sbp[ atoms[i].type ].chi;
-	workspace.b_t[i] = -1.0;
+    workspace.Hdia_inv[i] = 1./sbp[atoms[i].type].eta;
+    workspace.b_s[i] = -sbp[ atoms[i].type ].chi;
+    workspace.b_t[i] = -1.0;
 
-	workspace.b[i] = -sbp[ atoms[i].type ].chi;
-	workspace.b[i+N] = -1.0;
+    workspace.b[i] = -sbp[ atoms[i].type ].chi;
+    workspace.b[i+N] = -1.0;
 }
 
 GLOBAL void Init_Map_Serials (int *input, int N)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
 
-	input[i] = -1;
+    input[i] = -1;
 }
 
 void Cuda_Init_Workspace_System (reax_system *system, static_storage *workspace )
 {
-	int blocks, block_size = BLOCK_SIZE;
-	compute_blocks (&blocks, &block_size, MAX_ATOM_ID );
+    int blocks, block_size = BLOCK_SIZE;
+    compute_blocks (&blocks, &block_size, MAX_ATOM_ID );
 
-	cuda_malloc ( (void **) &workspace->map_serials, INT_SIZE * MAX_ATOM_ID, 0, RES_STORAGE_MAP_SERIALS );
+    cuda_malloc ( (void **) &workspace->map_serials, INT_SIZE * MAX_ATOM_ID, 0, RES_STORAGE_MAP_SERIALS );
 
-	Init_Map_Serials <<< blocks, block_size >>> 
-		( workspace->map_serials, MAX_ATOM_ID );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Init_Map_Serials <<< blocks, block_size >>> 
+        ( workspace->map_serials, MAX_ATOM_ID );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	cuda_malloc ( (void **) &workspace->orig_id, INT_SIZE * system->N, 0, RES_STORAGE_ORIG_ID );
-	cuda_malloc ( (void **) &workspace->restricted, INT_SIZE * system->N, 0, RES_STORAGE_RESTRICTED );
-	cuda_malloc ( (void **) &workspace->restricted_list, system->N * MAX_RESTRICT * INT_SIZE, 0, RES_STORAGE_RESTRICTED_LIST );
+    cuda_malloc ( (void **) &workspace->orig_id, INT_SIZE * system->N, 0, RES_STORAGE_ORIG_ID );
+    cuda_malloc ( (void **) &workspace->restricted, INT_SIZE * system->N, 0, RES_STORAGE_RESTRICTED );
+    cuda_malloc ( (void **) &workspace->restricted_list, system->N * MAX_RESTRICT * INT_SIZE, 0, RES_STORAGE_RESTRICTED_LIST );
 }
 
 
 void Cuda_Init_Workspace( reax_system *system, control_params *control,
-		static_storage *workspace )
+        static_storage *workspace )
 {
-	int i;
-
-	/* Allocate space for hydrogen bond list */
-	cuda_malloc ((void **) &workspace->hbond_index, 			system->N * INT_SIZE, 0, RES_STORAGE_HBOND_INDEX );
-
-	/* bond order related storage  */
-	cuda_malloc ((void **) &workspace->total_bond_order, 	system->N * REAL_SIZE, 0, RES_STORAGE_TOTAL_BOND_ORDER );
-	cuda_malloc ((void **) &workspace->Deltap, 				system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP );
-	cuda_malloc ((void **) &workspace->Deltap_boc, 			system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP_BOC );
-	cuda_malloc ((void **) &workspace->dDeltap_self,  		system->N * RVEC_SIZE, 0, RES_STORAGE_DDELTAP_SELF );
-
-	cuda_malloc ((void **) &workspace->Delta,  				system->N * REAL_SIZE, 0, RES_STORAGE_DELTA );
-	cuda_malloc ((void **) &workspace->Delta_lp,           system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP );
-	cuda_malloc ((void **) &workspace->Delta_lp_temp,      system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP_TEMP );
-	cuda_malloc ((void **) &workspace->dDelta_lp,          system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP );
-	cuda_malloc ((void **) &workspace->dDelta_lp_temp,     system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP_TEMP );
-	cuda_malloc ((void **) &workspace->Delta_e,            system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_E );
-	cuda_malloc ((void **) &workspace->Delta_boc,          system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_BOC );
-	cuda_malloc ((void **) &workspace->nlp,                system->N * REAL_SIZE, 0, RES_STORAGE_NLP );
-	cuda_malloc ((void **) &workspace->nlp_temp,           system->N * REAL_SIZE, 0, RES_STORAGE_NLP_TEMP );
-	cuda_malloc ((void **) &workspace->Clp,                system->N * REAL_SIZE, 0, RES_STORAGE_CLP );
-	cuda_malloc ((void **) &workspace->CdDelta,            system->N * REAL_SIZE, 0, RES_STORAGE_CDDELTA );
-	cuda_malloc ((void **) &workspace->vlpex,              system->N * REAL_SIZE, 0, RES_STORAGE_VLPEX );
-
-	/* QEq storage */
-	workspace->H.start        = NULL;
-	//cuda_malloc ((void **) &workspace->H.start,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
-	workspace->L.start        = NULL;
-	//cuda_malloc ((void **) &workspace->L.start,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
-	workspace->U.start        = NULL;
-	//cuda_malloc ((void **) &workspace->U.start,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
-
-	workspace->H.end			= NULL;
-	//cuda_malloc ((void **) &workspace->H.end,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
-	workspace->L.end			= NULL;
-	//cuda_malloc ((void **) &workspace->L.end,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
-	workspace->U.end			= NULL;
-	//cuda_malloc ((void **) &workspace->U.end,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
-
-	workspace->H.entries     = NULL;
-	workspace->L.entries     = NULL;
-	workspace->U.entries     = NULL;
-
-	cuda_malloc ((void **) &workspace->droptol,  system->N * REAL_SIZE, 1, RES_STORAGE_DROPTOL );
-	cuda_malloc ((void **) &workspace->w,        system->N * REAL_SIZE, 1, RES_STORAGE_W );
-	cuda_malloc ((void **) &workspace->Hdia_inv, system->N * REAL_SIZE, 1, RES_STORAGE_HDIA_INV );
-	cuda_malloc ((void **) &workspace->b,        system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B );
-	cuda_malloc ((void **) &workspace->b_s,      system->N * REAL_SIZE, 1, RES_STORAGE_B_S );
-	cuda_malloc ((void **) &workspace->b_t,      system->N * REAL_SIZE, 1, RES_STORAGE_B_T );
-	cuda_malloc ((void **) &workspace->b_prc,    system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRC );
-	cuda_malloc ((void **) &workspace->b_prm,    system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRM );
-	cuda_malloc ((void **) &workspace->s_t,      system->N * 2 * REAL_SIZE, 1, RES_STORAGE_S_T );
-	cuda_malloc ((void **) &workspace->s,        5 * system->N * REAL_SIZE, 1, RES_STORAGE_S );
-	cuda_malloc ((void **) &workspace->t,        5 * system->N * REAL_SIZE, 1, RES_STORAGE_T );
-
-	Init_Workspace_Arrays  <<<BLOCKS, BLOCK_SIZE>>>
-		(system->reaxprm.d_sbp, system->d_atoms, *workspace, system->N );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	/* GMRES storage */
-	cuda_malloc ((void **) &workspace->y,  (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Y );
-	cuda_malloc ((void **) &workspace->z,  (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Z );
-	cuda_malloc ((void **) &workspace->g,  (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_G );
-	cuda_malloc ((void **) &workspace->hs, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HS );
-	cuda_malloc ((void **) &workspace->hc, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HC );
-
-	cuda_malloc ((void **) &workspace->rn, (RESTART+1)*system->N * 2 * REAL_SIZE, 1, RES_STORAGE_RN );
-	cuda_malloc ((void **) &workspace->v,  (RESTART+1)*system->N * REAL_SIZE, 1, RES_STORAGE_V );
-	cuda_malloc ((void **) &workspace->h,  (RESTART+1)*(RESTART+1) * REAL_SIZE, 1, RES_STORAGE_H );
-
-	/* CG storage */
-	cuda_malloc ((void **) &workspace->r, system->N * REAL_SIZE, 1, RES_STORAGE_R );
-	cuda_malloc ((void **) &workspace->d, system->N * REAL_SIZE, 1, RES_STORAGE_D );
-	cuda_malloc ((void **) &workspace->q, system->N * REAL_SIZE, 1, RES_STORAGE_Q );
-	cuda_malloc ((void **) &workspace->p, system->N * REAL_SIZE, 1, RES_STORAGE_P );
-
-
-	/* integrator storage */
-	cuda_malloc ((void **) &workspace->a,   	system->N * RVEC_SIZE, 1, RES_STORAGE_A );
-	cuda_malloc ((void **) &workspace->f_old,  system->N * RVEC_SIZE, 1, RES_STORAGE_F_OLD );
-	cuda_malloc ((void **) &workspace->v_const,system->N * RVEC_SIZE, 1, RES_STORAGE_V_CONST );
-
-	/* storage for analysis */
-	if( control->molec_anal || control->diffusion_coef )
-	{
-		cuda_malloc ((void **) &workspace->mark,   	system->N * INT_SIZE, 1, RES_STORAGE_MARK );
-		cuda_malloc ((void **) &workspace->old_mark, system->N * INT_SIZE, 1, RES_STORAGE_OLD_MARK);
-	}
-	else
-		workspace->mark = workspace->old_mark = NULL;
-
-	if( control->diffusion_coef )
-		cuda_malloc ((void **) &workspace->x_old,  system->N * RVEC_SIZE, 1, RES_STORAGE_X_OLD );
-	else workspace->x_old = NULL;
-
-	workspace->realloc.num_far = -1;
-	workspace->realloc.Htop = -1;
-	workspace->realloc.hbonds = -1;
-	workspace->realloc.bonds = -1;
-	workspace->realloc.num_3body = -1;
-	workspace->realloc.gcell_atoms = -1;
-
-	Cuda_Reset_Workspace( system, workspace );
+    int i;
+
+    /* Allocate space for hydrogen bond list */
+    cuda_malloc ((void **) &workspace->hbond_index,             system->N * INT_SIZE, 0, RES_STORAGE_HBOND_INDEX );
+
+    /* bond order related storage  */
+    cuda_malloc ((void **) &workspace->total_bond_order,     system->N * REAL_SIZE, 0, RES_STORAGE_TOTAL_BOND_ORDER );
+    cuda_malloc ((void **) &workspace->Deltap,                 system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP );
+    cuda_malloc ((void **) &workspace->Deltap_boc,             system->N * REAL_SIZE, 0, RES_STORAGE_DELTAP_BOC );
+    cuda_malloc ((void **) &workspace->dDeltap_self,          system->N * RVEC_SIZE, 0, RES_STORAGE_DDELTAP_SELF );
+
+    cuda_malloc ((void **) &workspace->Delta,                  system->N * REAL_SIZE, 0, RES_STORAGE_DELTA );
+    cuda_malloc ((void **) &workspace->Delta_lp,           system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP );
+    cuda_malloc ((void **) &workspace->Delta_lp_temp,      system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_LP_TEMP );
+    cuda_malloc ((void **) &workspace->dDelta_lp,          system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP );
+    cuda_malloc ((void **) &workspace->dDelta_lp_temp,     system->N * REAL_SIZE, 0, RES_STORAGE_DDELTA_LP_TEMP );
+    cuda_malloc ((void **) &workspace->Delta_e,            system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_E );
+    cuda_malloc ((void **) &workspace->Delta_boc,          system->N * REAL_SIZE, 0, RES_STORAGE_DELTA_BOC );
+    cuda_malloc ((void **) &workspace->nlp,                system->N * REAL_SIZE, 0, RES_STORAGE_NLP );
+    cuda_malloc ((void **) &workspace->nlp_temp,           system->N * REAL_SIZE, 0, RES_STORAGE_NLP_TEMP );
+    cuda_malloc ((void **) &workspace->Clp,                system->N * REAL_SIZE, 0, RES_STORAGE_CLP );
+    cuda_malloc ((void **) &workspace->CdDelta,            system->N * REAL_SIZE, 0, RES_STORAGE_CDDELTA );
+    cuda_malloc ((void **) &workspace->vlpex,              system->N * REAL_SIZE, 0, RES_STORAGE_VLPEX );
+
+    /* QEq storage */
+    workspace->H.start        = NULL;
+    //cuda_malloc ((void **) &workspace->H.start,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
+    workspace->L.start        = NULL;
+    //cuda_malloc ((void **) &workspace->L.start,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
+    workspace->U.start        = NULL;
+    //cuda_malloc ((void **) &workspace->U.start,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
+
+    workspace->H.end            = NULL;
+    //cuda_malloc ((void **) &workspace->H.end,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
+    workspace->L.end            = NULL;
+    //cuda_malloc ((void **) &workspace->L.end,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
+    workspace->U.end            = NULL;
+    //cuda_malloc ((void **) &workspace->U.end,              (system->N+1)* INT_SIZE, 0, RES_SPARSE_MATRIX_INDEX );
+
+    workspace->H.entries     = NULL;
+    workspace->L.entries     = NULL;
+    workspace->U.entries     = NULL;
+
+    cuda_malloc ((void **) &workspace->droptol,  system->N * REAL_SIZE, 1, RES_STORAGE_DROPTOL );
+    cuda_malloc ((void **) &workspace->w,        system->N * REAL_SIZE, 1, RES_STORAGE_W );
+    cuda_malloc ((void **) &workspace->Hdia_inv, system->N * REAL_SIZE, 1, RES_STORAGE_HDIA_INV );
+    cuda_malloc ((void **) &workspace->b,        system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B );
+    cuda_malloc ((void **) &workspace->b_s,      system->N * REAL_SIZE, 1, RES_STORAGE_B_S );
+    cuda_malloc ((void **) &workspace->b_t,      system->N * REAL_SIZE, 1, RES_STORAGE_B_T );
+    cuda_malloc ((void **) &workspace->b_prc,    system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRC );
+    cuda_malloc ((void **) &workspace->b_prm,    system->N * 2 * REAL_SIZE, 1, RES_STORAGE_B_PRM );
+    cuda_malloc ((void **) &workspace->s_t,      system->N * 2 * REAL_SIZE, 1, RES_STORAGE_S_T );
+    cuda_malloc ((void **) &workspace->s,        5 * system->N * REAL_SIZE, 1, RES_STORAGE_S );
+    cuda_malloc ((void **) &workspace->t,        5 * system->N * REAL_SIZE, 1, RES_STORAGE_T );
+
+    Init_Workspace_Arrays  <<<BLOCKS, BLOCK_SIZE>>>
+        (system->reaxprm.d_sbp, system->d_atoms, *workspace, system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    /* GMRES storage */
+    cuda_malloc ((void **) &workspace->y,  (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Y );
+    cuda_malloc ((void **) &workspace->z,  (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_Z );
+    cuda_malloc ((void **) &workspace->g,  (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_G );
+    cuda_malloc ((void **) &workspace->hs, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HS );
+    cuda_malloc ((void **) &workspace->hc, (RESTART+1) * REAL_SIZE, 1, RES_STORAGE_HC );
+
+    cuda_malloc ((void **) &workspace->rn, (RESTART+1)*system->N * 2 * REAL_SIZE, 1, RES_STORAGE_RN );
+    cuda_malloc ((void **) &workspace->v,  (RESTART+1)*system->N * REAL_SIZE, 1, RES_STORAGE_V );
+    cuda_malloc ((void **) &workspace->h,  (RESTART+1)*(RESTART+1) * REAL_SIZE, 1, RES_STORAGE_H );
+
+    /* CG storage */
+    cuda_malloc ((void **) &workspace->r, system->N * REAL_SIZE, 1, RES_STORAGE_R );
+    cuda_malloc ((void **) &workspace->d, system->N * REAL_SIZE, 1, RES_STORAGE_D );
+    cuda_malloc ((void **) &workspace->q, system->N * REAL_SIZE, 1, RES_STORAGE_Q );
+    cuda_malloc ((void **) &workspace->p, system->N * REAL_SIZE, 1, RES_STORAGE_P );
+
+
+    /* integrator storage */
+    cuda_malloc ((void **) &workspace->a,       system->N * RVEC_SIZE, 1, RES_STORAGE_A );
+    cuda_malloc ((void **) &workspace->f_old,  system->N * RVEC_SIZE, 1, RES_STORAGE_F_OLD );
+    cuda_malloc ((void **) &workspace->v_const,system->N * RVEC_SIZE, 1, RES_STORAGE_V_CONST );
+
+    /* storage for analysis */
+    if( control->molec_anal || control->diffusion_coef )
+    {
+        cuda_malloc ((void **) &workspace->mark,       system->N * INT_SIZE, 1, RES_STORAGE_MARK );
+        cuda_malloc ((void **) &workspace->old_mark, system->N * INT_SIZE, 1, RES_STORAGE_OLD_MARK);
+    }
+    else
+        workspace->mark = workspace->old_mark = NULL;
+
+    if( control->diffusion_coef )
+        cuda_malloc ((void **) &workspace->x_old,  system->N * RVEC_SIZE, 1, RES_STORAGE_X_OLD );
+    else workspace->x_old = NULL;
+
+    workspace->realloc.num_far = -1;
+    workspace->realloc.Htop = -1;
+    workspace->realloc.hbonds = -1;
+    workspace->realloc.bonds = -1;
+    workspace->realloc.num_3body = -1;
+    workspace->realloc.gcell_atoms = -1;
+
+    Cuda_Reset_Workspace( system, workspace );
 }
 
 void Cuda_Init_Workspace_Device ( static_storage *workspace )
 {
-	workspace->realloc.estimate_nbrs = -1;
-	workspace->realloc.num_far = -1;
-	workspace->realloc.Htop = -1;
-	workspace->realloc.hbonds = -1;
-	workspace->realloc.bonds = -1;
-	workspace->realloc.num_3body = -1;
-	workspace->realloc.gcell_atoms = -1;
+    workspace->realloc.estimate_nbrs = -1;
+    workspace->realloc.num_far = -1;
+    workspace->realloc.Htop = -1;
+    workspace->realloc.hbonds = -1;
+    workspace->realloc.bonds = -1;
+    workspace->realloc.num_3body = -1;
+    workspace->realloc.gcell_atoms = -1;
 }
 
 void Cuda_Init_Sparse_Matrix (sparse_matrix *matrix, int entries, int N)
 {
-	cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX );
-	cuda_malloc ((void **) &matrix->end, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX );
-	cuda_malloc ((void **) &matrix->entries, SPARSE_MATRIX_ENTRY_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY );
+    cuda_malloc ((void **) &matrix->start, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX );
+    cuda_malloc ((void **) &matrix->end, INT_SIZE * (N + 1), 1, RES_SPARSE_MATRIX_INDEX );
+    cuda_malloc ((void **) &matrix->entries, SPARSE_MATRIX_ENTRY_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY );
 
-	cuda_malloc ((void **) &matrix->j, INT_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY );
-	cuda_malloc ((void **) &matrix->val, REAL_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY );
+    cuda_malloc ((void **) &matrix->j, INT_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY );
+    cuda_malloc ((void **) &matrix->val, REAL_SIZE * entries, 1, RES_SPARSE_MATRIX_ENTRY );
 
 }
 
 void Cuda_Init_Scratch ()
 {
-	cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH );
-
-	/*
-	   cudaError_t retval = cudaErrorInvalidDevice;
-
-	   retval = cudaMallocHost ( (void **) &scratch, SCRATCH_SIZE );
-	//retval = cudaHostAlloc ((void **) &scratch, SCRATCH_SIZE, cudaHostAllocDefault );
-	if (retval != cudaSuccess)
-	{
-	fprintf (stderr, "Error allocating the scratch area on the device \n");
-	exit (0);
-	}
-	 */
+    cuda_malloc ((void **) &scratch, SCRATCH_SIZE, 0, RES_SCRATCH );
+
+    /*
+       cudaError_t retval = cudaErrorInvalidDevice;
+
+       retval = cudaMallocHost ( (void **) &scratch, SCRATCH_SIZE );
+    //retval = cudaHostAlloc ((void **) &scratch, SCRATCH_SIZE, cudaHostAllocDefault );
+    if (retval != cudaSuccess)
+    {
+    fprintf (stderr, "Error allocating the scratch area on the device \n");
+    exit (0);
+    }
+     */
 }
diff --git a/PuReMD-GPU/src/cuda_utils.cu b/PuReMD-GPU/src/cuda_utils.cu
index 05bc0e2d..2c632c05 100644
--- a/PuReMD-GPU/src/cuda_utils.cu
+++ b/PuReMD-GPU/src/cuda_utils.cu
@@ -26,112 +26,112 @@
 
 void cuda_malloc (void **ptr, int size, int memset, int err_code) {
 
-	cudaError_t retVal = cudaSuccess;
-
-	//fprintf (stderr, "&ptr --. %ld \n", &ptr);
-	//fprintf (stderr, "ptr --> %ld \n", ptr );
-
-	retVal = cudaMalloc (ptr, size);
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "Failed to allocate memory on device for the res: %d...  exiting with code: %d size: %d \n", 
-				err_code, retVal, size);
-		exit (err_code);
-	}  
-
-	//fprintf (stderr, "&ptr --. %ld \n", &ptr);
-	//fprintf (stderr, "ptr --> %ld \n", ptr );
-
-	if (memset) {
-		retVal = cudaMemset (*ptr, 0, size);
-		if (retVal != cudaSuccess) {
-			fprintf (stderr, "Failed to memset memory on device... exiting with code %d\n", 
-					err_code);
-			exit (err_code);
-		}
-	}  
+    cudaError_t retVal = cudaSuccess;
+    /* Allocate size bytes with cudaMalloc, storing the address in *ptr; any failure ends the process via exit (err_code). */
+    //fprintf (stderr, "&ptr --. %ld \n", &ptr);
+    //fprintf (stderr, "ptr --> %ld \n", ptr );
+
+    retVal = cudaMalloc (ptr, size);
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "Failed to allocate memory on device for the res: %d...  exiting with code: %d size: %d \n", 
+                err_code, retVal, size); /* note: the value printed as "code" is the CUDA status; the exit status is err_code */
+        exit (err_code);
+    }  
+
+    //fprintf (stderr, "&ptr --. %ld \n", &ptr);
+    //fprintf (stderr, "ptr --> %ld \n", ptr );
+    /* A non-zero memset argument requests that the new allocation be filled with zero bytes. */
+    if (memset) {
+        retVal = cudaMemset (*ptr, 0, size);
+        if (retVal != cudaSuccess) {
+            fprintf (stderr, "Failed to memset memory on device... exiting with code %d\n", 
+                    err_code);
+            exit (err_code);
+        }
+    }  
 }
 
 void cuda_free (void *ptr, int err_code) {
 
-	cudaError_t retVal = cudaSuccess;
-	if (!ptr) return;
+    cudaError_t retVal = cudaSuccess;
+    if (!ptr) return;
 
-	retVal = cudaFree (ptr);
+    retVal = cudaFree (ptr);
 
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "Failed to release memory on device for res %d... exiting with code %d -- Address %ld\n", 
-				err_code, retVal, ptr);
-		return;
-	}  
+    if (retVal != cudaSuccess) { /* cudaFree failed: report it and return to the caller (the process is not terminated) */
+        fprintf (stderr, "Failed to release memory on device for res %d... cuda code %d -- Address %p\n", 
+                err_code, retVal, ptr); /* %p is the matching conversion for a void pointer; %ld was undefined behaviour */
+        return;
+    }  
 }
 void cuda_memset (void *ptr, int data, size_t count, int err_code){
-	cudaError_t retVal = cudaSuccess;
-
-	retVal = cudaMemset (ptr, data, count);
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "ptr passed is %ld, value: %ld \n", ptr, &ptr);
-		fprintf (stderr, " size to memset: %d \n", count);
-		fprintf (stderr, " target data is : %d \n", data);
-		fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d\n", 
-				err_code, retVal);
-		exit (err_code);
-	}
+    cudaError_t retVal = cudaSuccess;
+    /* Set count bytes starting at ptr to the byte value data; on failure print the arguments and exit (err_code). */
+    retVal = cudaMemset (ptr, data, count);
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "ptr passed is %p, value: %p \n", ptr, (void *) &ptr); /* pointers are printed with %p */
+        fprintf (stderr, " size to memset: %zu \n", count); /* count is a size_t, so %zu rather than %d */
+        fprintf (stderr, " target data is : %d \n", data);
+        fprintf (stderr, "Failed to memset memory on device... exiting with code %d, cuda code %d\n", 
+                err_code, retVal);
+        exit (err_code);
+    }
 }
 
 void copy_host_device (void *host, void *dev, int size, enum cudaMemcpyKind dir, int resid)
 {
-	cudaError_t	retVal = cudaErrorNotReady;
-
-	if (dir == cudaMemcpyHostToDevice)
-		retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
-	else
-		retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
-
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "could not copy resource %d from host to device: reason %d \n",
-				resid, retVal);
-		exit (resid);
-	}
+    cudaError_t    retVal = cudaErrorNotReady;
+    /* Copy size bytes between host and dev: host -> dev for cudaMemcpyHostToDevice, dev -> host for any other dir. */
+    if (dir == cudaMemcpyHostToDevice)
+        retVal = cudaMemcpy (dev, host, size, cudaMemcpyHostToDevice);
+    else
+        retVal = cudaMemcpy (host, dev, size, cudaMemcpyDeviceToHost);
+    /* On failure, name the direction that was actually attempted, then exit with resid as the status. */
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "could not copy resource %d from %s: reason %d \n",
+                resid, (dir == cudaMemcpyHostToDevice) ? "host to device" : "device to host", retVal);
+        exit (resid);
+    }
 }
 
 void copy_device (void *dest, void *src, int size, int resid)
 {
-	cudaError_t	retVal = cudaErrorNotReady;
-
-	retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
-	if (retVal != cudaSuccess) {
-		fprintf (stderr, "could not copy resource %d from host to device: reason %d \n",
-				resid, retVal);
-		exit (resid);
-	}
+    cudaError_t    retVal = cudaErrorNotReady;
+    /* Copy size bytes from src to dest with cudaMemcpyDeviceToDevice; exit with resid as the status on failure. */
+    retVal = cudaMemcpy (dest, src, size, cudaMemcpyDeviceToDevice);
+    if (retVal != cudaSuccess) {
+        fprintf (stderr, "could not copy resource %d from device to device: reason %d \n",
+                resid, retVal); /* message now matches the device-to-device transfer performed above */
+        exit (resid);
+    }
 }
 
 void compute_blocks ( int *blocks, int *block_size, int count )
 {
-	*block_size = CUDA_BLOCK_SIZE;
-	*blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1);
+    *block_size = CUDA_BLOCK_SIZE;
+    *blocks = (count / CUDA_BLOCK_SIZE ) + (count % CUDA_BLOCK_SIZE == 0 ? 0 : 1);
 }
 
 void compute_nearest_pow_2 (int blocks, int *result)
 {
-	int power = 1;
-	while (power < blocks) power *= 2;
+    int power = 1;
+    while (power < blocks) power *= 2;
 
-	*result = power;
+    *result = power;
 }
 
 
 void print_device_mem_usage ()
 {
-	size_t total, free;
-	cudaMemGetInfo (&free, &total);
-	if (cudaGetLastError () != cudaSuccess )
-	{
-		fprintf (stderr, "Error on the memory call \n");
-		return;
-	}
-
-	fprintf (stderr, "Total %ld Mb %ld gig %ld , free %ld, Mb %ld , gig %ld \n", 
-			total, total/(1024*1024), total/ (1024*1024*1024), 
-			free, free/(1024*1024), free/ (1024*1024*1024) );
+    size_t total, free;
+    cudaMemGetInfo (&free, &total); /* fills free and total with byte counts */
+    if (cudaGetLastError () != cudaSuccess )
+    {
+        fprintf (stderr, "Error on the memory call \n");
+        return;
+    }
+    /* Print each figure in bytes, then divided by 2^20 and by 2^30; size_t values require %zu, not %ld. */
+    fprintf (stderr, "Total %zu Mb %zu gig %zu , free %zu, Mb %zu , gig %zu \n", 
+            total, total/(1024*1024), total/ (1024*1024*1024), 
+            free, free/(1024*1024), free/ (1024*1024*1024) );
 }
diff --git a/PuReMD-GPU/src/forces.cu b/PuReMD-GPU/src/forces.cu
index 08f9d8a5..e8e1e291 100644
--- a/PuReMD-GPU/src/forces.cu
+++ b/PuReMD-GPU/src/forces.cu
@@ -43,2838 +43,2838 @@
 
 
 void Dummy_Interaction( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
 }
 
 
 void Init_Bonded_Force_Functions( control_params *control )
 { 
-	Interaction_Functions[0] = Calculate_Bond_Orders;
-	Interaction_Functions[1] = Bond_Energy;  //*/Dummy_Interaction;
-	Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy;
-	//*/Dummy_Interaction;
-	Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
-	Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
-	if( control->hb_cut > 0 )
-		Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
-	else Interaction_Functions[5] = Dummy_Interaction;
-	Interaction_Functions[6] = Dummy_Interaction; //empty
-	Interaction_Functions[7] = Dummy_Interaction; //empty
-	Interaction_Functions[8] = Dummy_Interaction; //empty
-	Interaction_Functions[9] = Dummy_Interaction; //empty
+    Interaction_Functions[0] = Calculate_Bond_Orders;
+    Interaction_Functions[1] = Bond_Energy;  //*/Dummy_Interaction;
+    Interaction_Functions[2] = LonePair_OverUnder_Coordination_Energy;
+    //*/Dummy_Interaction;
+    Interaction_Functions[3] = Three_Body_Interactions; //*/Dummy_Interaction;
+    Interaction_Functions[4] = Four_Body_Interactions;  //*/Dummy_Interaction;
+    if( control->hb_cut > 0 )
+        Interaction_Functions[5] = Hydrogen_Bonds; //*/Dummy_Interaction;
+    else Interaction_Functions[5] = Dummy_Interaction;
+    Interaction_Functions[6] = Dummy_Interaction; //empty
+    Interaction_Functions[7] = Dummy_Interaction; //empty
+    Interaction_Functions[8] = Dummy_Interaction; //empty
+    Interaction_Functions[9] = Dummy_Interaction; //empty
 }
 
 
 void Compute_Bonded_Forces( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace,
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
 
-	int i;
-	real t_start, t_elapsed;
+    int i;
+    real t_start, t_elapsed;
 
 #ifdef TEST_ENERGY
-	/* Mark beginning of a new timestep in each energy file */
-	fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", 
-			data->step, "atom1", "atom2", "bo", "ebond", "total" );
-	fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", 
-			data->step, "atom", "nlp", "elp", "total" );
-	fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", 
-			data->step, "atom", "eov", "total" );
-	fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", 
-			data->step, "atom", "eun", "total" );
-	fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", 
-			data->step, "atom1", "atom2", "atom3", 
-			"angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
-	fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-			data->step, "atom1", "atom2", "atom3", 
-			"angle", "bo(12)", "bo(23)", "epen", "total" );
-	fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-			data->step, "atom1", "atom2", "atom3", 
-			"angle", "bo(12)", "bo(23)", "ecoa", "total" );
-	fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
-			data->step, "atom1", "atom2", "atom3", 
-			"r(23)", "angle", "bo(12)", "ehb", "total" );
-	fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", 
-			data->step, "atom1", "atom2", "atom3", "atom4", 
-			"phi", "bo(23)", "etor", "total" );
-	fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
-			data->step, "atom1", "atom2", "atom3", "atom4", 
-			"phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
+    /* Mark beginning of a new timestep in each energy file */
+    fprintf( out_control->ebond, "step: %d\n%6s%6s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "bo", "ebond", "total" );
+    fprintf( out_control->elp, "step: %d\n%6s%12s%12s%12s\n", 
+            data->step, "atom", "nlp", "elp", "total" );
+    fprintf( out_control->eov, "step: %d\n%6s%12s%12s\n", 
+            data->step, "atom", "eov", "total" );
+    fprintf( out_control->eun, "step: %d\n%6s%12s%12s\n", 
+            data->step, "atom", "eun", "total" );
+    fprintf( out_control->eval, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "angle", "bo(12)", "bo(23)", "eval", "epen", "total" );
+    fprintf( out_control->epen, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "angle", "bo(12)", "bo(23)", "epen", "total" );
+    fprintf( out_control->ecoa, "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "angle", "bo(12)", "bo(23)", "ecoa", "total" );
+    fprintf( out_control->ehb,  "step: %d\n%6s%6s%6s%12s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", 
+            "r(23)", "angle", "bo(12)", "ehb", "total" );
+    fprintf( out_control->etor, "step: %d\n%6s%6s%6s%6s%12s%12s%12s%12s\n", 
+            data->step, "atom1", "atom2", "atom3", "atom4", 
+            "phi", "bo(23)", "etor", "total" );
+    fprintf( out_control->econ, "step:%d\n%6s%6s%6s%6s%12s%12s%12s%12s%12s%12s\n",
+            data->step, "atom1", "atom2", "atom3", "atom4", 
+            "phi", "bo(12)", "bo(23)", "bo(34)", "econ", "total" );
 #endif 
 
-	/* Implement all the function calls as function pointers */
-	for( i = 0; i < NO_OF_INTERACTIONS; i++ ) {
-		//for( i = 0; i < 5; i++ ) {
-		t_start = Get_Time ();
-		(Interaction_Functions[i])(system, control, data, workspace, 
-				lists, out_control);
-		t_elapsed = Get_Timing_Info ( t_start );
+    /* Implement all the function calls as function pointers */
+    for( i = 0; i < NO_OF_INTERACTIONS; i++ ) {
+        //for( i = 0; i < 5; i++ ) {
+        t_start = Get_Time ();
+        (Interaction_Functions[i])(system, control, data, workspace, 
+                lists, out_control);
+        t_elapsed = Get_Timing_Info ( t_start );
 
 #ifdef __DEBUG_CUDA__
-		fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed );
+        fprintf( stderr, "function %d tme %lf - \n", i, t_elapsed );
 #endif
 
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "f%d-", i );
+        fprintf( stderr, "f%d-", i );
 #endif
 #ifdef TEST_FORCES
-		(Print_Interactions[i])(system, control, data, workspace, 
-				lists, out_control);
+        (Print_Interactions[i])(system, control, data, workspace, 
+                lists, out_control);
 #endif
-	}
-	}
-
-	void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, 
-			simulation_data *data, static_storage *workspace,
-			list **lists, output_controls *out_control )
-	{
-		real t_start, t_elapsed;
-		real *spad = (real *)scratch;
-		rvec *rvec_spad;
-
-		//Compute the bonded for interaction here. 
-		//Step 1.
+    }
+    }
+
+    void Cuda_Compute_Bonded_Forces( reax_system *system, control_params *control, 
+            simulation_data *data, static_storage *workspace,
+            list **lists, output_controls *out_control )
+    {
+        real t_start, t_elapsed;
+        real *spad = (real *)scratch;
+        rvec *rvec_spad;
+
+        //Compute the bonded for interaction here. 
+        //Step 1.
 #ifdef __DEBUG_CUDA__
-		t_start = Get_Time( );
-		fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE);
+        t_start = Get_Time( );
+        fprintf (stderr, " Begin Bonded Forces ... %d x %d\n", BLOCKS, BLOCK_SIZE);
 #endif
 
-		Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>>
-			(  system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
-			   *dev_workspace, system->reaxprm.num_atom_types, system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>>
-			( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
-			  system->reaxprm.d_tbp, *dev_workspace, 
-			  *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), 
-			  system->reaxprm.num_atom_types, system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>>
-			(*dev_workspace, *(dev_lists + BONDS), system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>>
-			(system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
-			 *dev_workspace, system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        Cuda_Calculate_Bond_Orders_Init <<< BLOCKS, BLOCK_SIZE >>>
+            (  system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp,
+               *dev_workspace, system->reaxprm.num_atom_types, system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_Calculate_Bond_Orders <<< BLOCKS, BLOCK_SIZE >>>
+            ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
+              system->reaxprm.d_tbp, *dev_workspace, 
+              *(dev_lists + BONDS), *(dev_lists + DDELTA), *(dev_lists + DBO), 
+              system->reaxprm.num_atom_types, system->N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_Update_Uncorrected_BO <<<BLOCKS, BLOCK_SIZE>>>
+            (*dev_workspace, *(dev_lists + BONDS), system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_Update_Workspace_After_Bond_Orders <<<BLOCKS, BLOCK_SIZE>>>
+            (system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, 
+             *dev_workspace, system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		t_elapsed = Get_Timing_Info( t_start );
-		fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-		fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "Bond Orders... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+        fprintf (stderr, "Cuda_Calculate_Bond_Orders Done... \n");
 #endif
 
 
 
-		//Step 2.
+        //Step 2.
 #ifdef __DEBUG_CUDA__
-		t_start = Get_Time( );
+        t_start = Get_Time( );
 #endif
-		//cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH );
-		cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH );
-
-		Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
-			( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp,
-			  (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), 
-			  system->N, system->reaxprm.num_atom_types, spad );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_BE
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad, spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-			//(spad + system->N, spad + system->N + 16, 16);
-			(spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        //cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE + system->N * REAL_SIZE + 16 * REAL_SIZE), RES_SCRATCH );
+        cuda_memset (spad, 0, system->N * ( 2 * REAL_SIZE ) , RES_SCRATCH );
+
+        Cuda_Bond_Energy <<< BLOCKS, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
+            ( system->d_atoms, system->reaxprm.d_gp, system->reaxprm.d_sbp, system->reaxprm.d_tbp,
+              (simulation_data *)data->d_simulation_data, *dev_workspace, *(dev_lists + BONDS), 
+              system->N, system->reaxprm.num_atom_types, spad );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_BE
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad, spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+            //(spad + system->N, spad + system->N + 16, 16);
+            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_BE, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		t_elapsed = Get_Timing_Info( t_start );
-		fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-		fprintf (stderr, "Cuda_Bond_Energy Done... \n");
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "Cuda_Bond_Energy ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+        fprintf (stderr, "Cuda_Bond_Energy Done... \n");
 #endif
 
-		//Step 3.
+        //Step 3.
 #ifdef __DEBUG_CUDA__
-		t_start = Get_Time( );
+        t_start = Get_Time( );
 #endif
-		cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH );
-
-		test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
-				system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-				*dev_workspace, (simulation_data *)data->d_simulation_data,
-				*(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
-				spad, spad + 2 * system->N, spad + 4*system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
-				system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-				*dev_workspace, (simulation_data *)data->d_simulation_data,
-				*(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
-				spad, spad + 2 * system->N, spad + 4*system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		test_LonePair_Postprocess		<<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, 
-				system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-				*dev_workspace, (simulation_data *)data->d_simulation_data,
-				*(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-
-		//Reduction for E_Lp
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad, spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Ov
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad + 2*system->N, spad + 3*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Un
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad + 4*system->N, spad + 5*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N ), RES_SCRATCH );
+
+        test_LonePair_OverUnder_Coordination_Energy_LP <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
+                system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+                *dev_workspace, (simulation_data *)data->d_simulation_data,
+                *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
+                spad, spad + 2 * system->N, spad + 4*system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        test_LonePair_OverUnder_Coordination_Energy <<<BLOCKS, BLOCK_SIZE>>>( system->d_atoms, system->reaxprm.d_gp, 
+                system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+                *dev_workspace, (simulation_data *)data->d_simulation_data,
+                *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types, 
+                spad, spad + 2 * system->N, spad + 4*system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        test_LonePair_Postprocess        <<<BLOCKS, BLOCK_SIZE, 0>>>( system->d_atoms, system->reaxprm.d_gp, 
+                system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+                *dev_workspace, (simulation_data *)data->d_simulation_data,
+                *(dev_lists + BONDS), system->N, system->reaxprm.num_atom_types);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+
+        //Reduction for E_Lp
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad, spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Lp, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_Ov
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad + 2*system->N, spad + 3*system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ov, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_Un
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad + 4*system->N, spad + 5*system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Un, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		t_elapsed = Get_Timing_Info( t_start );
-		fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
-		fprintf (stderr, "test_LonePair_postprocess Done... \n");
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "test_LonePair_postprocess ... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+        fprintf (stderr, "test_LonePair_postprocess Done... \n");
 #endif
 
-		//Step 4.
+        //Step 4.
 #ifdef __DEBUG_CUDA__
-		t_start = Get_Time( );
+        t_start = Get_Time( );
 #endif
 
-		cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH);
-		Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>>
-			(system->d_atoms, 
-			 (control_params *)control->d_control, 
-			 *(dev_lists + BONDS),
-			 system->N, (int *)spad);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        cuda_memset(spad, 0, (dev_lists + BONDS)->num_intrs * sizeof (int), RES_SCRATCH);
+        Three_Body_Estimate <<<BLOCKS, BLOCK_SIZE>>>
+            (system->d_atoms, 
+             (control_params *)control->d_control, 
+             *(dev_lists + BONDS),
+             system->N, (int *)spad);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		t_elapsed = Get_Timing_Info( t_start );
-		fprintf (stderr, "Three_Body_Estimate... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "Three_Body_Estimate... return value --> %d --- Timing %lf \n", cudaGetLastError (), t_elapsed );
 #endif
 
-		int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs);
-		memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs);
-		copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH);
+        int *thbody = (int *) malloc (sizeof (int) * (dev_lists + BONDS)->num_intrs);
+        memset (thbody, 0, sizeof (int) * (dev_lists + BONDS)->num_intrs);
+        copy_host_device (thbody, spad, (dev_lists + BONDS)->num_intrs * sizeof (int), cudaMemcpyDeviceToHost, RES_SCRATCH);
 
-		int total_3body = thbody [0] * SAFE_ZONE;
-		for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) {
-			total_3body += thbody [x]*SAFE_ZONE;
-			thbody [x] += thbody [x-1];
-		}
-		system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1];
+        int total_3body = thbody [0] * SAFE_ZONE;
+        for (int x = 1; x < (dev_lists + BONDS)->num_intrs; x++) {
+            total_3body += thbody [x]*SAFE_ZONE;
+            thbody [x] += thbody [x-1];
+        }
+        system->num_thbodies = thbody [(dev_lists+BONDS)->num_intrs-1];
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs);
+        fprintf (stderr, "Total Three body estimate is %d (bonds: %d) \n", total_3body, (dev_lists+BONDS)->num_intrs);
 #endif
 
-		if (!system->init_thblist) 
-		{
-			system->init_thblist = true;
-			if(!Make_List((dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
-				fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-				exit( INIT_ERR );
-			}
+        if (!system->init_thblist) 
+        {
+            system->init_thblist = true;
+            if(!Make_List((dev_lists+BONDS)->num_intrs, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
+                fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
+                exit( INIT_ERR );
+            }
 #ifdef __CUDA_MEM__
-			fprintf (stderr, "Device memory allocated: three body list = %d MB\n", 
-					sizeof (three_body_interaction_data) * total_3body / (1024*1024));
+            fprintf (stderr, "Device memory allocated: three body list = %d MB\n", 
+                    sizeof (three_body_interaction_data) * total_3body / (1024*1024));
 #endif
-		} else {
-			if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) {
-				int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs);
+        } else {
+            if ((dev_workspace->realloc.bonds > 0) || (system->num_thbodies > (dev_lists+THREE_BODIES)->num_intrs )) {
+                int size = MAX (dev_workspace->realloc.num_bonds, (dev_lists+BONDS)->num_intrs);
 
-				/*Delete Three-body list*/
-				Delete_List( dev_lists + THREE_BODIES, TYP_DEVICE );
+                /*Delete Three-body list*/
+                Delete_List( dev_lists + THREE_BODIES, TYP_DEVICE );
 
 #ifdef __CUDA_MEM__
-				fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
-						data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
+                fprintf (stderr, "Reallocating Three-body list: step: %d n - %d num_intrs - %d used: %d \n", 
+                        data->step, dev_workspace->realloc.num_bonds, total_3body, system->num_thbodies);
 #endif
-				/*Recreate Three-body list */
-				if(!Make_List(size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
-					fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
-					exit( INIT_ERR );
-				}
-			}
-		}
-
-		//copy the indexes into the thb list;
-		copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
-				cudaMemcpyHostToDevice, LIST_INDEX);
-		copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
-				cudaMemcpyHostToDevice, LIST_END_INDEX);
-
-		free (thbody );
+                /*Recreate Three-body list */
+                if(!Make_List(size, total_3body, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
+                    fprintf( stderr, "Problem in initializing three-body list. Terminating!\n" );
+                    exit( INIT_ERR );
+                }
+            }
+        }
+
+        //copy the indexes into the thb list;
+        copy_host_device (thbody, ((dev_lists + THREE_BODIES)->index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
+                cudaMemcpyHostToDevice, LIST_INDEX);
+        copy_host_device (thbody, ((dev_lists + THREE_BODIES)->end_index + 1), sizeof (int) * ((dev_lists+BONDS)->num_intrs - 1), 
+                cudaMemcpyHostToDevice, LIST_END_INDEX);
+
+        free (thbody );
 
 #ifdef __DEBUG_CUDA__
-		t_start = Get_Time( );
+        t_start = Get_Time( );
 #endif
 
-		cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
-
-		Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
-			( system->d_atoms,
-			  system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, 
-			  (control_params *)control->d_control,
-			  (simulation_data *)data->d_simulation_data,
-			  *dev_workspace, 
-			  *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
-			  system->N, system->reaxprm.num_atom_types, 
-			  spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Not necessary to validate three-body list anymore, 
-		// Estimate is already done at the beginning which makes sure that 
-		// we have sufficient size for this list
-		//Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step );
-
-		//Reduction for E_Ang
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad, spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Pen
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad + 2*system->N, spad + 3*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Coa
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad + 4*system->N, spad + 5*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for ext_pres
-		rvec_spad = (rvec *) (spad + 6*system->N);
-		Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
-			(rvec_spad, rvec_spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
-			(rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		real t_1, t_2;
-		t_1 = Get_Time ();
-		//Sum up the f vector for each atom and collect the CdDelta from all the bonds
-		Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
-			( 	system->d_atoms,
-				(control_params *)control->d_control,
-				*dev_workspace, 
-				*(dev_lists + BONDS), 
-				system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-		t_2 = Get_Timing_Info (t_1);
+        cuda_memset (spad, 0, ( 6 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
+
+        Three_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
+            ( system->d_atoms,
+              system->reaxprm.d_sbp, system->reaxprm.d_thbp, system->reaxprm.d_gp, 
+              (control_params *)control->d_control,
+              (simulation_data *)data->d_simulation_data,
+              *dev_workspace, 
+              *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
+              system->N, system->reaxprm.num_atom_types, 
+              spad, spad + 2*system->N, spad + 4*system->N, (rvec *)(spad + 6*system->N));
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Not necessary to validate three-body list anymore, 
+        // Estimate is already done at the beginning which makes sure that 
+        // we have sufficient size for this list
+        //Cuda_Threebody_List( system, workspace, dev_lists + THREE_BODIES, data->step );
+
+        //Reduction for E_Ang
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad, spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Ang, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_Pen
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad + 2*system->N, spad + 3*system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Pen, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_Coa
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad + 4*system->N, spad + 5*system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + 5*system->N, &((simulation_data *)data->d_simulation_data)->E_Coa, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for ext_pres
+        rvec_spad = (rvec *) (spad + 6*system->N);
+        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+            (rvec_spad, rvec_spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        real t_1, t_2;
+        t_1 = Get_Time ();
+        //Sum up the f vector for each atom and collect the CdDelta from all the bonds
+        Three_Body_Interactions_results <<< BLOCKS, BLOCK_SIZE >>>
+            (     system->d_atoms,
+                (control_params *)control->d_control,
+                *dev_workspace, 
+                *(dev_lists + BONDS), 
+                system->N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        t_2 = Get_Timing_Info (t_1);
 
 #ifdef __DEBUG_CUDA__
-		t_elapsed = Get_Timing_Info( t_start );
-		fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2);
-		fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
-		fprintf (stderr, "Three_Body_Interactions Done... \n");
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "Three_Body_Interactions post process Timing %lf \n", t_2);
+        fprintf (stderr, "Three_Body_Interactions ...  Timing %lf \n", t_elapsed );
+        fprintf (stderr, "Three_Body_Interactions Done... \n");
 #endif
 
-		//Step 5.
+        //Step 5.
 #ifdef __DEBUG_CUDA__
-		t_start = Get_Time( );
+        t_start = Get_Time( );
 #endif
 
-		cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
-		Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
-			//Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
-			( system->d_atoms,
-			  system->reaxprm.d_gp,
-			  system->reaxprm.d_fbp,
-			  (control_params *)control->d_control,
-			  *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
-			  (simulation_box *)system->d_box,
-			  (simulation_data *)data->d_simulation_data,
-			  *dev_workspace,
-			  system->N, system->reaxprm.num_atom_types, 
-			  spad, spad + 2*system->N, (rvec *) (spad + 4*system->N));
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Tor
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad, spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for E_Con
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-			(spad + 2*system->N, spad + 3*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-			(spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Reduction for ext_pres
-		rvec_spad = (rvec *) (spad + 4*system->N);
-		Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
-			(rvec_spad, rvec_spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
-			(rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//Post process here
-		Four_Body_Postprocess 	<<< BLOCKS, BLOCK_SIZE >>>
-			( 	system->d_atoms,
-				*dev_workspace,
-				*(dev_lists + BONDS),
-				system->N );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
+        cuda_memset (spad, 0, ( 4 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2), RES_SCRATCH );
+        Four_Body_Interactions <<< BLOCKS, BLOCK_SIZE >>>
+            //Four_Body_Interactions <<< system->N, 32, 32*( 2*REAL_SIZE + RVEC_SIZE)>>>
+            ( system->d_atoms,
+              system->reaxprm.d_gp,
+              system->reaxprm.d_fbp,
+              (control_params *)control->d_control,
+              *(dev_lists + BONDS), *(dev_lists + THREE_BODIES),
+              (simulation_box *)system->d_box,
+              (simulation_data *)data->d_simulation_data,
+              *dev_workspace,
+              system->N, system->reaxprm.num_atom_types, 
+              spad, spad + 2*system->N, (rvec *) (spad + 4*system->N));
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_Tor
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad, spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_Tor, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for E_Con
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+            (spad + 2*system->N, spad + 3*system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Con, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Reduction for ext_pres
+        rvec_spad = (rvec *) (spad + 4*system->N);
+        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+            (rvec_spad, rvec_spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //Post process here
+        Four_Body_Postprocess     <<< BLOCKS, BLOCK_SIZE >>>
+            (     system->d_atoms,
+                *dev_workspace,
+                *(dev_lists + BONDS),
+                system->N );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-		t_elapsed = Get_Timing_Info( t_start );
-		fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
-		fprintf (stderr, " Four_Body_ Done... \n");
+        t_elapsed = Get_Timing_Info( t_start );
+        fprintf (stderr, "Four_Body_post process return value --> %d --- Four body Timing %lf \n", cudaGetLastError (), t_elapsed );
+        fprintf (stderr, " Four_Body_ Done... \n");
 #endif
 
 
-		//Step 6.
-		if (control->hb_cut > 0) {
+        //Step 6.
+        if (control->hb_cut > 0) {
 
 #ifdef __DEBUG_CUDA__
-			t_start = Get_Time( );
+            t_start = Get_Time( );
 #endif
-			cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH );
-
-			/*
-			   Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
-			   (  system->d_atoms, 
-			   system->reaxprm.d_sbp,
-			   system->reaxprm.d_hbp,
-			   (control_params *)control->d_control,
-			   (simulation_data *)data->d_simulation_data,
-			 *dev_workspace, 
-			 *(dev_lists + BONDS), *(dev_lists + HBONDS),
-			 system->N, system->reaxprm.num_atom_types, 
-			 spad, (rvec *) (spad + 2*system->N), NULL);
-			 cudaThreadSynchronize ();
-			 cudaCheckError ();
-			 */
+            cuda_memset (spad, 0, ( 2 * REAL_SIZE * system->N + RVEC_SIZE * system->N * 2 ), RES_SCRATCH );
+
+            /*
+               Hydrogen_Bonds <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE *( REAL_SIZE + RVEC_SIZE) >>>
+               (  system->d_atoms, 
+               system->reaxprm.d_sbp,
+               system->reaxprm.d_hbp,
+               (control_params *)control->d_control,
+               (simulation_data *)data->d_simulation_data,
+             *dev_workspace, 
+             *(dev_lists + BONDS), *(dev_lists + HBONDS),
+             system->N, system->reaxprm.num_atom_types, 
+             spad, (rvec *) (spad + 2*system->N), NULL);
+             cudaThreadSynchronize ();
+             cudaCheckError ();
+             */
 
 #ifdef __DEBUG_CUDA__
-			real test1,test2;
-			test1 = Get_Time ();
+            real test1,test2;
+            test1 = Get_Time ();
 #endif
 
-			int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + 
-				(((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1);
-			Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE )  >>>
-				(  system->d_atoms, 
-				   system->reaxprm.d_sbp,
-				   system->reaxprm.d_hbp,
-				   (control_params *)control->d_control,
-				   (simulation_data *)data->d_simulation_data,
-				   *dev_workspace, 
-				   *(dev_lists + BONDS), *(dev_lists + HBONDS),
-				   system->N, system->reaxprm.num_atom_types, 
-				   spad, (rvec *) (spad + 2*system->N), NULL);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            int hbs = (system->N * HBONDS_THREADS_PER_ATOM/ HBONDS_BLOCK_SIZE) + 
+                (((system->N * HBONDS_THREADS_PER_ATOM) % HBONDS_BLOCK_SIZE) == 0 ? 0 : 1);
+            Hydrogen_Bonds_HB <<< hbs, HBONDS_BLOCK_SIZE, HBONDS_BLOCK_SIZE * ( 2 * REAL_SIZE + 2 * RVEC_SIZE )  >>>
+                (  system->d_atoms, 
+                   system->reaxprm.d_sbp,
+                   system->reaxprm.d_hbp,
+                   (control_params *)control->d_control,
+                   (simulation_data *)data->d_simulation_data,
+                   *dev_workspace, 
+                   *(dev_lists + BONDS), *(dev_lists + HBONDS),
+                   system->N, system->reaxprm.num_atom_types, 
+                   spad, (rvec *) (spad + 2*system->N), NULL);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-			test2 = Get_Timing_Info (test1);
-			fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2);
+            test2 = Get_Timing_Info (test1);
+            fprintf (stderr, "Timing for the hb and forces ---> %f \n", test2);
 #endif
 
-			//Reduction for E_HB
-			Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-				(spad, spad + system->N,  system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            //Reduction for E_HB
+            Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+                (spad, spad + system->N,  system->N);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
-				(spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>> 
+                (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_HB, BLOCKS_POW_2);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
 
-			//Reduction for ext_pres
-			rvec_spad = (rvec *) (spad + 2*system->N);
-			Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
-				(rvec_spad, rvec_spad + system->N,  system->N);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            //Reduction for ext_pres
+            rvec_spad = (rvec *) (spad + 2*system->N);
+            Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE >>> 
+                (rvec_spad, rvec_spad + system->N,  system->N);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
-				(rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2 >>> 
+                (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			//Post process here
+            //Post process here
 #ifdef __DEBUG_CUDA__
-			real t_1, t_2;
-			t_1 = Get_Time ();
+            real t_1, t_2;
+            t_1 = Get_Time ();
 #endif
-			Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
-				( 	system->d_atoms, 
-					system->reaxprm.d_sbp, 
-					*dev_workspace, 
-					*(dev_lists + BONDS),
-					*(dev_lists + HBONDS), 
-					*(dev_lists + FAR_NBRS),
-					system->N, 
-					spad); //this is for the fix to use the shared memory
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            Hydrogen_Bonds_Postprocess <<< BLOCKS, BLOCK_SIZE, BLOCK_SIZE * RVEC_SIZE >>>
+                (     system->d_atoms, 
+                    system->reaxprm.d_sbp, 
+                    *dev_workspace, 
+                    *(dev_lists + BONDS),
+                    *(dev_lists + HBONDS), 
+                    *(dev_lists + FAR_NBRS),
+                    system->N, 
+                    spad); //this is for the fix to use the shared memory
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
 #ifdef __DEBUG_CUDA__
-			t_2 = Get_Timing_Info ( t_1 );
-			fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
-			t_1 = Get_Time ();
+            t_2 = Get_Timing_Info ( t_1 );
+            fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
+            t_1 = Get_Time ();
 #endif
 
-			//Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
-			Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
-				( 	system->d_atoms, 
-					system->reaxprm.d_sbp, 
-					*dev_workspace, 
-					*(dev_lists + BONDS),
-					*(dev_lists + HBONDS), 
-					*(dev_lists + FAR_NBRS),
-					system->N );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-			t_2 = Get_Timing_Info ( t_1 );
+            //Hydrogen_Bonds_Far_Nbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+            Hydrogen_Bonds_HNbrs <<< system->N, 32, 32 * RVEC_SIZE>>>
+                (     system->d_atoms, 
+                    system->reaxprm.d_sbp, 
+                    *dev_workspace, 
+                    *(dev_lists + BONDS),
+                    *(dev_lists + HBONDS), 
+                    *(dev_lists + FAR_NBRS),
+                    system->N );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            t_2 = Get_Timing_Info ( t_1 );
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
-			t_elapsed = Get_Timing_Info( t_start );
-			fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
-			fprintf (stderr, "Hydrogen_Bond Done... \n");
+            fprintf (stderr, " Hydrogen Bonds post process -----%f \n", t_2);
+            t_elapsed = Get_Timing_Info( t_start );
+            fprintf (stderr, "Hydrogen bonds post process return value --> %d --- HydrogenBonds Timing %lf \n", cudaGetLastError (), t_elapsed );
+            fprintf (stderr, "Hydrogen_Bond Done... \n");
 #endif
-		}
-		return; 
-	}
-
-	void Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-			simulation_data *data,static_storage *workspace,
-			list** lists, output_controls *out_control )
-	{
-		real t_start, t_elapsed;
+        }
+        return; 
+    }
+
+    void Compute_NonBonded_Forces( reax_system *system, control_params *control, 
+            simulation_data *data,static_storage *workspace,
+            list** lists, output_controls *out_control )
+    { // Non-CUDA variant (compare Cuda_Compute_NonBonded_Forces, which calls Cuda_QEq): runs QEq, then one vdW/Coulomb energy routine.
+        real t_start, t_elapsed; // timestamps used only to time the QEq call below
 #ifdef TEST_ENERGY
-		fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
-				data->step, "atom1", "atom2", "r12", "evdw", "total" );
-		fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
-				data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" );
+        fprintf( out_control->evdw, "step: %d\n%6s%6s%12s%12s%12s\n",
+                data->step, "atom1", "atom2", "r12", "evdw", "total" ); // TEST_ENERGY only: step number plus column titles written to the evdw stream
+        fprintf( out_control->ecou, "step: %d\n%6s%6s%12s%12s%12s%12s%12s\n",
+                data->step, "atom1", "atom2", "r12", "q1", "q2", "ecou", "total" ); // TEST_ENERGY only: same header for the ecou stream, with q1/q2 columns
 #endif
 
-		t_start = Get_Time( );
-		QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
-		t_elapsed = Get_Timing_Info( t_start );
-		data->timing.QEq += t_elapsed;
+        t_start = Get_Time( );
+        QEq( system, control, data, workspace, lists[FAR_NBRS], out_control ); // QEq receives only the FAR_NBRS list, not the whole lists array
+        t_elapsed = Get_Timing_Info( t_start );
+        data->timing.QEq += t_elapsed; // accumulated across calls (+=), not overwritten
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "qeq - " );
+        fprintf( stderr, "qeq - " ); // progress marker, DEBUG_FOCUS builds only
 #endif
 
-		if ( control->tabulate == 0)
-			vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
-		else
-			Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, 
-					lists, out_control );
+        if ( control->tabulate == 0) // 0 selects the direct routine; any non-zero value selects the tabulated one
+            vdW_Coulomb_Energy( system, control, data, workspace, lists, out_control );
+        else
+            Tabulated_vdW_Coulomb_Energy( system, control, data, workspace, 
+                    lists, out_control );
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "nonb forces - " );
+        fprintf( stderr, "nonb forces - " ); // progress marker, DEBUG_FOCUS builds only
 #endif
 
 #ifdef TEST_FORCES
-		Print_vdW_Coulomb_Forces( system, control, data, workspace, 
-				lists, out_control );
+        Print_vdW_Coulomb_Forces( system, control, data, workspace, 
+                lists, out_control ); // TEST_FORCES builds only; this step is not timed here
 #endif
-	}
-
-	void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
-			simulation_data *data,static_storage *workspace,
-			list** lists, output_controls *out_control )
-	{
-		real t_start, t_elapsed;
-		real t1 = 0, t2 = 0;
-		real *spad = (real *) scratch;
-		rvec *rvec_spad;
-		int cblks;
-
-		t_start = Get_Time( );
-		Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
-		t_elapsed = Get_Timing_Info( t_start );
-		d_timing.QEq += t_elapsed;
+    } // end Compute_NonBonded_Forces
+
+    void Cuda_Compute_NonBonded_Forces( reax_system *system, control_params *control, 
+            simulation_data *data,static_storage *workspace,
+            list** lists, output_controls *out_control )
+    {
+        real t_start, t_elapsed;
+        real t1 = 0, t2 = 0;
+        real *spad = (real *) scratch;
+        rvec *rvec_spad;
+        int cblks;
+
+        t_start = Get_Time( );
+        Cuda_QEq( system, control, data, workspace, lists[FAR_NBRS], out_control );
+        t_elapsed = Get_Timing_Info( t_start );
+        d_timing.QEq += t_elapsed;
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed );
+        fprintf (stderr, " Cuda_QEq done with timing %lf \n", t_elapsed );
 #endif
-		cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH );
-
-		t_start = Get_Time ();
-		if ( control->tabulate == 0)
-		{
-			cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
-				((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1);
-			Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>>
-				( system->d_atoms,   
-				  system->reaxprm.d_tbp,
-				  system->reaxprm.d_gp, 
-				  (control_params *)control->d_control, 
-				  (simulation_data *)data->d_simulation_data,  
-				  *(dev_lists + FAR_NBRS), 
-				  spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
-				  system->reaxprm.num_atom_types,
-				  system->N ) ;
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-		}
-		else
-		{
-			cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
-				((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1);
-			Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>>
-				(   (reax_atom *)system->d_atoms, 
-				    (control_params *)control->d_control,
-				    (simulation_data *)data->d_simulation_data, 
-				    *(dev_lists + FAR_NBRS), 
-				    spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
-				    d_LR,
-				    system->reaxprm.num_atom_types,
-				    out_control->energy_update_freq,
-				    system->N ) ;
-
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-		}
-
-		t_elapsed = Get_Timing_Info (t_start );
+        cuda_memset (spad, 0, system->N * ( 4 * REAL_SIZE + 2 * RVEC_SIZE), RES_SCRATCH );
+
+        t_start = Get_Time ();
+        if ( control->tabulate == 0)
+        {
+            cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
+                ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1);
+            Cuda_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE * ( 2*REAL_SIZE + RVEC_SIZE) >>>
+                ( system->d_atoms,   
+                  system->reaxprm.d_tbp,
+                  system->reaxprm.d_gp, 
+                  (control_params *)control->d_control, 
+                  (simulation_data *)data->d_simulation_data,  
+                  *(dev_lists + FAR_NBRS), 
+                  spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
+                  system->reaxprm.num_atom_types,
+                  system->N ) ;
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+        }
+        else
+        {
+            cblks = (system->N * VDW_THREADS_PER_ATOM / VDW_BLOCK_SIZE) + 
+                ((system->N * VDW_THREADS_PER_ATOM/VDW_BLOCK_SIZE) == 0 ? 0 : 1);
+            Cuda_Tabulated_vdW_Coulomb_Energy <<< cblks, VDW_BLOCK_SIZE, VDW_BLOCK_SIZE* (2*REAL_SIZE + RVEC_SIZE)>>>
+                (   (reax_atom *)system->d_atoms, 
+                    (control_params *)control->d_control,
+                    (simulation_data *)data->d_simulation_data, 
+                    *(dev_lists + FAR_NBRS), 
+                    spad , spad + 2 * system->N, (rvec *) (spad + system->N * 4), 
+                    d_LR,
+                    system->reaxprm.num_atom_types,
+                    out_control->energy_update_freq,
+                    system->N ) ;
+
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+        }
+
+        t_elapsed = Get_Timing_Info (t_start );
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2));
-		fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed));
+        fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed - t2));
+        fprintf (stderr, "Cuda_Tabulated_vdW_Coulomb_Energy done... %lf \n", (t_elapsed));
 #endif
 
-		//Reduction on E_vdW
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-			(spad, spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-			(spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//reduction on E_Ele
-		Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
-			(spad + 2*system->N, spad + 3*system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-			(spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-		rvec_spad = (rvec *) (spad + 4*system->N);
-
-		//reduction on ext_press
-		Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> 
-			(rvec_spad, rvec_spad + system->N,  system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> 
-			(rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-	}
-
-
-	/* This version of Compute_Total_Force computes forces from coefficients 
-	   accumulated by all interaction functions. Saves enormous time & space! */
-	void Compute_Total_Force( reax_system *system, control_params *control, 
-			simulation_data *data, static_storage *workspace,
-			list **lists )
-	{
-		int i, pj;
-		list *bonds = (*lists) + BONDS;
-
-		for( i = 0; i < system->N; ++i )
-			for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-				if( i < bonds->select.bond_list[pj].nbr ) {
-					if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT)
-						Add_dBond_to_Forces( i, pj, system, data, workspace, lists );
-					else 
-						Add_dBond_to_Forces_NPT( i, pj, system, data, workspace, lists );
-				}
-	}
-
-
-	void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
-			int Hmax, int Htop, int num_bonds, int num_hbonds )
-	{
-		int i, flag;
-		list *bonds, *hbonds;
-
-		bonds = *lists + BONDS;
-		hbonds = *lists + HBONDS;
-
-		/* far neighbors */
-		if( Htop > Hmax * DANGER_ZONE ) {
-			workspace->realloc.Htop = Htop;
-			if( Htop > Hmax ) {
-				fprintf( stderr, 
-						"step%d - ran out of space on H matrix: Htop=%d, max = %d",
-						step, Htop, Hmax );
-				exit(INSUFFICIENT_SPACE);
-			}
-		}
-
-		/* bond list */
-		flag = -1;
-		workspace->realloc.num_bonds = num_bonds;
-		for( i = 0; i < n-1; ++i )
-			if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) {
-				workspace->realloc.bonds = 1;
-				if( End_Index(i, bonds) > Start_Index(i+1, bonds) )
-					flag = i;
-			}
-
-		if( flag > -1 ) {
-			fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-					step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) );
-			exit(INSUFFICIENT_SPACE);
-		}    
-
-		if( End_Index(i, bonds) >= bonds->num_intrs-2 ) {
-			workspace->realloc.bonds = 1;
-
-			if( End_Index(i, bonds) > bonds->num_intrs ) {
-				fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
-						step, flag, End_Index(i,bonds), bonds->num_intrs );
-				exit(INSUFFICIENT_SPACE);
-			}
-		}
-
-
-		/* hbonds list */
-		if( workspace->num_H > 0 ) {
-			flag = -1;
-			workspace->realloc.num_hbonds = num_hbonds;
-			for( i = 0; i < workspace->num_H-1; ++i )
-				if( Num_Entries(i, hbonds) >= 
-						(Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) {
-					workspace->realloc.hbonds = 1;
-					if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) )
-						flag = i;
-				}
-
-			if( flag > -1 ) {
-				fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-						step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) );
-				exit(INSUFFICIENT_SPACE);
-			}
-
-			if( Num_Entries(i,hbonds) >= 
-					(hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) {
-				workspace->realloc.hbonds = 1;
-
-				if( End_Index(i, hbonds) > hbonds->num_intrs ) {
-					fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
-							step, flag, End_Index(i,hbonds), hbonds->num_intrs );
-					exit(INSUFFICIENT_SPACE);
-				}
-			}
-		}
-	}
-
-
-	void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n,
-			int num_bonds, int num_hbonds )
-	{
-		int i, flag;
-		list *bonds, *hbonds, *thblist;
-		int *bonds_start, *bonds_end;
-		int *hbonds_start, *hbonds_end;
-		int *mat_start, *mat_end;
-		int max_sparse_entries = 0;
-
-		bonds = *lists + BONDS;
-		hbonds = *lists + HBONDS;
-
-		bonds_start = (int *) calloc (bonds->n, INT_SIZE);
-		bonds_end = (int *) calloc (bonds->n, INT_SIZE);
-
-		hbonds_start = (int *) calloc (hbonds->n, INT_SIZE );
-		hbonds_end = (int *) calloc (hbonds->n, INT_SIZE );
-
-		mat_start = (int *) calloc (workspace->H.n, INT_SIZE );
-		mat_end = (int *) calloc (workspace->H.n, INT_SIZE );
-
-		copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-		copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-
-		copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-		copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-
-		copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-		copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-
-		/* Sparse Matrix entries */
+        //Reduction on E_vdW
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+            (spad, spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+            (spad + system->N, &((simulation_data *)data->d_simulation_data)->E_vdW, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //reduction on E_Ele
+        Cuda_reduction <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>> 
+            (spad + 2*system->N, spad + 3*system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+            (spad + 3*system->N, &((simulation_data *)data->d_simulation_data)->E_Ele, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+        rvec_spad = (rvec *) (spad + 4*system->N);
+
+        //reduction on ext_press
+        Cuda_reduction_rvec <<<BLOCKS_POW_2, BLOCK_SIZE, RVEC_SIZE * BLOCK_SIZE>>> 
+            (rvec_spad, rvec_spad + system->N,  system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction_rvec <<<1, BLOCKS_POW_2, RVEC_SIZE * BLOCKS_POW_2>>> 
+            (rvec_spad + system->N, &((simulation_data *)data->d_simulation_data)->ext_press, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+
+
+    /* Convert the per-bond coefficients accumulated by the interaction functions
+       into forces; only the i < nbr half of each bond pair is processed. */
+    void Compute_Total_Force( reax_system *system, control_params *control,
+            simulation_data *data, static_storage *workspace, list **lists )
+    {
+        list *blist = (*lists) + BONDS;
+        int atom, b;
+
+        for( atom = 0; atom < system->N; ++atom ) {
+            for( b = Start_Index(atom, blist); b < End_Index(atom, blist); ++b ) {
+                if( blist->select.bond_list[b].nbr <= atom ) continue;
+                if( control->ensemble != NVE && control->ensemble != NVT && control->ensemble != bNVT )
+                    Add_dBond_to_Forces_NPT( atom, b, system, data, workspace, lists );
+                else
+                    Add_dBond_to_Forces( atom, b, system, data, workspace, lists );
+            }
+        }
+    }
+
+
+    /* Check the H matrix, bond list and hbond list against their allocated
+     * capacity: set workspace->realloc fields when an entry nears its limit,
+     * and exit(INSUFFICIENT_SPACE) when an entry has already overrun it. */
+    void Validate_Lists( static_storage *workspace, list **lists, int step, int n,
+            int Hmax, int Htop, int num_bonds, int num_hbonds )
+    {
+        int i, flag;
+        list *bonds = *lists + BONDS;
+        list *hbonds = *lists + HBONDS;
+
+        /* H matrix: Htop entries in use out of Hmax allocated */
+        if( Htop > Hmax * DANGER_ZONE ) {
+            workspace->realloc.Htop = Htop;
+            if( Htop > Hmax ) {
+                fprintf( stderr,
+                        "step%d - ran out of space on H matrix: Htop=%d, max = %d\n",
+                        step, Htop, Hmax );
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+
+        /* bond list: request realloc once entry i is within 2 slots of entry i+1 */
+        flag = -1;
+        workspace->realloc.num_bonds = num_bonds;
+        for( i = 0; i < n-1; ++i )
+            if( End_Index(i, bonds) >= Start_Index(i+1, bonds)-2 ) {
+                workspace->realloc.bonds = 1;
+                if( End_Index(i, bonds) > Start_Index(i+1, bonds) )
+                    flag = i;
+            }
+
+        if( flag > -1 ) {
+            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                    step, flag, End_Index(flag,bonds), Start_Index(flag+1,bonds) );
+            exit(INSUFFICIENT_SPACE);
+        }
+
+        if( End_Index(i, bonds) >= bonds->num_intrs-2 ) {
+            workspace->realloc.bonds = 1;
+            /* i is the entry left over by the loop above; flag is -1 here */
+            if( End_Index(i, bonds) > bonds->num_intrs ) {
+                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
+                        step, i, End_Index(i,bonds), bonds->num_intrs );
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+
+        /* hbonds list */
+        if( workspace->num_H > 0 ) {
+            flag = -1;
+            workspace->realloc.num_hbonds = num_hbonds;
+            for( i = 0; i < workspace->num_H-1; ++i )
+                if( Num_Entries(i, hbonds) >=
+                        (Start_Index(i+1, hbonds) - Start_Index(i, hbonds)) * DANGER_ZONE ) {
+                    workspace->realloc.hbonds = 1;
+                    if( End_Index(i, hbonds) > Start_Index(i+1, hbonds) )
+                        flag = i;
+                }
+
+            if( flag > -1 ) {
+                fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                        step, flag, End_Index(flag,hbonds), Start_Index(flag+1,hbonds) );
+                exit(INSUFFICIENT_SPACE);
+            }
+
+            if( Num_Entries(i,hbonds) >=
+                    (hbonds->num_intrs - Start_Index(i,hbonds)) * DANGER_ZONE ) {
+                workspace->realloc.hbonds = 1;
+                /* i is the entry left over by the loop above; flag is -1 here */
+                if( End_Index(i, hbonds) > hbonds->num_intrs ) {
+                    fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
+                            step, i, End_Index(i,hbonds), hbonds->num_intrs );
+                    exit(INSUFFICIENT_SPACE);
+                }
+            }
+        }
+    }
+
+
+    void Cuda_Validate_Lists( reax_system *system, static_storage *workspace, list **lists, int step, int n,
+            int num_bonds, int num_hbonds )
+    {
+        int i, flag;
+        list *bonds, *hbonds, *thblist;
+        int *bonds_start, *bonds_end;
+        int *hbonds_start, *hbonds_end;
+        int *mat_start, *mat_end;
+        int max_sparse_entries = 0;
+
+        bonds = *lists + BONDS;
+        hbonds = *lists + HBONDS;
+
+        bonds_start = (int *) calloc (bonds->n, INT_SIZE);
+        bonds_end = (int *) calloc (bonds->n, INT_SIZE);
+
+        hbonds_start = (int *) calloc (hbonds->n, INT_SIZE );
+        hbonds_end = (int *) calloc (hbonds->n, INT_SIZE );
+
+        mat_start = (int *) calloc (workspace->H.n, INT_SIZE );
+        mat_end = (int *) calloc (workspace->H.n, INT_SIZE );
+
+        copy_host_device (bonds_start, bonds->index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+        copy_host_device (bonds_end, bonds->end_index, bonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+
+        copy_host_device (hbonds_start, hbonds->index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+        copy_host_device (hbonds_end, hbonds->end_index, hbonds->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+
+        copy_host_device (mat_start, workspace->H.start, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+        copy_host_device (mat_end, workspace->H.end, workspace->H.n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+
+        /* Sparse Matrix entries */
 
 #ifdef __CUDA_TEST__
-		/*
-		   workspace->realloc.Htop = 0;
-		   for (i = 0; i < workspace->H.n-1; i++) {
-		   if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){
-		   workspace->realloc.Htop = mat_end[i] - mat_start[i];
-		   }
-		   }
-		 */
+        /*
+           workspace->realloc.Htop = 0;
+           for (i = 0; i < workspace->H.n-1; i++) {
+           if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i])){
+           workspace->realloc.Htop = mat_end[i] - mat_start[i];
+           }
+           }
+         */
 #endif
 
-		flag = -1;
-		workspace->realloc.Htop = 0;
-		for ( i = 0; i < n-1; i ++){
-
-			if( (mat_end[i] - mat_start[i]) > 
-					(system->max_sparse_matrix_entries * DANGER_ZONE )) {
-				//fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", 
-				//								step, i, mat_start[i], mat_end[i]);
-				if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
-					workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
-			}
-
-			if ( (mat_end[i] > mat_start[i+1]) ){
-				fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n",
-						step, flag, mat_end[i], mat_start[i+1]);
-				exit(INSUFFICIENT_SPACE);
-			}
-		}
-
-		if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) {
-			if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
-				workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
-			//fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d)  -- %d \n", 
-			//								step, i, mat_start[i], mat_end[i], 
-			//								(int) (system->max_sparse_matrix_entries * DANGER_ZONE));
-
-			if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) {
-				fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n",
-						step, flag, mat_end[i], system->N * system->max_sparse_matrix_entries);
-				exit(INSUFFICIENT_SPACE);
-			}
-		}
-
-
-		/* bond list */
+        flag = -1;
+        workspace->realloc.Htop = 0;
+        for ( i = 0; i < n-1; i ++){
+
+            if( (mat_end[i] - mat_start[i]) > 
+                    (system->max_sparse_matrix_entries * DANGER_ZONE )) {
+                //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index: %d (%d %d) \n", 
+                //                                step, i, mat_start[i], mat_end[i]);
+                if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
+                    workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
+            }
+
+            if ( (mat_end[i] > mat_start[i+1]) ){
+                fprintf( stderr, "step%d-matcheck failed: i=%d end(i)=%d start(i+1)=%d\n",
+                        step, flag, mat_end[i], mat_start[i+1]);
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+
+        if( (mat_end[i] - mat_start[i]) > system->max_sparse_matrix_entries * DANGER_ZONE ) {
+            if (workspace->realloc.Htop <= (mat_end[i] - mat_start[i]))
+                workspace->realloc.Htop = (mat_end[i] - mat_start[i]) ;
+            //fprintf (stderr, "step %d, Reached the water mark for sparse matrix for index %d (%d %d)  -- %d \n", 
+            //                                step, i, mat_start[i], mat_end[i], 
+            //                                (int) (system->max_sparse_matrix_entries * DANGER_ZONE));
+
+            if( mat_end[i] > system->N * system->max_sparse_matrix_entries ) {
+                fprintf( stderr, "step%d-matchk failed: i=%d end(i)=%d mat_end=%d\n",
+                        step, flag, mat_end[i], system->N * system->max_sparse_matrix_entries);
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+
+
+        /* bond list */
 #ifdef __CUDA_TEST__
-		//workspace->realloc.bonds = 1;
+        //workspace->realloc.bonds = 1;
 #endif
-		flag = -1;
-		workspace->realloc.num_bonds = 0;
-		for( i = 0; i < n-1; ++i ) {
-			workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
-			if( bonds_end[i] >= bonds_start[i+1]-2 ) {
-				workspace->realloc.bonds = 1;
-				//fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
-				//						step, i, bonds_start [i], bonds_end[i]);
-				if( bonds_end[i] > bonds_start[i+1] )
-					flag = i;
-			}
-		}
-
-		if( flag > -1 ) {
-			fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-					step, flag, bonds_end[flag], bonds_start[flag+1] );
-			exit(INSUFFICIENT_SPACE);
-		}    
-
-		workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
-		if( bonds_end[i] >= bonds->num_intrs-2 ) {
-			workspace->realloc.bonds = 1;
-			//fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
-			//						step, i, bonds_start [i], bonds_end[i]);
-
-			if( bonds_end[i] > bonds->num_intrs ) {
-				fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
-						step, flag, bonds_end[i], bonds->num_intrs );
-				exit(INSUFFICIENT_SPACE);
-			}
-		}
-
-		//fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds);
-
-		/* hbonds list */
-		if( workspace->num_H > 0 ) {
+        flag = -1;
+        workspace->realloc.num_bonds = 0;
+        for( i = 0; i < n-1; ++i ) {
+            workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
+            if( bonds_end[i] >= bonds_start[i+1]-2 ) {
+                workspace->realloc.bonds = 1;
+                //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
+                //                        step, i, bonds_start [i], bonds_end[i]);
+                if( bonds_end[i] > bonds_start[i+1] )
+                    flag = i;
+            }
+        }
+
+        if( flag > -1 ) {
+            fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d str(i+1)=%d\n",
+                    step, flag, bonds_end[flag], bonds_start[flag+1] );
+            exit(INSUFFICIENT_SPACE);
+        }    
+
+        workspace->realloc.num_bonds += MAX((bonds_end [i] - bonds_start[i]) * 2, MIN_BONDS );
+        if( bonds_end[i] >= bonds->num_intrs-2 ) {
+            workspace->realloc.bonds = 1;
+            //fprintf (stderr, "step: %d, reached the water mark for bonds for atom: %d (%d %d) \n", 
+            //                        step, i, bonds_start [i], bonds_end[i]);
+
+            if( bonds_end[i] > bonds->num_intrs ) {
+                fprintf( stderr, "step%d-bondchk failed: i=%d end(i)=%d bond_end=%d\n",
+                        step, flag, bonds_end[i], bonds->num_intrs );
+                exit(INSUFFICIENT_SPACE);
+            }
+        }
+
+        //fprintf (stderr, "step:%d Total bonds: %d \n", step, workspace->realloc.num_bonds);
+
+        /* hbonds list */
+        if( workspace->num_H > 0 ) {
 #ifdef __CUDA_TEST__
-			//workspace->realloc.hbonds = 1;
+            //workspace->realloc.hbonds = 1;
 #endif
-			flag = -1;
-			workspace->realloc.num_hbonds = 0;
-			for( i = 0; i < workspace->num_H-1; ++i ) {
-				workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
-
-				if( (hbonds_end[i] - hbonds_start[i]) >= 
-						(hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) {
-					workspace->realloc.hbonds = 1;
-					//fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
-					//						step, i, hbonds_start [i], hbonds_end[i]);
-					if( hbonds_end[i] > hbonds_start[i+1] )
-						flag = i;
-				}
-			}
-
-			if( flag > -1 ) {
-				fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n",
-						step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] );
-				exit(INSUFFICIENT_SPACE);
-			}
-
-			workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
-			if( (hbonds_end[i] - hbonds_start[i]) >= 
-					(hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) {
-				workspace->realloc.hbonds = 1;
-				//fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
-				//						step, i, hbonds_start [i], hbonds_end[i]);
-
-				if( hbonds_end[i] > hbonds->num_intrs ) {
-					fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
-							step, flag, hbonds_end[i], hbonds->num_intrs );
-					exit(INSUFFICIENT_SPACE);
-				}
-			}
-		}
-
-		//fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds);
-
-		free (bonds_start);
-		free (bonds_end );
-
-		free (hbonds_start );
-		free (hbonds_end  );
-
-		free (mat_start );
-		free (mat_end );
-	}
-
-	void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step )
-	{
-		int *thb_start, *thb_end;
-		int i, flag;
-
-		thb_start = (int *) calloc (thblist->n, INT_SIZE);
-		thb_end = (int *) calloc (thblist->n, INT_SIZE );
-
-		copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-		copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
-
-		/*three_body list*/
-		flag = -1;
-		workspace->realloc.num_3body = 0;
-		for( i = 0; i < thblist->n-1; ++i ){
-			if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) {
-				workspace->realloc.thbody = 1;
-				if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) {
-					flag = i;
-					break;
-				}
-			}
-		}
-
-		if( flag > -1 ) {
-			//fprintf( stderr, "step%d-thbchk failed: i=%d end(i)=%d str(i+1)=%d\n",
-			//   step, flag, thb_end[flag], thb_start[flag+1] );
-			fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-					step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs );
-			fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-					step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs );
-			exit(INSUFFICIENT_SPACE);
-		}    
-
-		if( (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) {
-			workspace->realloc.thbody = 1;
-
-			if( thb_end[i] > thblist->num_intrs ) {
-				fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-						step, i-1, thb_start[i-1], thb_end[i-1], thblist->num_intrs );
-				fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n",
-						step, i, thb_start[i], thb_end[i], thblist->num_intrs );
-				exit(INSUFFICIENT_SPACE);
-			}
-		}
-
-		free (thb_start);
-		free (thb_end);
-	}
-
-
-	void Init_Forces( reax_system *system, control_params *control, 
-			simulation_data *data, static_storage *workspace,
-			list **lists, output_controls *out_control ) {
-		int i, j, pj;
-		int start_i, end_i;
-		int type_i, type_j;
-		int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-		int ihb, jhb, ihb_top, jhb_top;
-		int flag;
-		real r_ij, r2, self_coef;
-		real dr3gamij_1, dr3gamij_3, Tap;
-		//real val, dif, base;
-		real C12, C34, C56;
-		real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-		real BO, BO_s, BO_pi, BO_pi2;
-		real p_boc1, p_boc2;   
-		sparse_matrix *H;
-		list *far_nbrs, *bonds, *hbonds;
-		single_body_parameters *sbp_i, *sbp_j;
-		two_body_parameters *twbp;
-		far_neighbor_data *nbr_pj;
-		//LR_lookup_table *t;
-		reax_atom *atom_i, *atom_j;
-		bond_data *ibond, *jbond;
-		bond_order_data *bo_ij, *bo_ji;
-
-		far_nbrs = *lists + FAR_NBRS;
-		bonds = *lists + BONDS;
-		hbonds = *lists + HBONDS;
-
-		H = &workspace->H;
-		Htop = 0;
-		num_bonds = 0;
-		num_hbonds = 0;
-		btop_i = btop_j = 0;
-		p_boc1 = system->reaxprm.gp.l[0];
-		p_boc2 = system->reaxprm.gp.l[1];
-
-		for( i = 0; i < system->N; ++i ) {
-			atom_i = &(system->atoms[i]);
-			type_i  = atom_i->type;
-			start_i = Start_Index(i, far_nbrs);
-			end_i   = End_Index(i, far_nbrs);
-			H->start[i] = Htop;
-			btop_i = End_Index( i, bonds );
-			sbp_i = &(system->reaxprm.sbp[type_i]);
-			ihb = ihb_top = -1;
-			if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
-				ihb_top = End_Index( workspace->hbond_index[i], hbonds );
-
-			for( pj = start_i; pj < end_i; ++pj ) {
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j = nbr_pj->nbr;
-				atom_j = &(system->atoms[j]);
-
-				flag = 0;
-				if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-					if( nbr_pj->d <= control->r_cut)
-						flag = 1;
-					else flag = 0;
-				}
-				else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-								nbr_pj->dvec))<=SQR(control->r_cut)){
-					nbr_pj->d = sqrt(nbr_pj->d);
-					flag = 1;
-				}
-
-				if( flag ){	
-					type_j = system->atoms[j].type;
-					r_ij = nbr_pj->d;
-					sbp_j = &(system->reaxprm.sbp[type_j]);
-					twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-					self_coef = (i == j) ? 0.5 : 1.0;
-
-					/* H matrix entry */
-					Tap = control->Tap7 * r_ij + control->Tap6;
-					Tap = Tap * r_ij + control->Tap5;
-					Tap = Tap * r_ij + control->Tap4;
-					Tap = Tap * r_ij + control->Tap3;
-					Tap = Tap * r_ij + control->Tap2;
-					Tap = Tap * r_ij + control->Tap1;
-					Tap = Tap * r_ij + control->Tap0;	      
-
-					dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-					dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-
-					H->entries[Htop].j = j;
-					H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
-					++Htop;
-
-					/* hydrogen bond lists */ 
-					if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-							nbr_pj->d <= control->hb_cut ) {
-						// fprintf( stderr, "%d %d\n", atom1, atom2 );
-						jhb = sbp_j->p_hbond;
-						if( ihb == 1 && jhb == 2 ) {
-							hbonds->select.hbond_list[ihb_top].nbr = j;
-							hbonds->select.hbond_list[ihb_top].scl = 1;
-							hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
-							++ihb_top;
-							++num_hbonds;
-						}
-						else if( ihb == 2 && jhb == 1 ) {
-							jhb_top = End_Index( workspace->hbond_index[j], hbonds );
-							hbonds->select.hbond_list[jhb_top].nbr = i;
-							hbonds->select.hbond_list[jhb_top].scl = -1;
-							hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-							Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
-							++num_hbonds;
-						}
-					}
-
-					/* uncorrected bond orders */
-					if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
-						r2 = SQR(r_ij);
-
-						if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-							C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-							BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-						}
-						else BO_s = C12 = 0.0;
-
-						if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-							C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-							BO_pi = EXP( C34 );
-						}
-						else BO_pi = C34 = 0.0;
-
-						if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-							C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );	
-							BO_pi2= EXP( C56 );
-						}
-						else BO_pi2 = C56 = 0.0;
-
-						/* Initially BO values are the uncorrected ones, page 1 */
-						BO = BO_s + BO_pi + BO_pi2;
-
-						if( BO >= control->bo_cut ) {
-							num_bonds += 2;
-							/****** bonds i-j and j-i ******/
-							ibond = &( bonds->select.bond_list[btop_i] );
-							btop_j = End_Index( j, bonds );
-							jbond = &(bonds->select.bond_list[btop_j]);
-
-							ibond->nbr = j;
-							jbond->nbr = i;
-							ibond->d = r_ij;
-							jbond->d = r_ij;
-							rvec_Copy( ibond->dvec, nbr_pj->dvec );
-							rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-							ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-							ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-							ibond->dbond_index = btop_i;
-							jbond->dbond_index = btop_i;
-							ibond->sym_index = btop_j;
-							jbond->sym_index = btop_i;
-							++btop_i;
-							Set_End_Index( j, btop_j+1, bonds );
-
-							bo_ij = &( ibond->bo_data );
-							bo_ji = &( jbond->bo_data );
-							bo_ji->BO = bo_ij->BO = BO;
-							bo_ji->BO_s = bo_ij->BO_s = BO_s;
-							bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
-							bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
-
-							/* Bond Order page2-3, derivative of total bond order prime */
-							Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-							Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-							Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-
-							/* Only dln_BOp_xx wrt. dr_i is stored here, note that 
-							   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-							rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-							rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-							rvec_Scale(bo_ij->dln_BOp_pi2,
-									-bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-							rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-							rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-							rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
-
-							/* Only dBOp wrt. dr_i is stored here, note that 
-							   dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-							rvec_Scale( bo_ij->dBOp, 
-									-(bo_ij->BO_s * Cln_BOp_s + 
-										bo_ij->BO_pi * Cln_BOp_pi + 
-										bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-							rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
-
-							rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
-							rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
-
-							bo_ij->BO_s -= control->bo_cut;
-							bo_ij->BO -= control->bo_cut;
-							bo_ji->BO_s -= control->bo_cut;
-							bo_ji->BO -= control->bo_cut;
-							workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
-							workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-							bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-							bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-
-							/*fprintf( stderr, "%d %d %g %g %g\n",
-							  i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
-
-							/*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", 
-							  Cln_BOp_s, twbp->p_bo2, C12 );
-							  fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", 
-							  Cln_BOp_pi, twbp->p_bo4, C34 );
-							  fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
-							  Cln_BOp_pi2, twbp->p_bo6, C56 );*/
-							/*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
-							  fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
-							  fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
-							  fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", 
-							  twbp->r_s, twbp->r_p, twbp->r_pp );
-							  fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
-
-							/*fprintf( stderr, "\tfactors: %g %g %g\n",
-							  -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + 
-							  bo_ij->BO_pi2 * Cln_BOp_pp),
-							  -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
-							/*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
-							  bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
-							  fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
-							  bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], 
-							  bo_ij->dln_BOp_pi[2] );
-							  fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
-							  bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], 
-							  bo_ij->dln_BOp_pi2[2] );*/
-
-							Set_End_Index( j, btop_j+1, bonds );
-						}
-					}
-				}
-			}
-
-			H->entries[Htop].j = i;
-			H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
-			++Htop;
-
-			Set_End_Index( i, btop_i, bonds );
-			if( ihb == 1 )
-				Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-			//fprintf( stderr, "%d bonds start: %d, end: %d\n", 
-			//     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
-		}
-
-		// mark the end of j list
-		H->start[i] = Htop; 
-		/* validate lists - decide if reallocation is required! */
-		Validate_Lists( workspace, lists, 
-				data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+            flag = -1;
+            workspace->realloc.num_hbonds = 0;
+            for( i = 0; i < workspace->num_H-1; ++i ) {
+                workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
+
+                if( (hbonds_end[i] - hbonds_start[i]) >= 
+                        (hbonds_start[i+1] - hbonds_start[i]) * DANGER_ZONE ) {
+                    workspace->realloc.hbonds = 1;
+                    //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
+                    //                        step, i, hbonds_start [i], hbonds_end[i]);
+                    if( hbonds_end[i] > hbonds_start[i+1] )
+                        flag = i;
+                }
+            }
+
+            if( flag > -1 ) {
+                fprintf( stderr, "step%d-hbondchk failed: i=%d start(i)=%d,end(i)=%d str(i+1)=%d\n",
+                        step, flag, hbonds_start[(flag)],hbonds_end[(flag)], hbonds_start[(flag+1)] );
+                exit(INSUFFICIENT_SPACE);
+            }
+
+            workspace->realloc.num_hbonds += MAX( (hbonds_end[i] - hbonds_start[i]) * SAFE_HBONDS, MIN_HBONDS );
+            if( (hbonds_end[i] - hbonds_start[i]) >= 
+                    (hbonds->num_intrs - hbonds_start[i]) * DANGER_ZONE ) {
+                workspace->realloc.hbonds = 1;
+                //fprintf (stderr, "step: %d, reached the water mark for hbonds for atom: %d (%d %d) \n", 
+                //                        step, i, hbonds_start [i], hbonds_end[i]);
+
+                if( hbonds_end[i] > hbonds->num_intrs ) {
+                    fprintf( stderr, "step%d-hbondchk failed: i=%d end(i)=%d hbondend=%d\n",
+                            step, flag, hbonds_end[i], hbonds->num_intrs );
+                    exit(INSUFFICIENT_SPACE);
+                }
+            }
+        }
+
+        //fprintf (stderr, "step:%d Total Hbonds: %d \n", step, workspace->realloc.num_hbonds);
+
+        free (bonds_start);
+        free (bonds_end );
+
+        free (hbonds_start );
+        free (hbonds_end  );
+
+        free (mat_start );
+        free (mat_end );
+    }
+
+    /* Host-side check of the three-body list: copies the per-slot start/end
+     * indices back from the device, sets workspace->realloc.thbody when a slot's
+     * usage reaches DANGER_ZONE times its capacity, and prints a diagnostic and
+     * exits with INSUFFICIENT_SPACE when the overflow test fires.
+     * step is used only in the diagnostics; system is not referenced here. */
+    void Cuda_Threebody_List( reax_system *system, static_storage *workspace, list *thblist, int step )
+    {
+        int *thb_start, *thb_end;
+        int i, flag;
+
+        workspace->realloc.num_3body = 0;
+        if( thblist->n <= 0 ) /* empty list: nothing to check, and thb_end[0] would be out of bounds */
+            return;
+
+        thb_start = (int *) calloc (thblist->n, INT_SIZE);
+        thb_end = (int *) calloc (thblist->n, INT_SIZE );
+
+        copy_host_device (thb_start, thblist->index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+        copy_host_device (thb_end, thblist->end_index, thblist->n * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__ );
+
+        /* all slots but the last: capacity is the gap to the next start. NOTE(review): overflow test uses thb_end[i+1], not thb_start[i+1] -- confirm */
+        flag = -1;
+        for( i = 0; i < thblist->n-1; ++i ){
+            if( (thb_end[i] - thb_start[i]) >= (thb_start[i+1] - thb_start[i])*DANGER_ZONE ) {
+                workspace->realloc.thbody = 1;
+                if( thb_end[i] > thb_end[i+1] || thb_end[i] > thblist->num_intrs ) {
+                    flag = i;
+                    break;
+                }
+            }
+        }
+
+        /* last slot (i == n-1 when the loop did not break): capacity is bounded by thblist->num_intrs */
+        if( flag < 0 && (thb_end[i]-thb_start[i]) >= (thblist->num_intrs - thb_start[i])*DANGER_ZONE ) {
+            workspace->realloc.thbody = 1;
+            if( thb_end[i] > thblist->num_intrs )
+                flag = i;
+        }
+
+        if( flag > -1 ) {
+            if( flag > 0 ) /* a preceding slot exists only for flag > 0; index -1 must never be read */
+                fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", step, flag-1, thb_start[flag-1], thb_end[flag-1], thblist->num_intrs );
+            fprintf( stderr, "step%d-thbchk failed: i=%d start(i)=%d end(i)=%d thb_end=%d\n", step, flag, thb_start[flag], thb_end[flag], thblist->num_intrs );
+            exit(INSUFFICIENT_SPACE);
+        }
+
+        free (thb_start);
+        free (thb_end);
+    }
+
+
+    void Init_Forces( reax_system *system, control_params *control, 
+            simulation_data *data, static_storage *workspace,
+            list **lists, output_controls *out_control ) {
+        int i, j, pj;
+        int start_i, end_i;
+        int type_i, type_j;
+        int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+        int ihb, jhb, ihb_top, jhb_top;
+        int flag;
+        real r_ij, r2, self_coef;
+        real dr3gamij_1, dr3gamij_3, Tap;
+        //real val, dif, base;
+        real C12, C34, C56;
+        real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+        real BO, BO_s, BO_pi, BO_pi2;
+        real p_boc1, p_boc2;   
+        sparse_matrix *H;
+        list *far_nbrs, *bonds, *hbonds;
+        single_body_parameters *sbp_i, *sbp_j;
+        two_body_parameters *twbp;
+        far_neighbor_data *nbr_pj;
+        //LR_lookup_table *t;
+        reax_atom *atom_i, *atom_j;
+        bond_data *ibond, *jbond;
+        bond_order_data *bo_ij, *bo_ji;
+
+        far_nbrs = *lists + FAR_NBRS;
+        bonds = *lists + BONDS;
+        hbonds = *lists + HBONDS;
+
+        H = &workspace->H;
+        Htop = 0;
+        num_bonds = 0;
+        num_hbonds = 0;
+        btop_i = btop_j = 0;
+        p_boc1 = system->reaxprm.gp.l[0];
+        p_boc2 = system->reaxprm.gp.l[1];
+
+        for( i = 0; i < system->N; ++i ) {
+            atom_i = &(system->atoms[i]);
+            type_i  = atom_i->type;
+            start_i = Start_Index(i, far_nbrs);
+            end_i   = End_Index(i, far_nbrs);
+            H->start[i] = Htop;
+            btop_i = End_Index( i, bonds );
+            sbp_i = &(system->reaxprm.sbp[type_i]);
+            ihb = ihb_top = -1;
+            if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+                ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+
+            for( pj = start_i; pj < end_i; ++pj ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
+                atom_j = &(system->atoms[j]);
+
+                flag = 0;
+                if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+                    if( nbr_pj->d <= control->r_cut)
+                        flag = 1;
+                    else flag = 0;
+                }
+                else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
+                                nbr_pj->dvec))<=SQR(control->r_cut)){
+                    nbr_pj->d = sqrt(nbr_pj->d);
+                    flag = 1;
+                }
+
+                if( flag ){    
+                    type_j = system->atoms[j].type;
+                    r_ij = nbr_pj->d;
+                    sbp_j = &(system->reaxprm.sbp[type_j]);
+                    twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
+                    self_coef = (i == j) ? 0.5 : 1.0;
+
+                    /* H matrix entry */
+                    Tap = control->Tap7 * r_ij + control->Tap6;
+                    Tap = Tap * r_ij + control->Tap5;
+                    Tap = Tap * r_ij + control->Tap4;
+                    Tap = Tap * r_ij + control->Tap3;
+                    Tap = Tap * r_ij + control->Tap2;
+                    Tap = Tap * r_ij + control->Tap1;
+                    Tap = Tap * r_ij + control->Tap0;          
+
+                    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+                    H->entries[Htop].j = j;
+                    H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+                    ++Htop;
+
+                    /* hydrogen bond lists */ 
+                    if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                            nbr_pj->d <= control->hb_cut ) {
+                        // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                        jhb = sbp_j->p_hbond;
+                        if( ihb == 1 && jhb == 2 ) {
+                            hbonds->select.hbond_list[ihb_top].nbr = j;
+                            hbonds->select.hbond_list[ihb_top].scl = 1;
+                            hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
+                            ++ihb_top;
+                            ++num_hbonds;
+                        }
+                        else if( ihb == 2 && jhb == 1 ) {
+                            jhb_top = End_Index( workspace->hbond_index[j], hbonds );
+                            hbonds->select.hbond_list[jhb_top].nbr = i;
+                            hbonds->select.hbond_list[jhb_top].scl = -1;
+                            hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
+                            Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                            ++num_hbonds;
+                        }
+                    }
+
+                    /* uncorrected bond orders */
+                    if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                        r2 = SQR(r_ij);
+
+                        if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                            C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                            BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                        }
+                        else BO_s = C12 = 0.0;
+
+                        if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                            C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                            BO_pi = EXP( C34 );
+                        }
+                        else BO_pi = C34 = 0.0;
+
+                        if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                            C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                            BO_pi2= EXP( C56 );
+                        }
+                        else BO_pi2 = C56 = 0.0;
+
+                        /* Initially BO values are the uncorrected ones, page 1 */
+                        BO = BO_s + BO_pi + BO_pi2;
+
+                        if( BO >= control->bo_cut ) {
+                            num_bonds += 2;
+                            /****** bonds i-j and j-i ******/
+                            ibond = &( bonds->select.bond_list[btop_i] );
+                            btop_j = End_Index( j, bonds );
+                            jbond = &(bonds->select.bond_list[btop_j]);
+
+                            ibond->nbr = j;
+                            jbond->nbr = i;
+                            ibond->d = r_ij;
+                            jbond->d = r_ij;
+                            rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                            rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+                            ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+                            ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                            ibond->dbond_index = btop_i;
+                            jbond->dbond_index = btop_i;
+                            ibond->sym_index = btop_j;
+                            jbond->sym_index = btop_i;
+                            ++btop_i;
+                            Set_End_Index( j, btop_j+1, bonds );
+
+                            bo_ij = &( ibond->bo_data );
+                            bo_ji = &( jbond->bo_data );
+                            bo_ji->BO = bo_ij->BO = BO;
+                            bo_ji->BO_s = bo_ij->BO_s = BO_s;
+                            bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
+                            bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
+
+                            /* Bond Order page2-3, derivative of total bond order prime */
+                            Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                            Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                            Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+
+                            /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                               dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+                            rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                            rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                            rvec_Scale(bo_ij->dln_BOp_pi2,
+                                    -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                            rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
+                            rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
+                            rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+
+                            /* Only dBOp wrt. dr_i is stored here, note that 
+                               dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+                            rvec_Scale( bo_ij->dBOp, 
+                                    -(bo_ij->BO_s * Cln_BOp_s + 
+                                        bo_ij->BO_pi * Cln_BOp_pi + 
+                                        bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                            rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
+
+                            rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
+                            rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
+
+                            bo_ij->BO_s -= control->bo_cut;
+                            bo_ij->BO -= control->bo_cut;
+                            bo_ji->BO_s -= control->bo_cut;
+                            bo_ji->BO -= control->bo_cut;
+                            workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
+                            workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
+                            bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                            bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+
+                            /*fprintf( stderr, "%d %d %g %g %g\n",
+                              i+1, j+1, bo_ij->BO, bo_ij->BO_pi, bo_ij->BO_pi2 );*/
+
+                            /*fprintf( stderr, "Cln_BOp_s: %f, pbo2: %f, C12:%f\n", 
+                              Cln_BOp_s, twbp->p_bo2, C12 );
+                              fprintf( stderr, "Cln_BOp_pi: %f, pbo4: %f, C34:%f\n", 
+                              Cln_BOp_pi, twbp->p_bo4, C34 );
+                              fprintf( stderr, "Cln_BOp_pi2: %f, pbo6: %f, C56:%f\n",
+                              Cln_BOp_pi2, twbp->p_bo6, C56 );*/
+                            /*fprintf(stderr, "pbo1: %f, pbo2:%f\n", twbp->p_bo1, twbp->p_bo2);
+                              fprintf(stderr, "pbo3: %f, pbo4:%f\n", twbp->p_bo3, twbp->p_bo4);
+                              fprintf(stderr, "pbo5: %f, pbo6:%f\n", twbp->p_bo5, twbp->p_bo6);
+                              fprintf( stderr, "r_s: %f, r_p: %f, r_pp: %f\n", 
+                              twbp->r_s, twbp->r_p, twbp->r_pp );
+                              fprintf( stderr, "C12: %g, C34:%g, C56:%g\n", C12, C34, C56 );*/
+
+                            /*fprintf( stderr, "\tfactors: %g %g %g\n",
+                              -(bo_ij->BO_s * Cln_BOp_s + bo_ij->BO_pi * Cln_BOp_pi + 
+                              bo_ij->BO_pi2 * Cln_BOp_pp),
+                              -bo_ij->BO_pi * Cln_BOp_pi, -bo_ij->BO_pi2 * Cln_BOp_pi2 );*/
+                            /*fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
+                              bo_ij->dBOp[0], bo_ij->dBOp[1], bo_ij->dBOp[2] );
+                              fprintf( stderr, "dBOpi:\t[%g, %g, %g]\n", 
+                              bo_ij->dln_BOp_pi[0], bo_ij->dln_BOp_pi[1], 
+                              bo_ij->dln_BOp_pi[2] );
+                              fprintf( stderr, "dBOpi2:\t[%g, %g, %g]\n\n",
+                              bo_ij->dln_BOp_pi2[0], bo_ij->dln_BOp_pi2[1], 
+                              bo_ij->dln_BOp_pi2[2] );*/
+
+                            Set_End_Index( j, btop_j+1, bonds );
+                        }
+                    }
+                }
+            }
+
+            H->entries[Htop].j = i;
+            H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+            ++Htop;
+
+            Set_End_Index( i, btop_i, bonds );
+            if( ihb == 1 )
+                Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
+            //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
+            //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+        }
+
+        // mark the end of j list
+        H->start[i] = Htop; 
+        /* validate lists - decide if reallocation is required! */
+        Validate_Lists( workspace, lists, 
+                data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
 
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-				data->step, Htop, num_bonds, num_hbonds );
+        fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
+                data->step, Htop, num_bonds, num_hbonds );
 
 #endif
-	}
-
-
-	GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, 
-			simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) {
-
-		int i, j, pj;
-		int start_i, end_i;
-		int type_i, type_j;
-		int Htop;
-		int flag;
-		far_neighbor_data *nbr_pj;
-		reax_atom *atom_i, *atom_j;
-
-		int temp;
-
-		Htop = 0;
-
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		atom_i = &(atoms[i]);
-		type_i  = atom_i->type;
-		start_i = Start_Index(i, &far_nbrs);
-		end_i   = End_Index(i, &far_nbrs);
-		indices[i] = Htop;
-
-		for( pj = start_i; pj < end_i; ++pj ) {
-			nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
-			atom_j = &(atoms[j]);
-
-			//CHANGE ORIGINAL
-			//if (i < j) continue;
-			//CHANGE ORIGINAL
-
-			flag = 0;
-			if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-				if( nbr_pj->d <= control->r_cut)
-					flag = 1;
-				else flag = 0;
-			}
-			else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec)) <= 	
-					SQR(control->r_cut)){
-				nbr_pj->d = sqrt(nbr_pj->d);
-				flag = 1;
-			}
-
-			if( flag ){	
-				++Htop;
-			}
-		}
-
-		++Htop;
-
-		// mark the end of j list
-		indices[i] = Htop;
-	}
-
-
-
-
-	GLOBAL void Init_Forces( reax_atom *atoms, 		global_parameters g_params, control_params *control, 
-			single_body_parameters *sbp, two_body_parameters *tbp, 
-			simulation_data *data, simulation_box *box,    static_storage workspace,
-			list far_nbrs, 			list bonds, 			   list hbonds, 
-			int N, 						int max_sparse_entries, int num_atom_types ) 
-	{
-
-		int i, j, pj;
-		int start_i, end_i;
-		int type_i, type_j;
-		int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-		int ihb, jhb, ihb_top, jhb_top;
-		int flag;
-		real r_ij, r2, self_coef;
-		real dr3gamij_1, dr3gamij_3, Tap;
-		//real val, dif, base;
-		real C12, C34, C56;
-		real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-		real BO, BO_s, BO_pi, BO_pi2;
-		real p_boc1, p_boc2;   
-		sparse_matrix *H;
-		single_body_parameters *sbp_i, *sbp_j;
-		two_body_parameters *twbp;
-		far_neighbor_data *nbr_pj;
-		//LR_lookup_table *t;
-		reax_atom *atom_i, *atom_j;
-		bond_data *ibond, *jbond;
-		bond_order_data *bo_ij, *bo_ji;
-
-		i = blockIdx.x * blockDim.x + threadIdx.x;
-		if (i >= N) return;
-
-		H = &( workspace.H );
-		//CHANGE ORIGINAL
-		//Htop = 0;
-		Htop = i * max_sparse_entries;
-		//CHANGE ORIGINAL
-		num_bonds = 0;
-		num_hbonds = 0;
-		btop_i = btop_j = 0;
-		p_boc1 = g_params.l[0];
-		p_boc2 = g_params.l[1];
-
-		//for( i = 0; i < system->N; ++i ) 
-		atom_i = &(atoms[i]);
-		type_i  = atom_i->type;
-		start_i = Start_Index(i, &far_nbrs);
-		end_i   = End_Index(i, &far_nbrs);
-
-		H->start[i] = Htop;
-		H->end[i] = Htop;
-
-		btop_i = End_Index( i, &bonds );
-		sbp_i = &(sbp[type_i]);
-		ihb = ihb_top = -1;
-
-		ihb = sbp_i->p_hbond;
-
-		if( control->hb_cut > 0 && (ihb==1 || ihb == 2))
-			ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
-
-		for( pj = start_i; pj < end_i; ++pj ) {
-			nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
-			atom_j = &(atoms[j]);
-
-			flag = 0;
-			if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-				if( nbr_pj->d <= control->r_cut)
-					flag = 1;
-				else flag = 0;
-			}
-			else if (i > j) {
-				if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-					nbr_pj->d = sqrt(nbr_pj->d);
-					flag = 1;
-				}
-			} else if (i < j) {
-				if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-					nbr_pj->d = sqrt(nbr_pj->d);
-					flag = 1;
-				}
-			}
-
-			if( flag ){	
-
-				type_j = atoms[j].type;
-				r_ij = nbr_pj->d;
-				sbp_j = &(sbp[type_j]);
-				twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]);
-				self_coef = (i == j) ? 0.5 : 1.0;
-
-				/* H matrix entry */
-
-				//CHANGE ORIGINAL
-				//if (i > j) {
-				Tap = control->Tap7 * r_ij + control->Tap6;
-				Tap = Tap * r_ij + control->Tap5;
-				Tap = Tap * r_ij + control->Tap4;
-				Tap = Tap * r_ij + control->Tap3;
-				Tap = Tap * r_ij + control->Tap2;
-				Tap = Tap * r_ij + control->Tap1;
-				Tap = Tap * r_ij + control->Tap0;	      
-
-				dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-				dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-
-				H->entries[Htop].j = j;
-				H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
-
-				++Htop;
-				//}
-				//CHANGE ORIGINAL
-
-				/* hydrogen bond lists */ 
-				if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && 
-						nbr_pj->d <= control->hb_cut ) {
-					// fprintf( stderr, "%d %d\n", atom1, atom2 );
-					jhb = sbp_j->p_hbond;
-
-					if (ihb == 1 && jhb == 2) {
-						if (i > j) {
-							hbonds.select.hbond_list[ihb_top].nbr = j;
-							hbonds.select.hbond_list[ihb_top].scl = 1;
-							hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-							//Auxilary data structures
-							rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-							hbonds.select.hbond_list[ihb_top].sym_index= -1;
-							++ihb_top;
-							++num_hbonds;
-						} else {
-							hbonds.select.hbond_list[ihb_top].nbr = j;
-							hbonds.select.hbond_list[ihb_top].scl = -1;
-							hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-							//Auxilary data structures
-							rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-							hbonds.select.hbond_list[ihb_top].sym_index= -1;
-							++ihb_top;
-							++num_hbonds;
-						}
-					} else if (ihb == 2 && jhb == 1) { 
-						hbonds.select.hbond_list[ihb_top].nbr = j; 
-						hbonds.select.hbond_list[ihb_top].scl = 1; 
-						hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-						//TODO
-						rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-						hbonds.select.hbond_list[ihb_top].sym_index= -1;
-						++ihb_top;
-						++num_hbonds;
-					} 
-				}
-
-				/* uncorrected bond orders */
-				if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
-					r2 = SQR(r_ij);
-
-					if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-						C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-						BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-					}
-					else BO_s = C12 = 0.0;
-
-					if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-						C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-						BO_pi = EXP( C34 );
-					}
-					else BO_pi = C34 = 0.0;
-
-					if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-						C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );	
-						BO_pi2= EXP( C56 );
-					}
-					else BO_pi2 = C56 = 0.0;
-
-					/* Initially BO values are the uncorrected ones, page 1 */
-					BO = BO_s + BO_pi + BO_pi2;
-
-
-					if( BO >= control->bo_cut ) {
-						//CHANGE ORIGINAL
-						num_bonds += 1;
-						//CHANGE ORIGINAL
-
-						/****** bonds i-j and j-i ******/
-
-						/* Bond Order page2-3, derivative of total bond order prime */
-						Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-						Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-						Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-
-
-						if (i > j) 
-						{
-							ibond = &( bonds.select.bond_list[btop_i] );
-							ibond->nbr = j;
-							ibond->d = r_ij;
-							rvec_Copy( ibond->dvec, nbr_pj->dvec );
-							ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-
-							//ibond->dbond_index = btop_i;
-							//ibond->sym_index = btop_j;
-							++btop_i;
-
-							bo_ij = &( ibond->bo_data );
-							bo_ij->BO = BO;
-							bo_ij->BO_s = BO_s;
-							bo_ij->BO_pi = BO_pi;
-							bo_ij->BO_pi2 = BO_pi2;
-
-							//Auxilary data structures
-							ibond->scratch = 0;
-							ibond->CdDelta_ij = 0;
-							rvec_MakeZero (ibond->f);
-
-							ibond->l = -1;
-							ibond->CdDelta_jk = 0;
-							ibond->Cdbo_kl = 0;
-							rvec_MakeZero (ibond->i_f);
-							rvec_MakeZero (ibond->k_f);
-
-							rvec_MakeZero (ibond->h_f);
-
-							rvec_MakeZero (ibond->t_f);
-
-							// Only dln_BOp_xx wrt. dr_i is stored here, note that 
-							// 	dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
-							rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-							rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-							rvec_Scale(bo_ij->dln_BOp_pi2,
-									-bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-
-							// Only dBOp wrt. dr_i is stored here, note that 
-							//	dBOp/dr_i = -dBOp/dr_j and all others are 0 
-							rvec_Scale( bo_ij->dBOp, 
-									-(bo_ij->BO_s * Cln_BOp_s + 
-										bo_ij->BO_pi * Cln_BOp_pi + 
-										bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-
-							rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
-
-							bo_ij->BO_s -= control->bo_cut;
-							bo_ij->BO -= control->bo_cut;
-							workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
-
-							bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-
-
-						} else if ( i < j )
-						{
-							rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
-							rvec dBOp;
-
-							btop_j = btop_i;
-
-							jbond = &(bonds.select.bond_list[btop_j]);
-							jbond->nbr = j;
-							jbond->d = r_ij;
-							rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-							ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-
-							btop_i ++;
-							//jbond->dbond_index = btop_i;
-							//jbond->sym_index = btop_i;
-
-							bo_ji = &( jbond->bo_data );
-							bo_ji->BO = BO;
-							bo_ji->BO_s = BO_s;
-							bo_ji->BO_pi = BO_pi;
-							bo_ji->BO_pi2 = BO_pi2;
-
-							//Auxilary data structures
-							jbond->scratch = 0;
-							jbond->CdDelta_ij = 0;
-							rvec_MakeZero (jbond->f);
-
-							jbond->l = -1;
-							jbond->CdDelta_jk = 0;
-							jbond->Cdbo_kl = 0;
-							rvec_MakeZero (jbond->i_f);
-							rvec_MakeZero (jbond->k_f);
-
-							rvec_MakeZero (jbond->h_f);
-
-							rvec_MakeZero (jbond->t_f);
-
-							// Only dln_BOp_xx wrt. dr_i is stored here, note that 
-							// dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0
-							rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
-							rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
-							rvec_Scale(dln_BOp_pi2,
-									-BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
-
-							rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
-							rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
-							rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
-
-							// Only dBOp wrt. dr_i is stored here, note that 
-							//	dBOp/dr_i = -dBOp/dr_j and all others are 0 
-							rvec_Scale( dBOp, 
-									-(BO_s * Cln_BOp_s + 
-										BO_pi * Cln_BOp_pi + 
-										BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec );
-							rvec_Scale( bo_ji->dBOp, -1., dBOp );
-
-							rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp );
-
-							bo_ji->BO_s -= control->bo_cut;
-							bo_ji->BO -= control->bo_cut;
-							workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
-
-							bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-
-						}
-					} 
-				}
-			}
-		}
-
-		H->entries[Htop].j = i;
-		H->entries[Htop].val = sbp[type_i].eta;
-		++Htop;
-
-		H->end[i] = Htop;
-
-		Set_End_Index( i, btop_i, &bonds );
-		if( ihb == 1 || ihb == 2)
-			Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
-
-		//fprintf( stderr, "%d bonds start: %d, end: %d\n", 
-		//     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
-		//}
-
-		// mark the end of j list
-		//H->start[i] = Htop; 
-		/* validate lists - decide if reallocation is required! */
-		//Validate_Lists( workspace, lists, 
-		//	  data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+    }
+    // Counting kernel: thread i = blockIdx.x * blockDim.x + threadIdx.x handles atom i (i < N) and
+    // stores in indices[i] how many far neighbors pass the r_cut test below, plus one.
+    GLOBAL void Estimate_Sparse_Matrix_Entries ( reax_atom *atoms, control_params *control, 
+            simulation_data *data, simulation_box *box, list far_nbrs, int N, int *indices ) {
+
+        int i, j, pj;
+        int start_i, end_i;
+        int type_i, type_j;
+        int Htop;
+        int flag;
+        far_neighbor_data *nbr_pj;
+        reax_atom *atom_i, *atom_j;
+        // NOTE(review): type_j and temp are never used in this kernel; type_i is assigned but never read.
+        int temp;
+
+        Htop = 0;
+
+        i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i >= N) return;
+
+        atom_i = &(atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, &far_nbrs);
+        end_i   = End_Index(i, &far_nbrs);
+        indices[i] = Htop;
+        // (the store above writes 0; it is overwritten with the final count after the loop)
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(atoms[j]);
+
+            //CHANGE ORIGINAL
+            //if (i < j) continue;
+            //CHANGE ORIGINAL
+            // Reneighbor steps test the stored d; otherwise d is overwritten and square-rooted only if within r_cut.
+            flag = 0;
+            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+                if( nbr_pj->d <= control->r_cut)
+                    flag = 1;
+                else flag = 0;
+            }
+            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec)) <=     
+                    SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+
+            if( flag ){    
+                ++Htop;
+            }
+        }
+        // One extra slot per atom -- presumably for the diagonal (self) entry; TODO confirm.
+        ++Htop;
+
+        // mark the end of j list
+        indices[i] = Htop;
+    }
+    // Init_Forces: thread i = blockIdx.x * blockDim.x + threadIdx.x processes atom i (returns if i >= N).
+    // For every far neighbor that passes the r_cut test it appends an H entry; within hb_cut it may
+    // append an hbond-list entry; within nbr_cut with BO >= bo_cut (and i != j) it appends a bond.
+    // H row i starts at i * max_sparse_entries and ends with the self entry (j = i, val = eta).
+    GLOBAL void Init_Forces( reax_atom *atoms,         global_parameters g_params, control_params *control, 
+            single_body_parameters *sbp, two_body_parameters *tbp, 
+            simulation_data *data, simulation_box *box,    static_storage workspace,
+            list far_nbrs,             list bonds,                list hbonds, 
+            int N,                         int max_sparse_entries, int num_atom_types ) 
+    {
+
+        int i, j, pj;
+        int start_i, end_i;
+        int type_i, type_j;
+        int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+        int ihb, jhb, ihb_top, jhb_top;
+        int flag;
+        real r_ij, r2, self_coef;
+        real dr3gamij_1, dr3gamij_3, Tap;
+        //real val, dif, base;
+        real C12, C34, C56;
+        real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+        real BO, BO_s, BO_pi, BO_pi2;
+        real p_boc1, p_boc2;   
+        sparse_matrix *H;
+        single_body_parameters *sbp_i, *sbp_j;
+        two_body_parameters *twbp;
+        far_neighbor_data *nbr_pj;
+        //LR_lookup_table *t;
+        reax_atom *atom_i, *atom_j;
+        bond_data *ibond, *jbond;
+        bond_order_data *bo_ij, *bo_ji;
+        // NOTE(review): jhb_top is unused; p_boc1/p_boc2, num_bonds and num_hbonds are written but never read here.
+        i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i >= N) return;
+
+        H = &( workspace.H );
+        //CHANGE ORIGINAL
+        //Htop = 0;
+        Htop = i * max_sparse_entries;
+        //CHANGE ORIGINAL
+        num_bonds = 0;
+        num_hbonds = 0;
+        btop_i = btop_j = 0;
+        p_boc1 = g_params.l[0];
+        p_boc2 = g_params.l[1];
+
+        //for( i = 0; i < system->N; ++i ) 
+        atom_i = &(atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, &far_nbrs);
+        end_i   = End_Index(i, &far_nbrs);
+        // Row i of H: start and end both begin at i * max_sparse_entries; end is rewritten after the loop.
+        H->start[i] = Htop;
+        H->end[i] = Htop;
+        // btop_i continues from the bond list's current End_Index for atom i.
+        btop_i = End_Index( i, &bonds );
+        sbp_i = &(sbp[type_i]);
+        ihb = ihb_top = -1;
+        // ihb is overwritten on the next statement; only ihb_top keeps the -1 default.
+        ihb = sbp_i->p_hbond;
+
+        if( control->hb_cut > 0 && (ihb==1 || ihb == 2))
+            ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
+
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(atoms[j]);
+            // Reneighbor steps test the stored d; otherwise d is recomputed, passing the higher-index atom first.
+            flag = 0;
+            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+                if( nbr_pj->d <= control->r_cut)
+                    flag = 1;
+                else flag = 0;
+            }
+            else if (i > j) {
+                if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                    nbr_pj->d = sqrt(nbr_pj->d);
+                    flag = 1;
+                }
+            } else if (i < j) {
+                if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                    nbr_pj->d = sqrt(nbr_pj->d);
+                    flag = 1;
+                }
+            }
+            // NOTE: for i == j on a non-reneighbor step neither branch above runs, so flag stays 0.
+            if( flag ){    
+
+                type_j = atoms[j].type;
+                r_ij = nbr_pj->d;
+                sbp_j = &(sbp[type_j]);
+                twbp = &(tbp[ index_tbp (type_i,type_j, num_atom_types) ]);
+                self_coef = (i == j) ? 0.5 : 1.0;
+
+                /* H matrix entry */
+                // Tap: degree-7 polynomial in r_ij with coefficients control->Tap7..Tap0, evaluated by Horner's rule.
+                //CHANGE ORIGINAL
+                //if (i > j) {
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;          
+
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+                H->entries[Htop].j = j;
+                H->entries[Htop].val = self_coef * Tap * EV_to_KCALpMOL / dr3gamij_3;
+
+                ++Htop;
+                //}
+                //CHANGE ORIGINAL
+
+                /* hydrogen bond lists */ 
+                if( control->hb_cut > 0 && (ihb==1 || ihb == 2) && 
+                        nbr_pj->d <= control->hb_cut ) {
+                    // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                    jhb = sbp_j->p_hbond;
+
+                    if (ihb == 1 && jhb == 2) {
+                        if (i > j) {
+                            hbonds.select.hbond_list[ihb_top].nbr = j;
+                            hbonds.select.hbond_list[ihb_top].scl = 1;
+                            hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                            //Auxiliary data structures
+                            rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                            hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                            ++ihb_top;
+                            ++num_hbonds;
+                        } else {
+                            hbonds.select.hbond_list[ihb_top].nbr = j;
+                            hbonds.select.hbond_list[ihb_top].scl = -1;
+                            hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                            //Auxiliary data structures
+                            rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                            hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                            ++ihb_top;
+                            ++num_hbonds;
+                        }
+                    } else if (ihb == 2 && jhb == 1) { 
+                        hbonds.select.hbond_list[ihb_top].nbr = j; 
+                        hbonds.select.hbond_list[ihb_top].scl = 1; 
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+                        //TODO
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    } 
+                }
+
+                /* uncorrected bond orders */
+                if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                    r2 = SQR(r_ij);
+
+                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else BO_s = C12 = 0.0;
+
+                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else BO_pi = C34 = 0.0;
+
+                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                        BO_pi2= EXP( C56 );
+                    }
+                    else BO_pi2 = C56 = 0.0;
+
+                    /* Initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+
+
+                    if( BO >= control->bo_cut ) {
+                        //CHANGE ORIGINAL
+                        num_bonds += 1;
+                        //CHANGE ORIGINAL
+
+                        /****** bonds i-j and j-i ******/
+
+                        /* Bond Order page2-3, derivative of total bond order prime */
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+
+                        // i > j: copy the neighbor's dvec / rel_box unchanged into the new bond entry.
+                        if (i > j) 
+                        {
+                            ibond = &( bonds.select.bond_list[btop_i] );
+                            ibond->nbr = j;
+                            ibond->d = r_ij;
+                            rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                            ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+
+                            //ibond->dbond_index = btop_i;
+                            //ibond->sym_index = btop_j;
+                            ++btop_i;
+
+                            bo_ij = &( ibond->bo_data );
+                            bo_ij->BO = BO;
+                            bo_ij->BO_s = BO_s;
+                            bo_ij->BO_pi = BO_pi;
+                            bo_ij->BO_pi2 = BO_pi2;
+
+                            //Auxiliary data structures
+                            ibond->scratch = 0;
+                            ibond->CdDelta_ij = 0;
+                            rvec_MakeZero (ibond->f);
+
+                            ibond->l = -1;
+                            ibond->CdDelta_jk = 0;
+                            ibond->Cdbo_kl = 0;
+                            rvec_MakeZero (ibond->i_f);
+                            rvec_MakeZero (ibond->k_f);
+
+                            rvec_MakeZero (ibond->h_f);
+
+                            rvec_MakeZero (ibond->t_f);
+
+                            // Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                            //     dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
+                            rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                            rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                            rvec_Scale(bo_ij->dln_BOp_pi2,
+                                    -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+
+                            // Only dBOp wrt. dr_i is stored here, note that 
+                            //    dBOp/dr_i = -dBOp/dr_j and all others are 0 
+                            rvec_Scale( bo_ij->dBOp, 
+                                    -(bo_ij->BO_s * Cln_BOp_s + 
+                                        bo_ij->BO_pi * Cln_BOp_pi + 
+                                        bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+
+                            rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
+
+                            bo_ij->BO_s -= control->bo_cut;
+                            bo_ij->BO -= control->bo_cut;
+                            workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
+
+                            bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+
+
+                        } else if ( i < j )
+                        {
+                            rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+                            rvec dBOp;
+                            // i < j: dvec / rel_box are stored negated; derivatives go through locals, then are sign-flipped.
+                            btop_j = btop_i;
+
+                            jbond = &(bonds.select.bond_list[btop_j]);
+                            jbond->nbr = j;
+                            jbond->d = r_ij;
+                            rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+                            ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+
+                            btop_i ++;
+                            //jbond->dbond_index = btop_i;
+                            //jbond->sym_index = btop_i;
+
+                            bo_ji = &( jbond->bo_data );
+                            bo_ji->BO = BO;
+                            bo_ji->BO_s = BO_s;
+                            bo_ji->BO_pi = BO_pi;
+                            bo_ji->BO_pi2 = BO_pi2;
+
+                            //Auxiliary data structures
+                            jbond->scratch = 0;
+                            jbond->CdDelta_ij = 0;
+                            rvec_MakeZero (jbond->f);
+
+                            jbond->l = -1;
+                            jbond->CdDelta_jk = 0;
+                            jbond->Cdbo_kl = 0;
+                            rvec_MakeZero (jbond->i_f);
+                            rvec_MakeZero (jbond->k_f);
+
+                            rvec_MakeZero (jbond->h_f);
+
+                            rvec_MakeZero (jbond->t_f);
+
+                            // Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                            // dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0
+                            rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
+                            rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
+                            rvec_Scale(dln_BOp_pi2,
+                                    -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
+
+                            rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
+                            rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
+                            rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
+
+                            // Only dBOp wrt. dr_i is stored here, note that 
+                            //    dBOp/dr_i = -dBOp/dr_j and all others are 0 
+                            rvec_Scale( dBOp, 
+                                    -(BO_s * Cln_BOp_s + 
+                                        BO_pi * Cln_BOp_pi + 
+                                        BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec );
+                            rvec_Scale( bo_ji->dBOp, -1., dBOp );
+
+                            rvec_Add( workspace.dDeltap_self[i] , bo_ji->dBOp );
+
+                            bo_ji->BO_s -= control->bo_cut;
+                            bo_ji->BO -= control->bo_cut;
+                            workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
+
+                            bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+
+                        }
+                    } 
+                }
+            }
+        }
+        // Self entry for row i: column i with value eta of atom i's type, appended after the neighbor entries.
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = sbp[type_i].eta;
+        ++Htop;
+
+        H->end[i] = Htop;
+
+        Set_End_Index( i, btop_i, &bonds );
+        if( ihb == 1 || ihb == 2)
+            Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
+        // NOTE(review): the guard above omits the hb_cut > 0 test used earlier, so ihb_top can still be -1 here.
+        //fprintf( stderr, "%d bonds start: %d, end: %d\n", 
+        //     i, Start_Index( i, bonds ), End_Index( i, bonds ) );
+        //}
+
+        // mark the end of j list
+        //H->start[i] = Htop; 
+        /* validate lists - decide if reallocation is required! */
+        //Validate_Lists( workspace, lists, 
+        //      data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
 }
 
-GLOBAL void Init_Forces_Tab ( reax_atom *atoms, 		global_parameters g_params, control_params *control, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		simulation_data *data, simulation_box *box,    static_storage workspace,
-		list far_nbrs, 			list bonds, 			   list hbonds, 
-		int N, 						int max_sparse_entries, int num_atom_types, 
-		LR_lookup_table *d_LR) 
+GLOBAL void Init_Forces_Tab ( reax_atom *atoms,         global_parameters g_params, control_params *control, 
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        simulation_data *data, simulation_box *box,    static_storage workspace,
+        list far_nbrs,             list bonds,                list hbonds, 
+        int N,                         int max_sparse_entries, int num_atom_types, 
+        LR_lookup_table *d_LR) 
 {
-	int i, j, pj;
-	int start_i, end_i;
-	int type_i, type_j;
-	int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-	int tmin, tmax, r;
-	int ihb, jhb, ihb_top, jhb_top;
-	int flag;
-	real r_ij, r2, self_coef;
-	real val, dif, base;
-	real C12, C34, C56;
-	real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-	real BO, BO_s, BO_pi, BO_pi2;
-	real p_boc1, p_boc2;   
-	sparse_matrix *H;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	LR_lookup_table *t;
-	reax_atom *atom_i, *atom_j;
-	bond_data *ibond, *jbond;
-	bond_order_data *bo_ij, *bo_ji;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	H = &(workspace.H);
-	//CHANGE ORIGINAL
-	Htop = i * max_sparse_entries;
-	//CHANGE ORIGINAL
-	num_bonds = 0;
-	num_hbonds = 0;
-	btop_i = btop_j = 0;
-	p_boc1 = g_params.l[0];
-	p_boc2 = g_params.l[1];
-
-	//for( i = 0; i < system->N; ++i )
-	atom_i = &(atoms[i]);
-	type_i  = atom_i->type;
-	start_i = Start_Index(i, &far_nbrs);
-	end_i   = End_Index(i, &far_nbrs);
-	H->start[i] = Htop;
-	H->end[i] = Htop;
-	btop_i = End_Index( i, &bonds );
-	sbp_i = &(sbp[type_i]);
-	ihb = ihb_top = -1;
-
-	ihb = sbp_i->p_hbond;
-
-	if( control->hb_cut > 0 && (ihb==1 || ihb == 2))
-		ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
-
-	for( pj = start_i; pj < end_i; ++pj ) {
-		nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-		j = nbr_pj->nbr;
-		atom_j = &(atoms[j]);
-
-		flag = 0;
-		if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-			if(nbr_pj->d <= control->r_cut)
-				flag = 1;
-			else flag = 0;
-		}
-		else if (i > j) {
-			if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-				nbr_pj->d = sqrt(nbr_pj->d);
-				flag = 1;
-			}
-		}
-		else if ( i < j) {
-			if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
-				nbr_pj->d = sqrt(nbr_pj->d);
-				flag = 1;
-			}
-		}
-
-		if( flag ){	
-			type_j = atoms[j].type;
-			r_ij = nbr_pj->d;
-			sbp_j = &(sbp[type_j]);
-			twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
-			self_coef = (i == j) ? 0.5 : 1.0;
-			tmin  = MIN( type_i, type_j );
-			tmax  = MAX( type_i, type_j );
-			t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]);	  
-
-			/* cubic spline interpolation */
-			//CHANGE ORIGINAL
-			//if (i > j) {
-			r = (int)(r_ij * t->inv_dx);
-			if( r == 0 )  ++r;
-			base = (real)(r+1) * t->dx;
-			dif = r_ij - base;
-			val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-				t->ele[r].a;
-			val *= EV_to_KCALpMOL / C_ele;
-
-			H->entries[Htop].j = j;
-			H->entries[Htop].val = self_coef * val;
-			//H->j [Htop] = j;
-			//H->val [Htop] = self_coef * val;
-			++Htop;
-			//}
-			//CHANGE ORIGINAL
-
-			/* hydrogen bond lists */ 
-			if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-					nbr_pj->d <= control->hb_cut ) {
-				// fprintf( stderr, "%d %d\n", atom1, atom2 );
-				jhb = sbp_j->p_hbond;
-
-				if ( ihb == 1 && jhb == 2 ) {
-					if (i > j) {
-						hbonds.select.hbond_list[ihb_top].nbr = j;
-						hbonds.select.hbond_list[ihb_top].scl = 1;
-						hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-						//Auxilary data structures
-						rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-						hbonds.select.hbond_list[ihb_top].sym_index= -1;
-						++ihb_top;
-						++num_hbonds;
-					} else {
-						hbonds.select.hbond_list[ihb_top].nbr = j;
-						hbonds.select.hbond_list[ihb_top].scl = -1;
-						hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-						//Auxilary data structures
-						rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-						hbonds.select.hbond_list[ihb_top].sym_index= -1;
-						++ihb_top;
-						++num_hbonds;
-					}
-				} else if (ihb == 2 && jhb == 1) {
-					hbonds.select.hbond_list[ihb_top].nbr = j;
-					hbonds.select.hbond_list[ihb_top].scl = 1;
-					hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
-
-					//Auxilary data structures
-					rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
-					hbonds.select.hbond_list[ihb_top].sym_index= -1;
-					++ihb_top;
-					++num_hbonds;
-				}
-			}
-
-			/* uncorrected bond orders */
-			if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
-				r2 = SQR(r_ij);
-
-				if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-					C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-					BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-				}
-				else BO_s = C12 = 0.0;
-
-				if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-					C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-					BO_pi = EXP( C34 );
-				}
-				else BO_pi = C34 = 0.0;
-
-				if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-					C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );	
-					BO_pi2= EXP( C56 );
-				}
-				else BO_pi2 = C56 = 0.0;
-
-				/* Initially BO values are the uncorrected ones, page 1 */
-				BO = BO_s + BO_pi + BO_pi2;
-
-				if( BO >= control->bo_cut ) {
-
-					//CHANGE ORIGINAL
-					num_bonds += 1;
-					//CHANGE ORIGINAL
-
-					/****** bonds i-j and j-i ******/
-					if ( i > j )
-					{
-						ibond = &( bonds.select.bond_list[btop_i] );
-						ibond->nbr = j;
-						ibond->d = r_ij;
-
-						rvec_Copy( ibond->dvec, nbr_pj->dvec );
-						ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-
-						//ibond->dbond_index = btop_i;
-						//ibond->sym_index = btop_j;
-
-						++btop_i;
-
-						bo_ij = &( ibond->bo_data );
-						bo_ij->BO = BO;
-						bo_ij->BO_s = BO_s;
-						bo_ij->BO_pi = BO_pi;
-						bo_ij->BO_pi2 = BO_pi2;
-
-						//Auxilary data strucutres to resolve dependencies
-						ibond->scratch = 0;
-						ibond->CdDelta_ij = 0;
-						rvec_MakeZero (ibond->f);
-
-						ibond->l = -1;
-						ibond->CdDelta_jk = 0;
-						ibond->Cdbo_kl = 0;
-						rvec_MakeZero (ibond->i_f);
-						rvec_MakeZero (ibond->k_f);
-
-						rvec_MakeZero (ibond->h_f);
-
-						rvec_MakeZero (ibond->t_f);
-
-						/* Bond Order page2-3, derivative of total bond order prime */
-						Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-						Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-						Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-
-						/* Only dln_BOp_xx wrt. dr_i is stored here, note that 
-						   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-						rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-						rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-						rvec_Scale(bo_ij->dln_BOp_pi2,
-								-bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-
-						/* Only dBOp wrt. dr_i is stored here, note that 
-						   dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-						rvec_Scale( bo_ij->dBOp, 
-								-(bo_ij->BO_s * Cln_BOp_s + 
-									bo_ij->BO_pi * Cln_BOp_pi + 
-									bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-
-						rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
-
-						bo_ij->BO_s -= control->bo_cut;
-						bo_ij->BO -= control->bo_cut;
-
-						workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
-
-						bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-					} 
-					else {
-						rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
-						rvec dBOp;
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int tmin, tmax, r;
+    int ihb, jhb, ihb_top, jhb_top;
+    int flag;
+    real r_ij, r2, self_coef;
+    real val, dif, base;
+    real C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2;   
+    sparse_matrix *H;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    reax_atom *atom_i, *atom_j;
+    bond_data *ibond, *jbond;
+    bond_order_data *bo_ij, *bo_ji;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    H = &(workspace.H);
+    //CHANGE ORIGINAL
+    Htop = i * max_sparse_entries;
+    //CHANGE ORIGINAL
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = btop_j = 0;
+    p_boc1 = g_params.l[0];
+    p_boc2 = g_params.l[1];
+
+    //for( i = 0; i < system->N; ++i )
+    atom_i = &(atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Start_Index(i, &far_nbrs);
+    end_i   = End_Index(i, &far_nbrs);
+    H->start[i] = Htop;
+    H->end[i] = Htop;
+    btop_i = End_Index( i, &bonds );
+    sbp_i = &(sbp[type_i]);
+    ihb = ihb_top = -1;
+
+    ihb = sbp_i->p_hbond;
+
+    if( control->hb_cut > 0 && (ihb==1 || ihb == 2))
+        ihb_top = End_Index( workspace.hbond_index[i], &hbonds );
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &(atoms[j]);
+
+        flag = 0;
+        if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+            if(nbr_pj->d <= control->r_cut)
+                flag = 1;
+            else flag = 0;
+        }
+        else if (i > j) {
+            if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+        }
+        else if ( i < j) {
+            if((nbr_pj->d=Sq_Distance_on_T3(atom_j->x,atom_i->x,box,nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d);
+                flag = 1;
+            }
+        }
+
+        if( flag ){    
+            type_j = atoms[j].type;
+            r_ij = nbr_pj->d;
+            sbp_j = &(sbp[type_j]);
+            twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+            self_coef = (i == j) ? 0.5 : 1.0;
+            tmin  = MIN( type_i, type_j );
+            tmax  = MAX( type_i, type_j );
+            t = &( d_LR[ index_lr (tmin, tmax, num_atom_types) ]);      
+
+            /* cubic spline interpolation */
+            //CHANGE ORIGINAL
+            //if (i > j) {
+            r = (int)(r_ij * t->inv_dx);
+            if( r == 0 )  ++r;
+            base = (real)(r+1) * t->dx;
+            dif = r_ij - base;
+            val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                t->ele[r].a;
+            val *= EV_to_KCALpMOL / C_ele;
+
+            H->entries[Htop].j = j;
+            H->entries[Htop].val = self_coef * val;
+            //H->j [Htop] = j;
+            //H->val [Htop] = self_coef * val;
+            ++Htop;
+            //}
+            //CHANGE ORIGINAL
+
+            /* hydrogen bond lists */ 
+            if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                    nbr_pj->d <= control->hb_cut ) {
+                // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                jhb = sbp_j->p_hbond;
+
+                if ( ihb == 1 && jhb == 2 ) {
+                    if (i > j) {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = 1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                        //Auxilary data structures
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    } else {
+                        hbonds.select.hbond_list[ihb_top].nbr = j;
+                        hbonds.select.hbond_list[ihb_top].scl = -1;
+                        hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                        //Auxilary data structures
+                        rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                        hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    }
+                } else if (ihb == 2 && jhb == 1) {
+                    hbonds.select.hbond_list[ihb_top].nbr = j;
+                    hbonds.select.hbond_list[ihb_top].scl = 1;
+                    hbonds.select.hbond_list[ihb_top].ptr = nbr_pj;
+
+                    //Auxilary data structures
+                    rvec_MakeZero (hbonds.select.hbond_list[ihb_top].h_f);
+                    hbonds.select.hbond_list[ihb_top].sym_index= -1;
+                    ++ihb_top;
+                    ++num_hbonds;
+                }
+            }
+
+            /* uncorrected bond orders */
+            if( far_nbrs.select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                r2 = SQR(r_ij);
+
+                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else BO_s = C12 = 0.0;
+
+                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else BO_pi = C34 = 0.0;
+
+                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                    BO_pi2= EXP( C56 );
+                }
+                else BO_pi2 = C56 = 0.0;
+
+                /* Initially BO values are the uncorrected ones, page 1 */
+                BO = BO_s + BO_pi + BO_pi2;
+
+                if( BO >= control->bo_cut ) {
+
+                    //CHANGE ORIGINAL
+                    num_bonds += 1;
+                    //CHANGE ORIGINAL
+
+                    /****** bonds i-j and j-i ******/
+                    if ( i > j )
+                    {
+                        ibond = &( bonds.select.bond_list[btop_i] );
+                        ibond->nbr = j;
+                        ibond->d = r_ij;
+
+                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+
+                        //ibond->dbond_index = btop_i;
+                        //ibond->sym_index = btop_j;
+
+                        ++btop_i;
+
+                        bo_ij = &( ibond->bo_data );
+                        bo_ij->BO = BO;
+                        bo_ij->BO_s = BO_s;
+                        bo_ij->BO_pi = BO_pi;
+                        bo_ij->BO_pi2 = BO_pi2;
+
+                        //Auxilary data strucutres to resolve dependencies
+                        ibond->scratch = 0;
+                        ibond->CdDelta_ij = 0;
+                        rvec_MakeZero (ibond->f);
+
+                        ibond->l = -1;
+                        ibond->CdDelta_jk = 0;
+                        ibond->Cdbo_kl = 0;
+                        rvec_MakeZero (ibond->i_f);
+                        rvec_MakeZero (ibond->k_f);
+
+                        rvec_MakeZero (ibond->h_f);
+
+                        rvec_MakeZero (ibond->t_f);
+
+                        /* Bond Order page2-3, derivative of total bond order prime */
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi2,
+                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+
+                        /* Only dBOp wrt. dr_i is stored here, note that 
+                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+                        rvec_Scale( bo_ij->dBOp, 
+                                -(bo_ij->BO_s * Cln_BOp_s + 
+                                    bo_ij->BO_pi * Cln_BOp_pi + 
+                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+
+                        rvec_Add( workspace.dDeltap_self[i], bo_ij->dBOp );
+
+                        bo_ij->BO_s -= control->bo_cut;
+                        bo_ij->BO -= control->bo_cut;
+
+                        workspace.total_bond_order[i] += bo_ij->BO; //currently total_BOp
+
+                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                    } 
+                    else {
+                        rvec dln_BOp_s, dln_BOp_pi, dln_BOp_pi2;
+                        rvec dBOp;
 
-						btop_j = btop_i;
+                        btop_j = btop_i;
 
-						jbond = &( bonds.select.bond_list[btop_j] );
-						jbond->nbr = j; 
-						jbond->d = r_ij;
+                        jbond = &( bonds.select.bond_list[btop_j] );
+                        jbond->nbr = j; 
+                        jbond->d = r_ij;
 
-						rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-						ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
 
-						//jbond->dbond_index = btop_i;
-						//jbond->sym_index = btop_i;
+                        //jbond->dbond_index = btop_i;
+                        //jbond->sym_index = btop_i;
 
-						++btop_i;
+                        ++btop_i;
 
-						bo_ji = &( jbond->bo_data );
+                        bo_ji = &( jbond->bo_data );
 
-						bo_ji->BO = BO;
-						bo_ji->BO_s = BO_s;
-						bo_ji->BO_pi = BO_pi;
-						bo_ji->BO_pi2 = BO_pi2;
+                        bo_ji->BO = BO;
+                        bo_ji->BO_s = BO_s;
+                        bo_ji->BO_pi = BO_pi;
+                        bo_ji->BO_pi2 = BO_pi2;
 
-						// Auxilary data structures to resolve dependencies
-						jbond->scratch = 0;
-						jbond->CdDelta_ij = 0;
-						rvec_MakeZero (jbond->f);
+                        // Auxiliary data structures to resolve dependencies
+                        jbond->scratch = 0;
+                        jbond->CdDelta_ij = 0;
+                        rvec_MakeZero (jbond->f);
 
-						jbond->l = -1;
-						jbond->CdDelta_jk = 0;
-						jbond->Cdbo_kl = 0;
-						rvec_MakeZero (jbond->i_f);
-						rvec_MakeZero (jbond->k_f);
+                        jbond->l = -1;
+                        jbond->CdDelta_jk = 0;
+                        jbond->Cdbo_kl = 0;
+                        rvec_MakeZero (jbond->i_f);
+                        rvec_MakeZero (jbond->k_f);
 
-						rvec_MakeZero (jbond->h_f);
+                        rvec_MakeZero (jbond->h_f);
 
-						rvec_MakeZero (jbond->t_f);
+                        rvec_MakeZero (jbond->t_f);
 
-						// Bond Order page2-3, derivative of total bond order prime
-						Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-						Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-						Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+                        // Bond Order page2-3, derivative of total bond order prime
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
 
-						// Only dln_BOp_xx wrt. dr_i is stored here, note that 
-						//   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
+                        // Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                        //   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 
 
-						rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
-						rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
-						rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_s,-BO_s*Cln_BOp_s,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_pi,-BO_pi*Cln_BOp_pi,nbr_pj->dvec);
+                        rvec_Scale(dln_BOp_pi2, -BO_pi2*Cln_BOp_pi2,nbr_pj->dvec);
 
-						rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
-						rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
-						rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
+                        rvec_Scale(bo_ji->dln_BOp_s, -1., dln_BOp_s);
+                        rvec_Scale(bo_ji->dln_BOp_pi, -1., dln_BOp_pi );
+                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., dln_BOp_pi2 );
 
-						// Only dBOp wrt. dr_i is stored here, note that 
-						//   dBOp/dr_i = -dBOp/dr_j and all others are 0
-						//CHANGE ORIGINAL
-						rvec_Scale( dBOp, 
-								-(BO_s * Cln_BOp_s + 
-									BO_pi * Cln_BOp_pi + 
-									BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec);
-						rvec_Scale( bo_ji->dBOp, -1., dBOp);
-						//CHANGE ORIGINAL
+                        // Only dBOp wrt. dr_i is stored here, note that 
+                        //   dBOp/dr_i = -dBOp/dr_j and all others are 0
+                        //CHANGE ORIGINAL
+                        rvec_Scale( dBOp, 
+                                -(BO_s * Cln_BOp_s + 
+                                    BO_pi * Cln_BOp_pi + 
+                                    BO_pi2 * Cln_BOp_pi2), nbr_pj->dvec);
+                        rvec_Scale( bo_ji->dBOp, -1., dBOp);
+                        //CHANGE ORIGINAL
 
-						rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp );
+                        rvec_Add( workspace.dDeltap_self[i], bo_ji->dBOp );
 
-						bo_ji->BO_s -= control->bo_cut;
-						bo_ji->BO -= control->bo_cut;
+                        bo_ji->BO_s -= control->bo_cut;
+                        bo_ji->BO -= control->bo_cut;
 
-						workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
+                        workspace.total_bond_order[i] += bo_ji->BO; //currently total_BOp
 
-						bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-					}
-				}
-			}
-		}
-	}
+                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+                    }
+                }
+            }
+        }
+    }
 
-	H->entries[Htop].j = i;
-	H->entries[Htop].val = sbp[type_i].eta;
+    H->entries[Htop].j = i;
+    H->entries[Htop].val = sbp[type_i].eta;
 
-	//H->j [Htop] = i;
-	//H->val [Htop] = sbp[type_i].eta;
+    //H->j [Htop] = i;
+    //H->val [Htop] = sbp[type_i].eta;
 
-	++Htop;
+    ++Htop;
 
-	H->end[i] = Htop;
-	Set_End_Index( i, btop_i, &bonds );
-	if( ihb == 1  || ihb == 2)
-		Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
+    H->end[i] = Htop;
+    Set_End_Index( i, btop_i, &bonds );
+    if( ihb == 1  || ihb == 2)
+        Set_End_Index( workspace.hbond_index[i], ihb_top, &hbonds );
 }
 
 GLOBAL void fix_sym_dbond_indices (list pbonds, int N)
 {
-	int i, nbr;
-	bond_data *ibond, *jbond;
-	int atom_j;
-
-	list *bonds = &pbonds;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
-	{
-		ibond = &( bonds->select.bond_list [j] );	
-		nbr = ibond->nbr;
-
-		for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++)
-		{
-			jbond = &( bonds->select.bond_list[ k ] );
-			atom_j = jbond->nbr;
-
-			if ( (atom_j == i) )
-			{
-				if (i > nbr) {
-					ibond->dbond_index = j; 
-					jbond->dbond_index = j;
-
-					ibond->sym_index = k;
-					jbond->sym_index = j;
-				}
-			}
-		}
-	}
+    int i, nbr;
+    bond_data *ibond, *jbond;
+    int atom_j;
+    // Cross-links each bond entry with the matching entry in its neighbour's bond list.
+    list *bonds = &pbonds;
+    // One index i per thread; threads whose index is at or beyond N exit immediately.
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
+    {
+        ibond = &( bonds->select.bond_list [j] );    
+        nbr = ibond->nbr;
+        // Scan nbr's bond range for the entry that points back at i.
+        for (int k = Start_Index (nbr, bonds); k < End_Index (nbr, bonds); k ++)
+        {
+            jbond = &( bonds->select.bond_list[ k ] );
+            atom_j = jbond->nbr;
+
+            if ( (atom_j == i) )
+            {
+                if (i > nbr) { // only the higher-indexed side of the pair writes, and it writes both entries
+                    ibond->dbond_index = j; 
+                    jbond->dbond_index = j;
+
+                    ibond->sym_index = k;
+                    jbond->sym_index = j;
+                }
+            }
+        }
+    }
 }
 
 
 GLOBAL void fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N)
 {
-	static_storage *workspace = &p_workspace;
-	hbond_data *ihbond, *jhbond;
-	int nbr;
-
-	//int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4;
-	int i = (blockIdx.x);
-	int start = Start_Index (workspace->hbond_index[i], &hbonds);
-	int end = End_Index (workspace->hbond_index[i], &hbonds);
-	//int j = start + threadIdx.x;
-	//int j = start + (threadIdx.x % 16);
-
-	//for (int j = Start_Index (workspace->hbond_index[i], &hbonds); 
-	//		j < End_Index (workspace->hbond_index[i], &hbonds); j++)
-	int j = start + threadIdx.x;
-	while (j < end)
-		//for (int j = start; j < end; j++)
-	{
-		ihbond = &( hbonds.select.hbond_list [j] );
-		nbr = ihbond->nbr;
-
-		int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
-		int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
-
-		for (int k = nbrstart; k < nbrend; k++)
-			//k = nbrstart + threadIdx.x;
-			//while (k < nbrend)
-		{
-			jhbond = &( hbonds.select.hbond_list [k] );
-
-			if (jhbond->nbr == i){
-				ihbond->sym_index = k;
-				jhbond->sym_index = j;
-				break;
-			}
-
-			//k += blockDim.x;
-		}
-
-		j += 32;
-	}
+    static_storage *workspace = &p_workspace;
+    hbond_data *ihbond, *jhbond;
+    int nbr;
+    // Each block handles one index i (its blockIdx.x); the block's threads split i's hbond entries.
+    //int i = (blockIdx.x * blockDim.x + threadIdx.x) >> 4;
+    int i = (blockIdx.x); // NOTE(review): N is never read in this kernel, so i is not range-checked — confirm the grid size at the launch site
+    int start = Start_Index (workspace->hbond_index[i], &hbonds);
+    int end = End_Index (workspace->hbond_index[i], &hbonds);
+    //int j = start + threadIdx.x;
+    //int j = start + (threadIdx.x % 16);
+
+    //for (int j = Start_Index (workspace->hbond_index[i], &hbonds); 
+    //        j < End_Index (workspace->hbond_index[i], &hbonds); j++)
+    int j = start + threadIdx.x;
+    while (j < end)
+        //for (int j = start; j < end; j++)
+    {
+        ihbond = &( hbonds.select.hbond_list [j] );
+        nbr = ihbond->nbr;
+        // Search nbr's hbond range for the first entry pointing back at i, then link the two entries.
+        int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
+        int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
+
+        for (int k = nbrstart; k < nbrend; k++)
+            //k = nbrstart + threadIdx.x;
+            //while (k < nbrend)
+        {
+            jhbond = &( hbonds.select.hbond_list [k] );
+
+            if (jhbond->nbr == i){
+                ihbond->sym_index = k;
+                jhbond->sym_index = j;
+                break;
+            }
+
+            //k += blockDim.x;
+        }
+
+        j += 32; // NOTE(review): fixed stride of 32 — presumably expects blockDim.x == 32; confirm against the launch site
+    }
 }
 
 GLOBAL void New_fix_sym_hbond_indices (static_storage p_workspace, list hbonds, int N )
 {
 
-	static_storage *workspace = &p_workspace;
-	hbond_data *ihbond, *jhbond;
-
-	int __THREADS_PER_ATOM__ = HBONDS_SYM_THREADS_PER_ATOM;
-	int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-	int warp_id = thread_id / __THREADS_PER_ATOM__;
-	int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1);
-	int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-	if (warp_id >= N) return;
-
-	int i = warp_id;
-	int nbr;
-	int k;
-	int start = Start_Index (workspace->hbond_index[i], &hbonds);
-	int end = End_Index (workspace->hbond_index[i], &hbonds);
-	int j = start + lane_id;
-	//for (int j = start; j < end; j++)
-	while (j < end)
-	{
-		ihbond = &( hbonds.select.hbond_list [j] );
-		nbr = ihbond->nbr;
-
-		int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
-		int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
-
-		//k = nbrstart + lane_id;
-		//if (lane_id == 0) found [my_bucket] = 0;
-		//while (k < nbrend)
-		for (k = nbrstart; k < nbrend; k++)
-		{
-			jhbond = &( hbonds.select.hbond_list [k] );
-
-			if (jhbond->nbr == i){
-				ihbond->sym_index = k;
-				jhbond->sym_index = j;
-				break;
-			}
-		}
-
-		j += __THREADS_PER_ATOM__;
-	}
+    static_storage *workspace = &p_workspace;
+    hbond_data *ihbond, *jhbond;
+    // Groups of HBONDS_SYM_THREADS_PER_ATOM consecutive threads share one index i and stride over its hbond entries.
+    int __THREADS_PER_ATOM__ = HBONDS_SYM_THREADS_PER_ATOM;
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id = thread_id / __THREADS_PER_ATOM__; // index of the thread group; used as i below
+    int lane_id = thread_id & (__THREADS_PER_ATOM__ - 1); // NOTE(review): the mask equals a modulo only for a power-of-two group size — confirm
+    int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; // only referenced by the commented-out code below
+
+    if (warp_id >= N) return;
+
+    int i = warp_id;
+    int nbr;
+    int k;
+    int start = Start_Index (workspace->hbond_index[i], &hbonds);
+    int end = End_Index (workspace->hbond_index[i], &hbonds);
+    int j = start + lane_id;
+    //for (int j = start; j < end; j++)
+    while (j < end)
+    {
+        ihbond = &( hbonds.select.hbond_list [j] );
+        nbr = ihbond->nbr;
+        // Search nbr's hbond range for the first entry pointing back at i, then link the two entries.
+        int nbrstart = Start_Index (workspace->hbond_index[nbr], &hbonds);
+        int nbrend = End_Index (workspace->hbond_index[nbr], &hbonds);
+
+        //k = nbrstart + lane_id;
+        //if (lane_id == 0) found [my_bucket] = 0;
+        //while (k < nbrend)
+        for (k = nbrstart; k < nbrend; k++)
+        {
+            jhbond = &( hbonds.select.hbond_list [k] );
+
+            if (jhbond->nbr == i){
+                ihbond->sym_index = k;
+                jhbond->sym_index = j;
+                break;
+            }
+        }
+
+        j += __THREADS_PER_ATOM__;
+    }
 }
 
 
 void Init_Forces_Tab( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace,
-		list **lists, output_controls *out_control ) {
-	int i, j, pj;
-	int start_i, end_i;
-	int type_i, type_j;
-	int Htop, btop_i, btop_j, num_bonds, num_hbonds;
-	int tmin, tmax, r;
-	int ihb, jhb, ihb_top, jhb_top;
-	int flag;
-	real r_ij, r2, self_coef;
-	real val, dif, base;
-	real C12, C34, C56;
-	real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
-	real BO, BO_s, BO_pi, BO_pi2;
-	real p_boc1, p_boc2;   
-	sparse_matrix *H;
-	list *far_nbrs, *bonds, *hbonds;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	LR_lookup_table *t;
-	reax_atom *atom_i, *atom_j;
-	bond_data *ibond, *jbond;
-	bond_order_data *bo_ij, *bo_ji;
-
-	far_nbrs = *lists + FAR_NBRS;
-	bonds = *lists + BONDS;
-	hbonds = *lists + HBONDS;
-
-	H = &workspace->H;
-	Htop = 0;
-	num_bonds = 0;
-	num_hbonds = 0;
-	btop_i = btop_j = 0;
-	p_boc1 = system->reaxprm.gp.l[0];
-	p_boc2 = system->reaxprm.gp.l[1];
-
-	for( i = 0; i < system->N; ++i ) {
-		atom_i = &(system->atoms[i]);
-		type_i  = atom_i->type;
-		start_i = Start_Index(i, far_nbrs);
-		end_i   = End_Index(i, far_nbrs);
-		H->start[i] = Htop;
-		btop_i = End_Index( i, bonds );
-		sbp_i = &(system->reaxprm.sbp[type_i]);
-		ihb = ihb_top = -1;
-		if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
-			ihb_top = End_Index( workspace->hbond_index[i], hbonds );
-
-		for( pj = start_i; pj < end_i; ++pj ) {
-			nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
-			atom_j = &(system->atoms[j]);
-
-			flag = 0;
-			if((data->step-data->prev_steps) % control->reneighbor == 0) { 
-				if(nbr_pj->d <= control->r_cut)
-					flag = 1;
-				else flag = 0;
-			}
-			else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
-							nbr_pj->dvec))<=SQR(control->r_cut)){
-				nbr_pj->d = sqrt(nbr_pj->d);
-				flag = 1;
-			}
-
-			if( flag ){	
-				type_j = system->atoms[j].type;
-				r_ij = nbr_pj->d;
-				sbp_j = &(system->reaxprm.sbp[type_j]);
-				twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-				self_coef = (i == j) ? 0.5 : 1.0;
-				tmin  = MIN( type_i, type_j );
-				tmax  = MAX( type_i, type_j );
-				t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] );	  
-
-				/* cubic spline interpolation */
-				r = (int)(r_ij * t->inv_dx);
-				if( r == 0 )  ++r;
-				base = (real)(r+1) * t->dx;
-				dif = r_ij - base;
-				val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-					t->ele[r].a;
-				val *= EV_to_KCALpMOL / C_ele;
-
-				H->entries[Htop].j = j;
-				H->entries[Htop].val = self_coef * val;
-				++Htop;
-
-				/* hydrogen bond lists */ 
-				if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-						nbr_pj->d <= control->hb_cut ) {
-					// fprintf( stderr, "%d %d\n", atom1, atom2 );
-					jhb = sbp_j->p_hbond;
-					if( ihb == 1 && jhb == 2 ) {
-						hbonds->select.hbond_list[ihb_top].nbr = j;
-						hbonds->select.hbond_list[ihb_top].scl = 1;
-						hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
-						++ihb_top;
-						++num_hbonds;
-					}
-					else if( ihb == 2 && jhb == 1 ) {
-						jhb_top = End_Index( workspace->hbond_index[j], hbonds );
-						hbonds->select.hbond_list[jhb_top].nbr = i;
-						hbonds->select.hbond_list[jhb_top].scl = -1;
-						hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
-						Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
-						++num_hbonds;
-					}
-				}
-
-				/* uncorrected bond orders */
-				if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
-					r2 = SQR(r_ij);
-
-					if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-						C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-						BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-					}
-					else BO_s = C12 = 0.0;
-
-					if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-						C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-						BO_pi = EXP( C34 );
-					}
-					else BO_pi = C34 = 0.0;
-
-					if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-						C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );	
-						BO_pi2= EXP( C56 );
-					}
-					else BO_pi2 = C56 = 0.0;
-
-					/* Initially BO values are the uncorrected ones, page 1 */
-					BO = BO_s + BO_pi + BO_pi2;
-
-					if( BO >= control->bo_cut ) {
-						num_bonds += 2;
-						/****** bonds i-j and j-i ******/
-						ibond = &( bonds->select.bond_list[btop_i] );
-						btop_j = End_Index( j, bonds );
-						jbond = &(bonds->select.bond_list[btop_j]);
-
-						ibond->nbr = j;
-						jbond->nbr = i;
-						ibond->d = r_ij;
-						jbond->d = r_ij;
-						rvec_Copy( ibond->dvec, nbr_pj->dvec );
-						rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
-						ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
-						ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
-						ibond->dbond_index = btop_i;
-						jbond->dbond_index = btop_i;
-						ibond->sym_index = btop_j;
-						jbond->sym_index = btop_i;
-						++btop_i;
-						Set_End_Index( j, btop_j+1, bonds );
-
-						bo_ij = &( ibond->bo_data );
-						bo_ji = &( jbond->bo_data );
-						bo_ji->BO = bo_ij->BO = BO;
-						bo_ji->BO_s = bo_ij->BO_s = BO_s;
-						bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
-						bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
-
-						/* Bond Order page2-3, derivative of total bond order prime */
-						Cln_BOp_s = twbp->p_bo2 * C12 / r2;
-						Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
-						Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
-
-						/* Only dln_BOp_xx wrt. dr_i is stored here, note that 
-						   dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
-						rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
-						rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
-						rvec_Scale(bo_ij->dln_BOp_pi2,
-								-bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
-						rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
-						rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
-						rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
-
-						/* Only dBOp wrt. dr_i is stored here, note that 
-						   dBOp/dr_i = -dBOp/dr_j and all others are 0 */
-						rvec_Scale( bo_ij->dBOp, 
-								-(bo_ij->BO_s * Cln_BOp_s + 
-									bo_ij->BO_pi * Cln_BOp_pi + 
-									bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
-						rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
-
-						rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
-						rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
-
-						bo_ij->BO_s -= control->bo_cut;
-						bo_ij->BO -= control->bo_cut;
-						bo_ji->BO_s -= control->bo_cut;
-						bo_ji->BO -= control->bo_cut;
-						workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
-						workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
-						bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
-						bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
-
-						Set_End_Index( j, btop_j+1, bonds );
-					}
-				}
-			}
-		}
-
-		H->entries[Htop].j = i;
-		H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
-		++Htop;
-
-		Set_End_Index( i, btop_i, bonds );
-		if( ihb == 1 )
-			Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
-	}
-
-	// mark the end of j list
-	H->start[i] = Htop; 
-	/* validate lists - decide if reallocation is required! */
-	Validate_Lists( workspace, lists, 
-			data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control ) {
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int Htop, btop_i, btop_j, num_bonds, num_hbonds;
+    int tmin, tmax, r;
+    int ihb, jhb, ihb_top, jhb_top;
+    int flag;
+    real r_ij, r2, self_coef;
+    real val, dif, base;
+    real C12, C34, C56;
+    real Cln_BOp_s, Cln_BOp_pi, Cln_BOp_pi2;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2;   
+    sparse_matrix *H;
+    list *far_nbrs, *bonds, *hbonds;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    reax_atom *atom_i, *atom_j;
+    bond_data *ibond, *jbond;
+    bond_order_data *bo_ij, *bo_ji;
+    // Single pass over the far-neighbor list that fills H (values taken from the LR lookup table), the hbond list and the bond list.
+    far_nbrs = *lists + FAR_NBRS;
+    bonds = *lists + BONDS;
+    hbonds = *lists + HBONDS;
+
+    H = &workspace->H;
+    Htop = 0;
+    num_bonds = 0;
+    num_hbonds = 0;
+    btop_i = btop_j = 0;
+    p_boc1 = system->reaxprm.gp.l[0]; // NOTE(review): p_boc1 and p_boc2 are assigned but never read in this function
+    p_boc2 = system->reaxprm.gp.l[1];
+
+    for( i = 0; i < system->N; ++i ) {
+        atom_i = &(system->atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        H->start[i] = Htop; // row i of H begins at the current fill position
+        btop_i = End_Index( i, bonds ); // new bonds are appended after entries already recorded for i
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        ihb = ihb_top = -1; // ihb keeps -1 when hb_cut <= 0, because the assignment below is short-circuited
+        if( control->hb_cut > 0 && (ihb=sbp_i->p_hbond) == 1 )
+            ihb_top = End_Index( workspace->hbond_index[i], hbonds );
+
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(system->atoms[j]);
+            // flag marks pairs within r_cut: the stored d is reused on reneighbor steps and recomputed otherwise.
+            flag = 0;
+            if((data->step-data->prev_steps) % control->reneighbor == 0) { 
+                if(nbr_pj->d <= control->r_cut)
+                    flag = 1;
+                else flag = 0;
+            }
+            else if((nbr_pj->d=Sq_Distance_on_T3(atom_i->x,atom_j->x,&(system->box),
+                            nbr_pj->dvec))<=SQR(control->r_cut)){
+                nbr_pj->d = sqrt(nbr_pj->d); // the value just stored was compared against SQR(r_cut); keep its root
+                flag = 1;
+            }
+
+            if( flag ){    
+                type_j = system->atoms[j].type;
+                r_ij = nbr_pj->d;
+                sbp_j = &(system->reaxprm.sbp[type_j]);
+                twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
+                self_coef = (i == j) ? 0.5 : 1.0; // an entry pairing i with itself is weighted by one half
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] );      
+
+                /* cubic spline interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r; // table bin 0 is never used
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+                val = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                    t->ele[r].a;
+                val *= EV_to_KCALpMOL / C_ele;
+
+                H->entries[Htop].j = j;
+                H->entries[Htop].val = self_coef * val;
+                ++Htop;
+
+                /* hydrogen bond lists */ 
+                if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                        nbr_pj->d <= control->hb_cut ) {
+                    // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                    jhb = sbp_j->p_hbond;
+                    if( ihb == 1 && jhb == 2 ) {
+                        hbonds->select.hbond_list[ihb_top].nbr = j;
+                        hbonds->select.hbond_list[ihb_top].scl = 1;
+                        hbonds->select.hbond_list[ihb_top].ptr = nbr_pj;
+                        ++ihb_top;
+                        ++num_hbonds;
+                    }
+                    else if( ihb == 2 && jhb == 1 ) { // here the entry is appended to j's hbond list, not i's
+                        jhb_top = End_Index( workspace->hbond_index[j], hbonds );
+                        hbonds->select.hbond_list[jhb_top].nbr = i;
+                        hbonds->select.hbond_list[jhb_top].scl = -1;
+                        hbonds->select.hbond_list[jhb_top].ptr = nbr_pj;
+                        Set_End_Index( workspace->hbond_index[j], jhb_top+1, hbonds );
+                        ++num_hbonds;
+                    }
+                }
+
+                /* uncorrected bond orders */
+                if( far_nbrs->select.far_nbr_list[pj].d <= control->nbr_cut ) {
+                    r2 = SQR(r_ij);
+
+                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else BO_s = C12 = 0.0;
+
+                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else BO_pi = C34 = 0.0;
+
+                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                        BO_pi2= EXP( C56 );
+                    }
+                    else BO_pi2 = C56 = 0.0;
+
+                    /* Initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+
+                    if( BO >= control->bo_cut ) {
+                        num_bonds += 2; // one entry is written into i's list and one into j's
+                        /****** bonds i-j and j-i ******/
+                        ibond = &( bonds->select.bond_list[btop_i] );
+                        btop_j = End_Index( j, bonds );
+                        jbond = &(bonds->select.bond_list[btop_j]);
+
+                        ibond->nbr = j;
+                        jbond->nbr = i;
+                        ibond->d = r_ij;
+                        jbond->d = r_ij;
+                        rvec_Copy( ibond->dvec, nbr_pj->dvec );
+                        rvec_Scale( jbond->dvec, -1, nbr_pj->dvec );
+                        ivec_Copy( ibond->rel_box, nbr_pj->rel_box );
+                        ivec_Scale( jbond->rel_box, -1, nbr_pj->rel_box );
+                        ibond->dbond_index = btop_i;
+                        jbond->dbond_index = btop_i;
+                        ibond->sym_index = btop_j;
+                        jbond->sym_index = btop_i;
+                        ++btop_i;
+                        Set_End_Index( j, btop_j+1, bonds );
+
+                        bo_ij = &( ibond->bo_data );
+                        bo_ji = &( jbond->bo_data );
+                        bo_ji->BO = bo_ij->BO = BO;
+                        bo_ji->BO_s = bo_ij->BO_s = BO_s;
+                        bo_ji->BO_pi = bo_ij->BO_pi = BO_pi;
+                        bo_ji->BO_pi2 = bo_ij->BO_pi2 = BO_pi2;
+
+                        /* Bond Order page2-3, derivative of total bond order prime */
+                        Cln_BOp_s = twbp->p_bo2 * C12 / r2;
+                        Cln_BOp_pi = twbp->p_bo4 * C34 / r2;
+                        Cln_BOp_pi2 = twbp->p_bo6 * C56 / r2;
+
+                        /* Only dln_BOp_xx wrt. dr_i is stored here, note that 
+                           dln_BOp_xx/dr_i = -dln_BOp_xx/dr_j and all others are 0 */
+                        rvec_Scale(bo_ij->dln_BOp_s,-bo_ij->BO_s*Cln_BOp_s,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi,-bo_ij->BO_pi*Cln_BOp_pi,ibond->dvec);
+                        rvec_Scale(bo_ij->dln_BOp_pi2,
+                                -bo_ij->BO_pi2*Cln_BOp_pi2,ibond->dvec);
+                        rvec_Scale(bo_ji->dln_BOp_s, -1., bo_ij->dln_BOp_s);
+                        rvec_Scale(bo_ji->dln_BOp_pi, -1., bo_ij->dln_BOp_pi );
+                        rvec_Scale(bo_ji->dln_BOp_pi2, -1., bo_ij->dln_BOp_pi2 );
+
+                        /* Only dBOp wrt. dr_i is stored here, note that 
+                           dBOp/dr_i = -dBOp/dr_j and all others are 0 */
+                        rvec_Scale( bo_ij->dBOp, 
+                                -(bo_ij->BO_s * Cln_BOp_s + 
+                                    bo_ij->BO_pi * Cln_BOp_pi + 
+                                    bo_ij->BO_pi2 * Cln_BOp_pi2), ibond->dvec );
+                        rvec_Scale( bo_ji->dBOp, -1., bo_ij->dBOp );
+
+                        rvec_Add( workspace->dDeltap_self[i], bo_ij->dBOp );
+                        rvec_Add( workspace->dDeltap_self[j], bo_ji->dBOp );
+
+                        bo_ij->BO_s -= control->bo_cut; // the stored BO_s and BO are reduced by bo_cut only after the derivatives above used the unshifted values
+                        bo_ij->BO -= control->bo_cut;
+                        bo_ji->BO_s -= control->bo_cut;
+                        bo_ji->BO -= control->bo_cut;
+                        workspace->total_bond_order[i] += bo_ij->BO; //currently total_BOp
+                        workspace->total_bond_order[j] += bo_ji->BO; //currently total_BOp
+                        bo_ij->Cdbo = bo_ij->Cdbopi = bo_ij->Cdbopi2 = 0.0;
+                        bo_ji->Cdbo = bo_ji->Cdbopi = bo_ji->Cdbopi2 = 0.0;
+
+                        Set_End_Index( j, btop_j+1, bonds ); // NOTE(review): same call with unchanged arguments as the one made right after ++btop_i; looks redundant
+                    }
+                }
+            }
+        }
+        // Every row of H is closed with an entry for i itself, holding the eta parameter of i's type.
+        H->entries[Htop].j = i;
+        H->entries[Htop].val = system->reaxprm.sbp[type_i].eta;
+        ++Htop;
+
+        Set_End_Index( i, btop_i, bonds );
+        if( ihb == 1 )
+            Set_End_Index( workspace->hbond_index[i], ihb_top, hbonds );
+    }
+
+    // mark the end of j list
+    H->start[i] = Htop; // the loop has finished, so i == system->N here
+    /* validate lists - decide if reallocation is required! */
+    Validate_Lists( workspace, lists, 
+            data->step, system->N, H->m, Htop, num_bonds, num_hbonds ); 
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
-			data->step, Htop, num_bonds, num_hbonds );
-	//Print_Bonds( system, bonds, "sbonds.out" );
-	//Print_Bond_List2( system, bonds, "sbonds.out" );
-	//Print_Sparse_Matrix2( H, "H.out" );
+    fprintf( stderr, "step%d: Htop = %d, num_bonds = %d, num_hbonds = %d\n", 
+            data->step, Htop, num_bonds, num_hbonds );
+    //Print_Bonds( system, bonds, "sbonds.out" );
+    //Print_Bond_List2( system, bonds, "sbonds.out" );
+    //Print_Sparse_Matrix2( H, "H.out" );
 #endif
 }
 
 void Estimate_Storage_Sizes( reax_system *system, control_params *control, 
-		list **lists, int *Htop, int *hb_top, 
-		int *bond_top, int *num_3body ) {
-	int i, j, pj;
-	int start_i, end_i;
-	int type_i, type_j;
-	int ihb, jhb;
-	real r_ij, r2;
-	real C12, C34, C56;
-	real BO, BO_s, BO_pi, BO_pi2;
-	real p_boc1, p_boc2; 
-	list *far_nbrs;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	reax_atom *atom_i, *atom_j;
-
-	far_nbrs = *lists + FAR_NBRS;
-	p_boc1 = system->reaxprm.gp.l[0];
-	p_boc2 = system->reaxprm.gp.l[1];
-
-	for( i = 0; i < system->N; ++i ) {
-		atom_i = &(system->atoms[i]);
-		type_i  = atom_i->type;
-		start_i = Start_Index(i, far_nbrs);
-		end_i   = End_Index(i, far_nbrs);
-		sbp_i = &(system->reaxprm.sbp[type_i]);
-		ihb = sbp_i->p_hbond;
-
-		for( pj = start_i; pj < end_i; ++pj ) {
-			nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
-			atom_j = &(system->atoms[j]);
-			type_j = atom_j->type;
-			sbp_j = &(system->reaxprm.sbp[type_j]);
-			twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-
-			if( nbr_pj->d <= control->r_cut ) {
-				++(*Htop);
-
-				/* hydrogen bond lists */ 
-				if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && 
-						nbr_pj->d <= control->hb_cut ) {
-					jhb = sbp_j->p_hbond;
-					if( ihb == 1 && jhb == 2 )
-						++hb_top[i];
-					else if( ihb == 2 && jhb == 1 )
-						++hb_top[j];
-				}
-
-				/* uncorrected bond orders */
-				if( nbr_pj->d <= control->nbr_cut ) {
-					r_ij = nbr_pj->d;
-					r2 = SQR(r_ij);
-
-					if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-						C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-						BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-					}
-					else BO_s = C12 = 0.0;
-
-					if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-						C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-						BO_pi = EXP( C34 );
-					}
-					else BO_pi = C34 = 0.0;
-
-					if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-						C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );	
-						BO_pi2= EXP( C56 );
-					}
-					else BO_pi2 = C56 = 0.0;
-
-					/* Initially BO values are the uncorrected ones, page 1 */
-					BO = BO_s + BO_pi + BO_pi2;
-
-					if( BO >= control->bo_cut ) {
-						++bond_top[i];
-						++bond_top[j];
-					}
-				}
-			}
-		}
-	}
-
-	*Htop += system->N;
-	*Htop *= SAFE_ZONE;
-
-	for( i = 0; i < system->N; ++i ) {
-		hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
-		*num_3body += SQR(bond_top[i]);
-		bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
-	}
-	*num_3body *= SAFE_ZONE;
+        list **lists, int *Htop, int *hb_top, 
+        int *bond_top, int *num_3body ) {
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    real r_ij, r2;
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2; /* NOTE(review): assigned below but never read in this function; same for r2 */
+    list *far_nbrs;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+    /* Sizing pass over the far-neighbor list: counts pairs within r_cut (*Htop), hydrogen-bond pairs (hb_top) and pairs with uncorrected BO >= bo_cut (bond_top). */
+    far_nbrs = *lists + FAR_NBRS;
+    p_boc1 = system->reaxprm.gp.l[0];
+    p_boc2 = system->reaxprm.gp.l[1];
+    /* The counters are only incremented/scaled here, never reset -- presumably the caller zeroes them first; confirm. */
+    for( i = 0; i < system->N; ++i ) {
+        atom_i = &(system->atoms[i]);
+        type_i  = atom_i->type;
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        sbp_i = &(system->reaxprm.sbp[type_i]);
+        ihb = sbp_i->p_hbond;
+
+        for( pj = start_i; pj < end_i; ++pj ) {
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+            atom_j = &(system->atoms[j]);
+            type_j = atom_j->type;
+            sbp_j = &(system->reaxprm.sbp[type_j]);
+            twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
+            /* everything below is counted only for pairs within r_cut */
+            if( nbr_pj->d <= control->r_cut ) {
+                ++(*Htop);
+
+                /* hydrogen bond lists */ 
+                if( control->hb_cut > 0.1 && (ihb==1 || ihb==2) && 
+                        nbr_pj->d <= control->hb_cut ) {
+                    jhb = sbp_j->p_hbond;
+                    if( ihb == 1 && jhb == 2 )
+                        ++hb_top[i];
+                    else if( ihb == 2 && jhb == 1 )
+                        ++hb_top[j];
+                }
+
+                /* uncorrected bond orders */
+                if( nbr_pj->d <= control->nbr_cut ) {
+                    r_ij = nbr_pj->d;
+                    r2 = SQR(r_ij);
+
+                    if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                        C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                        BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                    }
+                    else BO_s = C12 = 0.0;
+
+                    if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                        C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                        BO_pi = EXP( C34 );
+                    }
+                    else BO_pi = C34 = 0.0;
+
+                    if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                        C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                        BO_pi2= EXP( C56 );
+                    }
+                    else BO_pi2 = C56 = 0.0;
+
+                    /* Initially BO values are the uncorrected ones, page 1 */
+                    BO = BO_s + BO_pi + BO_pi2;
+
+                    if( BO >= control->bo_cut ) {
+                        ++bond_top[i];
+                        ++bond_top[j];
+                    }
+                }
+            }
+        }
+    }
+    /* add system->N to the pair count, then scale by SAFE_ZONE */
+    *Htop += system->N;
+    *Htop *= SAFE_ZONE;
+    /* pad the per-atom counts; *num_3body sums bond_top[i]^2 using the still-unpadded bond_top[i] */
+    for( i = 0; i < system->N; ++i ) {
+        hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
+        *num_3body += SQR(bond_top[i]);
+        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
+    }
+    *num_3body *= SAFE_ZONE;
 }
 
 void Cuda_Estimate_Storage_Sizes (reax_system *system, control_params *control, int *output)
 {
-	int *Htop, *num_3body, input_size;
-	int *hb_top, *bond_top;
-	int *input = (int *) scratch;
-	int max_3body = 0;
+    int *Htop, *num_3body, input_size; /* Host wrapper: runs the Estimate_Storage_Sizes kernel, copies its counters into output[], then pads them. */
+    int *hb_top, *bond_top;
+    int *input = (int *) scratch; /* `scratch` is reused as the kernel's int results buffer */
+    int max_3body = 0;
 
-	Htop = 0;
-	num_3body = 0;
-	input_size = INT_SIZE * (2 * system->N + 1 + 1);
+    Htop = 0; /* pointer nulled here; re-pointed into output[] after the copy */
+    num_3body = 0;
+    input_size = INT_SIZE * (2 * system->N + 1 + 1); /* layout: [Htop][num_3body][hb_top: N ints][bond_top: N ints] */
 
-	//cuda_malloc ((void **) &input, input_size, 1, __LINE__);
-	cuda_memset (input, 0, input_size, RES_SCRATCH );
+    //cuda_malloc ((void **) &input, input_size, 1, __LINE__);
+    cuda_memset (input, 0, input_size, RES_SCRATCH ); /* the kernel only increments, so all counters start at zero */
 
-	Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>>
-		(system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-		 system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), 
-		 system->reaxprm.num_atom_types, input);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Estimate_Storage_Sizes <<<BLOCKS_POW_2, BLOCK_SIZE>>>
+        (system->d_atoms, system->N, system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+         system->reaxprm.d_gp, (control_params *)control->d_control, *(dev_lists + FAR_NBRS), 
+         system->reaxprm.num_atom_types, input);
+    cudaThreadSynchronize (); /* NOTE(review): cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is its replacement */
+    cudaCheckError ();
 
-	copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device (output, input, input_size, cudaMemcpyDeviceToHost, __LINE__ ); /* output must hold at least 2*N + 2 ints */
 
-	Htop = &output[0];
-	num_3body  = &output[1];
-	hb_top = &output[ 2 ];
-	bond_top = &output[ 2 + system->N ];
+    Htop = &output[0];
+    num_3body  = &output[1];
+    hb_top = &output[ 2 ];
+    bond_top = &output[ 2 + system->N ];
 
-	*Htop += system->N;
-	*Htop *= SAFE_ZONE;
+    *Htop += system->N; /* same padding as the host Estimate_Storage_Sizes: add N, then scale by SAFE_ZONE */
+    *Htop *= SAFE_ZONE;
 
-	for( int i = 0; i < system->N; ++i ) {
-		hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
+    for( int i = 0; i < system->N; ++i ) {
+        hb_top[i] = MAX( hb_top[i] * SAFE_HBONDS, MIN_HBONDS );
 
-		if (max_3body <= SQR (bond_top[i]))
-			max_3body = SQR (bond_top[i]);
+        if (max_3body <= SQR (bond_top[i])) /* track the largest bond_top[i]^2 over all atoms */
+            max_3body = SQR (bond_top[i]);
 
-		*num_3body += SQR(bond_top[i]);
-		bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
-	}
+        *num_3body += SQR(bond_top[i]); /* NOTE(review): this running sum is overwritten right after the loop */
+        bond_top[i] = MAX( bond_top[i] * 2, MIN_BONDS );
+    }
 
-	*num_3body = max_3body * SAFE_ZONE;
+    *num_3body = max_3body * SAFE_ZONE; /* unlike the host version, uses the per-atom maximum rather than the sum */
 }
 
 
-GLOBAL void Estimate_Storage_Sizes 	(reax_atom *atoms, 
-		int N,
-		single_body_parameters *sbp,
-		two_body_parameters *tbp,
-		global_parameters gp, 
-		control_params *control, 
-		list far_nbrs,
-		int num_atom_types, int *results)
+GLOBAL void Estimate_Storage_Sizes     (reax_atom *atoms, 
+        int N,
+        single_body_parameters *sbp,
+        two_body_parameters *tbp,
+        global_parameters gp, 
+        control_params *control, 
+        list far_nbrs,
+        int num_atom_types, int *results)
 {
-	int *Htop = &results[0];
-	int *num_3body  = &results[1];
-	int *hb_top = &results [ 2 ];
-	int *bond_top = &results [ 2 + N ];
-
-	int i, j, pj;
-	int start_i, end_i;
-	int type_i, type_j;
-	int ihb, jhb;
-	real r_ij, r2;
-	real C12, C34, C56;
-	real BO, BO_s, BO_pi, BO_pi2;
-	real p_boc1, p_boc2; 
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	reax_atom *atom_i, *atom_j;
-
-	p_boc1 = gp.l[0];
-	p_boc2 = gp.l[1];
-
-	//for( i = 0; i < N; ++i ) {
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if (i >= N ) return ;
-
-	atom_i = &(atoms[i]);
-	type_i  = atom_i->type;
-	start_i = Start_Index(i, &far_nbrs);
-	end_i   = End_Index(i, &far_nbrs);
-	sbp_i = &(sbp[type_i]);
-	ihb = sbp_i->p_hbond;
-
-	for( pj = start_i; pj < end_i; ++pj ) {
-		nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
-		j = nbr_pj->nbr;
-		atom_j = &( atoms[j] );
-		type_j = atom_j->type;
-		sbp_j = &( sbp[type_j] );
-		twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
-
-
-		if( nbr_pj->d <= control->r_cut ) {
-			//++(*Htop);
-			atomicAdd (Htop, 1);
-
-			/* hydrogen bond lists */ 
-			//TODO - CHANGE ORIGINAL
-			if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
-					nbr_pj->d <= control->hb_cut ) {
-				jhb = sbp_j->p_hbond;
-				if( ihb == 1 && jhb == 2 )
-					//++hb_top[i];
-					atomicAdd (&hb_top[i], 1);
-				else if( ihb == 2 && jhb == 1 )
-					//++hb_top[j];
-					//atomicAdd (&hb_top[j], 1);
-					atomicAdd (&hb_top[i], 1);
-			}
-			//TODO -- CHANGE ORIGINAL
-
-			//CHANGE ORIGINAL
-			if (i < j) continue;
-			//CHANGE ORIGINAL
-
-
-			/* uncorrected bond orders */
-			if( nbr_pj->d <= control->nbr_cut ) {
-				r_ij = nbr_pj->d;
-				r2 = SQR(r_ij);
-
-				if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
-					C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
-					BO_s = (1.0 + control->bo_cut) * EXP( C12 );
-				}
-				else BO_s = C12 = 0.0;
-
-				if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
-					C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
-					BO_pi = EXP( C34 );
-				}
-				else BO_pi = C34 = 0.0;
-
-				if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
-					C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );	
-					BO_pi2= EXP( C56 );
-				}
-				else BO_pi2 = C56 = 0.0;
-
-				/* Initially BO values are the uncorrected ones, page 1 */
-				BO = BO_s + BO_pi + BO_pi2;
-
-				if( BO >= control->bo_cut ) {
-					//++bond_top[i];
-					//++bond_top[j];
-					atomicAdd (&bond_top[i], 1);
-					atomicAdd (&bond_top[j], 1);
-				}
-			}
-		}
-	}
-	//}
+    int *Htop = &results[0];
+    int *num_3body  = &results[1]; /* NOTE(review): never written anywhere in this kernel */
+    int *hb_top = &results [ 2 ];
+    int *bond_top = &results [ 2 + N ];
+    /* One thread per atom i: walks atom i's far-neighbor list and bumps the results[] counters set up above. */
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    int ihb, jhb;
+    real r_ij, r2;
+    real C12, C34, C56;
+    real BO, BO_s, BO_pi, BO_pi2;
+    real p_boc1, p_boc2; /* NOTE(review): assigned below but never read in this kernel; same for r2 */
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    reax_atom *atom_i, *atom_j;
+
+    p_boc1 = gp.l[0];
+    p_boc2 = gp.l[1];
+
+    //for( i = 0; i < N; ++i ) {
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    /* threads past the last atom have nothing to do */
+    if (i >= N ) return ;
+
+    atom_i = &(atoms[i]);
+    type_i  = atom_i->type;
+    start_i = Start_Index(i, &far_nbrs);
+    end_i   = End_Index(i, &far_nbrs);
+    sbp_i = &(sbp[type_i]);
+    ihb = sbp_i->p_hbond;
+
+    for( pj = start_i; pj < end_i; ++pj ) {
+        nbr_pj = &( far_nbrs.select.far_nbr_list[pj] );
+        j = nbr_pj->nbr;
+        atom_j = &( atoms[j] );
+        type_j = atom_j->type;
+        sbp_j = &( sbp[type_j] );
+        twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
+
+        /* atomicAdd is used because other threads update the same counters (Htop is shared by all; bond_top[j] is also touched by other atoms' threads) */
+        if( nbr_pj->d <= control->r_cut ) {
+            //++(*Htop);
+            atomicAdd (Htop, 1);
+            /* NOTE(review): the host Estimate_Storage_Sizes tests hb_cut > 0.1 and credits the second case to hb_top[j]; here the test is hb_cut > 0 and both cases go to hb_top[i] */
+            /* hydrogen bond lists */ 
+            //TODO - CHANGE ORIGINAL
+            if( control->hb_cut > 0 && (ihb==1 || ihb==2) && 
+                    nbr_pj->d <= control->hb_cut ) {
+                jhb = sbp_j->p_hbond;
+                if( ihb == 1 && jhb == 2 )
+                    //++hb_top[i];
+                    atomicAdd (&hb_top[i], 1);
+                else if( ihb == 2 && jhb == 1 )
+                    //++hb_top[j];
+                    //atomicAdd (&hb_top[j], 1);
+                    atomicAdd (&hb_top[i], 1);
+            }
+            //TODO -- CHANGE ORIGINAL
+
+            //CHANGE ORIGINAL
+            if (i < j) continue;
+            //CHANGE ORIGINAL
+            /* only pairs with i >= j reach the bond-order test; that thread increments both bond_top[i] and bond_top[j] */
+
+            /* uncorrected bond orders */
+            if( nbr_pj->d <= control->nbr_cut ) {
+                r_ij = nbr_pj->d;
+                r2 = SQR(r_ij);
+
+                if( sbp_i->r_s > 0.0 && sbp_j->r_s > 0.0) {
+                    C12 = twbp->p_bo1 * POW( r_ij / twbp->r_s, twbp->p_bo2 );
+                    BO_s = (1.0 + control->bo_cut) * EXP( C12 );
+                }
+                else BO_s = C12 = 0.0;
+
+                if( sbp_i->r_pi > 0.0 && sbp_j->r_pi > 0.0) {
+                    C34 = twbp->p_bo3 * POW( r_ij / twbp->r_p, twbp->p_bo4 );
+                    BO_pi = EXP( C34 );
+                }
+                else BO_pi = C34 = 0.0;
+
+                if( sbp_i->r_pi_pi > 0.0 && sbp_j->r_pi_pi > 0.0) {
+                    C56 = twbp->p_bo5 * POW( r_ij / twbp->r_pp, twbp->p_bo6 );    
+                    BO_pi2= EXP( C56 );
+                }
+                else BO_pi2 = C56 = 0.0;
+
+                /* Initially BO values are the uncorrected ones, page 1 */
+                BO = BO_s + BO_pi + BO_pi2;
+
+                if( BO >= control->bo_cut ) {
+                    //++bond_top[i];
+                    //++bond_top[j];
+                    atomicAdd (&bond_top[i], 1);
+                    atomicAdd (&bond_top[j], 1);
+                }
+            }
+        }
+    }
+    //}
 }
 
 void Cuda_Compute_Forces( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list** lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list** lists, output_controls *out_control )
 {
-	real t_start, t_elapsed;
-	real t_1, t_2;
-	int *indices;
-	int *Htop;
-	int max_sparse_entries = 0;
-	list *far_nbrs = dev_lists + FAR_NBRS;
-	int hblocks;
-
-	t_start = Get_Time ();
-	if ( !control->tabulate ) {
-		Init_Forces <<<BLOCKS, BLOCK_SIZE>>>
-			(system->d_atoms, 		system->reaxprm.d_gp, (control_params *)control->d_control, 
-			 system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-			 (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace,
-			 *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), 
-			 system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); 
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-	}
-	else 
-	{
-		Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>>
-			( system->d_atoms, 		system->reaxprm.d_gp, (control_params *)control->d_control, 
-			  system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
-			  (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box,  *dev_workspace,
-			  *(dev_lists + FAR_NBRS), 	*(dev_lists + BONDS), *(dev_lists + HBONDS), 
-			  system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, 
-			  d_LR );
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-	}
-
-	/*This is for bonds processing to fix dbond and sym_indexes */
-	t_1 = Get_Time ();
-	fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-	t_2 = Get_Timing_Info ( t_1 );
-
-	//FIX -1 HYDROGEN BOND fix for cases where there are no hbonds.
-	if ((control->hb_cut > 0) && (dev_workspace->num_H > 0))
-	{
-
-		hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + 
-			((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1);
-		t_1 = Get_Time ();
-		/*
-		   int bs = system->N;
-		   int ss = 32;
-		   fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
-		 */
-		New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-	}
-	t_2 = Get_Timing_Info ( t_1 );
-
-	t_elapsed = Get_Timing_Info (t_start);
-	d_timing.init_forces+= t_elapsed;
-
-	Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N,
-			system->num_bonds, system->num_hbonds );
+    real t_start, t_elapsed;
+    real t_1, t_2;
+    int *indices;
+    int *Htop;
+    int max_sparse_entries = 0;
+    list *far_nbrs = dev_lists + FAR_NBRS;
+    int hblocks; /* NOTE(review): indices, Htop, max_sparse_entries and far_nbrs are never used below */
+    /* Device force pipeline: init kernel, symmetric-index fixups, list validation, bonded and non-bonded forces, then the two total-force kernels. */
+    t_start = Get_Time ();
+    if ( !control->tabulate ) {
+        Init_Forces <<<BLOCKS, BLOCK_SIZE>>>
+            (system->d_atoms,         system->reaxprm.d_gp, (control_params *)control->d_control, 
+             system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+             (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, *dev_workspace,
+             *(dev_lists + FAR_NBRS), *(dev_lists + BONDS), *(dev_lists + HBONDS), 
+             system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types ); 
+        cudaThreadSynchronize (); /* NOTE(review): cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is its replacement */
+        cudaCheckError ();
+    }
+    else 
+    {
+        Init_Forces_Tab <<< BLOCKS, BLOCK_SIZE >>>
+            ( system->d_atoms,         system->reaxprm.d_gp, (control_params *)control->d_control, 
+              system->reaxprm.d_sbp, system->reaxprm.d_tbp, 
+              (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box,  *dev_workspace,
+              *(dev_lists + FAR_NBRS),     *(dev_lists + BONDS), *(dev_lists + HBONDS), 
+              system->N, system->max_sparse_matrix_entries, system->reaxprm.num_atom_types, 
+              d_LR );
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+
+    /*This is for bonds processing to fix dbond and sym_indexes */
+    t_1 = Get_Time ();
+    fix_sym_dbond_indices <<<BLOCKS, BLOCK_SIZE>>> (*(dev_lists + BONDS), system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    t_2 = Get_Timing_Info ( t_1 );
+
+    //FIX -1 HYDROGEN BOND fix for cases where there are no hbonds.
+    if ((control->hb_cut > 0) && (dev_workspace->num_H > 0))
+    {
+        /* hblocks = ceil( N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE ) */
+        hblocks = (system->N * HBONDS_SYM_THREADS_PER_ATOM / HBONDS_SYM_BLOCK_SIZE) + 
+            ((system->N * HBONDS_SYM_THREADS_PER_ATOM % HBONDS_SYM_BLOCK_SIZE) == 0 ? 0 : 1);
+        t_1 = Get_Time ();
+        /*
+           int bs = system->N;
+           int ss = 32;
+           fix_sym_hbond_indices <<<bs, ss>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
+         */
+        New_fix_sym_hbond_indices <<<hblocks, HBONDS_SYM_BLOCK_SIZE>>> (*dev_workspace, *(dev_lists + HBONDS), system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+    }
+    t_2 = Get_Timing_Info ( t_1 );
+    /* NOTE(review): if the hbond branch is skipped, t_1 is still the start time taken before fix_sym_dbond_indices; t_2 is never read in this function */
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.init_forces+= t_elapsed;
+    /* the init_forces interval ends above, so it does not include the list validation below */
+    Cuda_Validate_Lists( system, dev_workspace, &dev_lists, data->step, system->N,
+            system->num_bonds, system->num_hbonds );
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "Done with Cuda List Validation \n");
+    fprintf (stderr, "Done with Cuda List Validation \n");
 #endif
 
-	//Bonded Force Calculations here.
-	t_start = Get_Time ();
-	Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-	t_elapsed = Get_Timing_Info (t_start);
-	d_timing.bonded += t_elapsed;
-
-	//Compute the Non Bonded Forces here. 
-	t_start = Get_Time ();
-	Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control );
-	t_elapsed = Get_Timing_Info (t_start);
-	d_timing.nonb += t_elapsed;
-
-	//Compute Total Forces here
-	Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>>
-		(system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
-		 *(dev_lists + BONDS), control->ensemble, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>>
-		(system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
-		 *(dev_lists + BONDS), control->ensemble, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    //Bonded Force Calculations here.
+    t_start = Get_Time ();
+    Cuda_Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.bonded += t_elapsed;
+
+    //Compute the Non Bonded Forces here. 
+    t_start = Get_Time ();
+    Cuda_Compute_NonBonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info (t_start);
+    d_timing.nonb += t_elapsed;
+
+    //Compute Total Forces here
+    Cuda_Compute_Total_Force<<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
+         *(dev_lists + BONDS), control->ensemble, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+    /* second pass over the same arguments; the two total-force kernels are not timed */
+    Cuda_Compute_Total_Force_PostProcess<<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, (simulation_data *)data->d_simulation_data, *dev_workspace, 
+         *(dev_lists + BONDS), control->ensemble, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Compute_Forces( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list** lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list** lists, output_controls *out_control )
 {
-	real t_start, t_elapsed;
+    real t_start, t_elapsed; /* Host force pipeline: init, bonded, non-bonded, then total force; the first three stages are timed into data->timing. */
 
-	t_start = Get_Time( );
-	if( !control->tabulate )
-		Init_Forces( system, control, data, workspace, lists, out_control );
-	else Init_Forces_Tab( system, control, data, workspace, lists, out_control );
-	t_elapsed = Get_Timing_Info( t_start );
-	data->timing.init_forces += t_elapsed;
+    t_start = Get_Time( );
+    if( !control->tabulate ) /* control->tabulate selects the _Tab variant of the init step */
+        Init_Forces( system, control, data, workspace, lists, out_control );
+    else Init_Forces_Tab( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.init_forces += t_elapsed;
 
 #if defined(DEBUG_FOCUS)
-	print_sparse_matrix (system, workspace);
-	fprintf( stderr, "init_forces - ");
+    print_sparse_matrix (system, workspace);
+    fprintf( stderr, "init_forces - ");
 #endif
 
 
-	//analyze_hbonds (system, workspace, lists);
+    //analyze_hbonds (system, workspace, lists);
 
-	t_start = Get_Time( );
-	Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
-	t_elapsed = Get_Timing_Info( t_start );
-	data->timing.bonded += t_elapsed;
+    t_start = Get_Time( );
+    Compute_Bonded_Forces( system, control, data, workspace, lists, out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.bonded += t_elapsed;
 
-	//print_bond_list (system, workspace, lists);
-	//exit (0);
+    //print_bond_list (system, workspace, lists);
+    //exit (0);
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "bonded_forces - ");
+    fprintf( stderr, "bonded_forces - ");
 #endif
 
-	t_start = Get_Time( );
-	Compute_NonBonded_Forces( system, control, data, workspace, 
-			lists, out_control );
-	t_elapsed = Get_Timing_Info( t_start );
-	data->timing.nonb += t_elapsed;
+    t_start = Get_Time( );
+    Compute_NonBonded_Forces( system, control, data, workspace, 
+            lists, out_control );
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.nonb += t_elapsed;
 
 #ifdef __DEBUG_CUDA__
-	fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed);
+    fprintf( stderr, "non_bonded_forces - %lf \n", t_elapsed);
 #endif
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "nonbondeds - ");
+    fprintf( stderr, "nonbondeds - ");
 #endif
 
-	Compute_Total_Force( system, control, data, workspace, lists );
-	//Print_Total_Force( system, control, data, workspace, lists, out_control );
+    Compute_Total_Force( system, control, data, workspace, lists ); /* this stage is not timed */
+    //Print_Total_Force( system, control, data, workspace, lists, out_control );
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "totalforces - ");
-	//Print_Total_Force( system, control, data, workspace, lists, out_control );
+    fprintf( stderr, "totalforces - ");
+    //Print_Total_Force( system, control, data, workspace, lists, out_control );
 #endif
 
 #ifdef TEST_FORCES
-	Print_Total_Force( system, control, data, workspace, lists, out_control );
-	Compare_Total_Forces( system, control, data, workspace, lists, out_control );
+    Print_Total_Force( system, control, data, workspace, lists, out_control );
+    Compare_Total_Forces( system, control, data, workspace, lists, out_control );
 #endif
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "forces - ");
+    fprintf( stderr, "forces - ");
 #endif
 }
 
 
 bool validate_device (reax_system *system, simulation_data *data, static_storage *workspace, list **lists )
 {
-	bool retval = false;
+    bool retval = false; /* stays false (and is returned as such) when __BUILD_DEBUG__ is not defined */
 
 #ifdef __BUILD_DEBUG__
 
-	retval |= validate_neighbors (system, lists);
-	retval |= validate_sym_dbond_indices (system, workspace, lists);
-	retval |= validate_bonds (system, workspace, lists);
-	retval |= validate_sparse_matrix (system, workspace);
-	retval |= validate_three_bodies (system, workspace, lists );
-	retval |= validate_hbonds (system, workspace, lists);
-	retval |= validate_workspace (system, workspace, lists);
-	retval |= validate_data (system, data);
-	retval |= validate_atoms (system, lists);
-	//analyze_hbonds (system, workspace, lists);
-
-	if (!retval) {
-		fprintf (stderr, "Results *DOES NOT* mattch between device and host \n");
-	}
+    retval |= validate_neighbors (system, lists); /* every check below runs unconditionally; results are OR-ed into retval */
+    retval |= validate_sym_dbond_indices (system, workspace, lists);
+    retval |= validate_bonds (system, workspace, lists);
+    retval |= validate_sparse_matrix (system, workspace);
+    retval |= validate_three_bodies (system, workspace, lists );
+    retval |= validate_hbonds (system, workspace, lists);
+    retval |= validate_workspace (system, workspace, lists);
+    retval |= validate_data (system, data);
+    retval |= validate_atoms (system, lists);
+    //analyze_hbonds (system, workspace, lists);
+    /* NOTE(review): because of the OR, the message below only prints when every check returned false -- confirm the validators' return convention and whether AND was intended */
+    if (!retval) {
+        fprintf (stderr, "Results *DOES NOT* mattch between device and host \n");
+    }
 #endif
 
-	return retval;
+    return retval;
 }
diff --git a/PuReMD-GPU/src/four_body_interactions.cu b/PuReMD-GPU/src/four_body_interactions.cu
index da72bff7..d7bf757e 100644
--- a/PuReMD-GPU/src/four_body_interactions.cu
+++ b/PuReMD-GPU/src/four_body_interactions.cu
@@ -32,116 +32,116 @@
 #define MIN_SINE 1e-10
 
 HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_jk,
-		rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
-		three_body_interaction_data *p_ijk, 
-		three_body_interaction_data *p_jkl, 
-		rvec dcos_omega_di, rvec dcos_omega_dj, 
-		rvec dcos_omega_dk, rvec dcos_omega_dl, 
-		output_controls *out_control )
+        rvec dvec_kl, real r_kl, rvec dvec_li, real r_li,
+        three_body_interaction_data *p_ijk, 
+        three_body_interaction_data *p_jkl, 
+        rvec dcos_omega_di, rvec dcos_omega_dj, 
+        rvec dcos_omega_dk, rvec dcos_omega_dl, 
+        output_controls *out_control ) /* returns omega; writes dcos_omega_di/dj/dk/dl; out_control is referenced only by the commented-out debug prints */
 {
-	real unnorm_cos_omega, unnorm_sin_omega, omega;
-	real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
-	real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe;
-	real arg, poem, tel;
-	rvec cross_jk_kl;
-
-	sin_ijk = SIN( p_ijk->theta );
-	cos_ijk = COS( p_ijk->theta );
-	sin_jkl = SIN( p_jkl->theta );
-	cos_jkl = COS( p_jkl->theta );
-
-	/* omega */
-	unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) +
-		SQR( r_jk ) *  rvec_Dot( dvec_ij,dvec_kl );
-	rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
-	unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
-	omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); 
-
-	/* derivatives */
-	/* coef for adjusments to cos_theta's */
-	/* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li;
-	   coshd = cos_ijk, coshe = cos_jkl;
-	   sinhd = sin_ijk, sinhe = sin_jkl; */
-	htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk );
-	htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl;
-	htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk );
-	hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl );
-	hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk );
-	hnra = r_kl * sin_ijk * sin_jkl;
-	hnrc = r_ij * sin_ijk * sin_jkl;
-	hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
-	hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
-
-
-	poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
-	if( poem < 1e-20 ) poem = 1e-20;
-
-	tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - 
-		2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
-				r_jk * r_kl * cos_jkl );
-
-	arg  = tel / poem;
-	if( arg >  1.0 ) arg =  1.0;
-	if( arg < -1.0 ) arg = -1.0;
-
-
-	/*fprintf( out_control->etor, 
-	  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-	  htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-	  dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij );
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-	  -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk );
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-	  -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl );
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
-	  r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
-	  r_ij, r_jk, r_kl, r_li ); 
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", 
-	  cos_ijk, cos_jkl, sin_ijk, sin_jkl ); 
-	  fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-	  poem, tel, arg );*/
-	/* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-	   -p_ijk->dcos_dk[0]/sin_ijk, 
-	   -p_ijk->dcos_dk[1]/sin_ijk, 
-	   -p_ijk->dcos_dk[2]/sin_ijk );
-	   fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
-	   -p_jkl->dcos_dk[0]/sin_jkl, 
-	   -p_jkl->dcos_dk[1]/sin_jkl, 
-	   -p_jkl->dcos_dk[2]/sin_jkl );*/
-
-	if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE;
-	else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
-	if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE;
-	else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
-
-	// dcos_omega_di
-	rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
-	rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
-	rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di );
-
-	// dcos_omega_dj
-	rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
-			-htrb / r_jk, dvec_jk );
-	rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj );
-	rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di );
-	rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
-
-	// dcos_omega_dk
-	rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl,  
-			htrb / r_jk, dvec_jk );
-	rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di );
-	rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj );
-	rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
-
-	// dcos_omega_dl
-	rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li );
-	rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk );
-	rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
-
-	return omega;  
-	//return arg;
+    real unnorm_cos_omega, unnorm_sin_omega, omega;
+    real sin_ijk, cos_ijk, sin_jkl, cos_jkl;
+    real htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe;
+    real arg, poem, tel;
+    rvec cross_jk_kl;
+
+    sin_ijk = SIN( p_ijk->theta );
+    cos_ijk = COS( p_ijk->theta );
+    sin_jkl = SIN( p_jkl->theta );
+    cos_jkl = COS( p_jkl->theta );
+
+    /* omega */
+    unnorm_cos_omega = -rvec_Dot( dvec_ij,dvec_jk )*rvec_Dot( dvec_jk,dvec_kl ) +
+        SQR( r_jk ) *  rvec_Dot( dvec_ij,dvec_kl );
+    rvec_Cross( cross_jk_kl, dvec_jk, dvec_kl );
+    unnorm_sin_omega = -r_jk * rvec_Dot( dvec_ij, cross_jk_kl );
+    omega = atan2( unnorm_sin_omega, unnorm_cos_omega ); /* this is the value returned to the caller */
+
+    /* derivatives */
+    /* coefficients for adjustments to the cos_theta's */
+    /* rla = r_ij, rlb = r_jk, rlc = r_kl, r4 = r_li;
+       coshd = cos_ijk, coshe = cos_jkl;
+       sinhd = sin_ijk, sinhe = sin_jkl; */
+    htra = r_ij + cos_ijk * ( r_kl * cos_jkl - r_jk );
+    htrb = r_jk - r_ij * cos_ijk - r_kl * cos_jkl;
+    htrc = r_kl + cos_jkl * ( r_ij * cos_ijk - r_jk );
+    hthd = r_ij * sin_ijk * ( r_jk - r_kl * cos_jkl );
+    hthe = r_kl * sin_jkl * ( r_jk - r_ij * cos_ijk );
+    hnra = r_kl * sin_ijk * sin_jkl;
+    hnrc = r_ij * sin_ijk * sin_jkl;
+    hnhd = r_ij * r_kl * cos_ijk * sin_jkl;
+    hnhe = r_ij * r_kl * sin_ijk * cos_jkl;
+
+
+    poem = 2.0 * r_ij * r_kl * sin_ijk * sin_jkl;
+    if( poem < 1e-20 ) poem = 1e-20; /* floor poem: it is the divisor in tel / poem and 2.0 / poem below */
+
+    tel  = (SQR(r_ij) + SQR(r_jk) + SQR(r_kl) - SQR(r_li)) - 
+        2.0 * ( r_ij * r_jk * cos_ijk - r_ij * r_kl * cos_ijk * cos_jkl + 
+                r_jk * r_kl * cos_jkl );
+
+    arg  = tel / poem;
+    if( arg >  1.0 ) arg =  1.0; /* together with the next line: clamp arg to [-1, 1] */
+    if( arg < -1.0 ) arg = -1.0;
+
+
+    /*fprintf( out_control->etor, 
+      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+      htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      dvec_ij[0]/r_ij, dvec_ij[1]/r_ij, dvec_ij[2]/r_ij );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      -dvec_jk[0]/r_jk, -dvec_jk[1]/r_jk, -dvec_jk[2]/r_jk );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      -dvec_kl[0]/r_kl, -dvec_kl[1]/r_kl, -dvec_kl[2]/r_kl );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
+      r_li, dvec_li[0], dvec_li[1], dvec_li[2] );
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n",
+      r_ij, r_jk, r_kl, r_li ); 
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e%23.15e\n", 
+      cos_ijk, cos_jkl, sin_ijk, sin_jkl ); 
+      fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+      poem, tel, arg );*/
+    /* fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+       -p_ijk->dcos_dk[0]/sin_ijk, 
+       -p_ijk->dcos_dk[1]/sin_ijk, 
+       -p_ijk->dcos_dk[2]/sin_ijk );
+       fprintf( out_control->etor, "%23.15e%23.15e%23.15e\n",
+       -p_jkl->dcos_dk[0]/sin_jkl, 
+       -p_jkl->dcos_dk[1]/sin_jkl, 
+       -p_jkl->dcos_dk[2]/sin_jkl );*/
+
+    if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) sin_ijk = MIN_SINE; /* keep |sin_ijk| >= MIN_SINE: it is a divisor below */
+    else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) sin_ijk = -MIN_SINE;
+    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) sin_jkl = MIN_SINE; /* same guard for sin_jkl */
+    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) sin_jkl = -MIN_SINE;
+
+    // dcos_omega_di
+    rvec_ScaledSum( dcos_omega_di, (htra-arg*hnra)/r_ij, dvec_ij, -1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_di,-(hthd - arg*hnhd)/sin_ijk, p_ijk->dcos_dk );
+    rvec_Scale( dcos_omega_di, 2.0 / poem, dcos_omega_di );
+
+    // dcos_omega_dj
+    rvec_ScaledSum( dcos_omega_dj,-(htra-arg*hnra)/r_ij, dvec_ij, 
+            -htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dj,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_dj );
+    rvec_ScaledAdd( dcos_omega_dj,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_di );
+    rvec_Scale( dcos_omega_dj, 2.0 / poem, dcos_omega_dj );
+
+    // dcos_omega_dk
+    rvec_ScaledSum( dcos_omega_dk,-(htrc-arg*hnrc) / r_kl, dvec_kl,  
+            htrb / r_jk, dvec_jk );
+    rvec_ScaledAdd( dcos_omega_dk,-(hthd-arg*hnhd) / sin_ijk, p_ijk->dcos_di );
+    rvec_ScaledAdd( dcos_omega_dk,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dj );
+    rvec_Scale( dcos_omega_dk, 2.0 / poem, dcos_omega_dk );
+
+    // dcos_omega_dl
+    rvec_ScaledSum( dcos_omega_dl, (htrc-arg*hnrc) / r_kl, dvec_kl, 1., dvec_li );
+    rvec_ScaledAdd( dcos_omega_dl,-(hthe-arg*hnhe) / sin_jkl, p_jkl->dcos_dk );
+    rvec_Scale( dcos_omega_dl, 2.0 / poem, dcos_omega_dl );
+
+    return omega;  
+    //return arg;
 }
 
 
@@ -149,519 +149,519 @@ HOST_DEVICE real Calculate_Omega( rvec dvec_ij, real r_ij, rvec dvec_jk, real r_
 
 
 void Four_Body_Interactions( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, j, k, l, pi, pj, pk, pl, pij, plk;
-	int type_i, type_j, type_k, type_l;
-	int start_j, end_j, start_k, end_k;
-	int start_pj, end_pj, start_pk, end_pk;
-	int num_frb_intrs = 0;
-
-	real Delta_j, Delta_k;
-	real r_ij, r_jk, r_kl, r_li;
-	real BOA_ij, BOA_jk, BOA_kl;
-
-	real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
-	real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
-	real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
-	real fn10, f11_DjDk, dfn11, fn12;
-
-	real theta_ijk, theta_jkl;
-	real sin_ijk, sin_jkl;
-	real cos_ijk, cos_jkl;
-	real tan_ijk_i, tan_jkl_i;
-
-	real omega, cos_omega, cos2omega, cos3omega;
-	rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
-
-	real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
-	real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
-	real Cconj, CEconj1, CEconj2, CEconj3;
-	real CEconj4, CEconj5, CEconj6;
-
-	real e_tor, e_con;
-	rvec dvec_li;
-	rvec force, ext_press;
-	ivec rel_box_jl;
-	// rtensor total_rtensor, temp_rtensor;
-
-	four_body_header *fbh;
-	four_body_parameters *fbp;
-	bond_data *pbond_ij, *pbond_jk, *pbond_kl;
-	bond_order_data *bo_ij, *bo_jk, *bo_kl;
-	three_body_interaction_data *p_ijk, *p_jkl;
-
-	real p_tor2 = system->reaxprm.gp.l[23];
-	real p_tor3 = system->reaxprm.gp.l[24];
-	real p_tor4 = system->reaxprm.gp.l[25];
-	real p_cot2 = system->reaxprm.gp.l[27];
-
-	list *bonds = (*lists) + BONDS;
-	list *thb_intrs = (*lists) + THREE_BODIES;
-
-
-	for( j = 0; j < system->N; ++j ) {
-		type_j = system->atoms[j].type;
-		Delta_j = workspace->Delta_boc[j];
-		start_j = Start_Index(j, bonds);
-		end_j = End_Index(j, bonds);
-
-
-		for( pk = start_j; pk < end_j; ++pk ) {
-			pbond_jk = &( bonds->select.bond_list[pk] );
-			k = pbond_jk->nbr;
-			bo_jk = &( pbond_jk->bo_data );
-			BOA_jk = bo_jk->BO - control->thb_cut;
-
-			/* see if there are any 3-body interactions involving j&k
-			   where j is the central atom. Otherwise there is no point in
-			   trying to form a 4-body interaction out of this neighborhood */	
-			if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
-					Num_Entries(pk, thb_intrs) ) {
-				start_k = Start_Index(k, bonds);
-				end_k = End_Index(k, bonds);	    	       
-				pj = pbond_jk->sym_index; // pj points to j on k's list
-
-				/* do the same check as above: are there any 3-body interactions 
-				   involving k&j where k is the central atom */
-				if( Num_Entries(pj, thb_intrs) ) {
-					type_k = system->atoms[k].type;
-					Delta_k = workspace->Delta_boc[k];
-					r_jk = pbond_jk->d;
-
-					start_pk = Start_Index(pk, thb_intrs );
-					end_pk = End_Index(pk, thb_intrs );
-					start_pj = Start_Index(pj, thb_intrs );
-					end_pj = End_Index(pj, thb_intrs );		
-
-					exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
-					exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
-					exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
-					exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
-					exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
-					f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
-
-
-					/* pick i up from j-k interaction where j is the centre atom */
-					for( pi = start_pk; pi < end_pk; ++pi ) {
-						p_ijk = &( thb_intrs->select.three_body_list[pi] );
-						pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
-						pbond_ij = &( bonds->select.bond_list[pij] );
-						bo_ij = &( pbond_ij->bo_data );
-
-
-						if( bo_ij->BO > control->thb_cut/*0*/ ) {
-							i = p_ijk->thb;
-							type_i = system->atoms[i].type;
-							r_ij = pbond_ij->d;
-							BOA_ij = bo_ij->BO - control->thb_cut;
-
-							theta_ijk = p_ijk->theta;
-							sin_ijk = SIN( theta_ijk );
-							cos_ijk = COS( theta_ijk );
-							//tan_ijk_i = 1. / TAN( theta_ijk );
-							if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
-								tan_ijk_i = cos_ijk / MIN_SINE;
-							else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
-								tan_ijk_i = cos_ijk / -MIN_SINE;
-							else tan_ijk_i = cos_ijk / sin_ijk;
-
-							exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-							exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
-
-							/* pick l up from j-k intr. where k is the centre */
-							for( pl = start_pj; pl < end_pj; ++pl ) {
-								p_jkl = &( thb_intrs->select.three_body_list[pl] );
-								l = p_jkl->thb;
-								plk = p_jkl->pthb; //pointer to l on k's bond_list!
-								pbond_kl = &( bonds->select.bond_list[plk] );
-								bo_kl = &( pbond_kl->bo_data );
-								type_l = system->atoms[l].type;
-								fbh = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm ) ]);
-								fbp = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm )].prm[0]);
-
-								if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
-										bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
-									++num_frb_intrs;
-									r_kl = pbond_kl->d;
-									BOA_kl = bo_kl->BO - control->thb_cut;
-
-									theta_jkl = p_jkl->theta;
-									sin_jkl = SIN( theta_jkl );
-									cos_jkl = COS( theta_jkl );
-									//tan_jkl_i = 1. / TAN( theta_jkl );
-									if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
-										tan_jkl_i = cos_jkl / MIN_SINE;
-									else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
-										tan_jkl_i = cos_jkl / -MIN_SINE;
-									else tan_jkl_i = cos_jkl /sin_jkl;
-
-									Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, 
-											&(system->box), dvec_li );
-									r_li = rvec_Norm( dvec_li );
-
-
-									/* omega and its derivative */
-									//cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
-									omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
-											r_jk, pbond_kl->dvec, r_kl,
-											dvec_li, r_li, p_ijk, p_jkl,
-											dcos_omega_di, dcos_omega_dj,
-											dcos_omega_dk, dcos_omega_dl,
-											out_control);
-									cos_omega = COS( omega );
-									cos2omega = COS( 2. * omega );
-									cos3omega = COS( 3. * omega );
-									/* end omega calculations */
-
-									/* torsion energy */
-									exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
-									exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-									exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
-									fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
-										(1.0 - exp_tor2_kl);
-
-									CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
-											fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-											fbp->V3 * (1.0 + cos3omega) );
-									//CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
-									//  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
-									//  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
-
-									data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
-
-									dfn11 = (-p_tor3 * exp_tor3_DjDk +
-											(p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-											(2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
-
-									CEtors1 = sin_ijk * sin_jkl * CV;
-
-									CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
-										(2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
-										sin_ijk * sin_jkl; 
-
-									CEtors3 = CEtors2 * dfn11;
-
-									CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
-										(1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
-
-									CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
-										(1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
-
-									CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
-										(1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
-
-									cmn = -fn10 * CV;
-									CEtors7 = cmn * sin_jkl * tan_ijk_i;
-									CEtors8 = cmn * sin_ijk * tan_jkl_i;
-									CEtors9 = fn10 * sin_ijk * sin_jkl * 
-										(0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-										 1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
-									//cmn = -fn10 * CV;
-									//CEtors7 = cmn * sin_jkl * cos_ijk;
-									//CEtors8 = cmn * sin_ijk * cos_jkl;
-									//CEtors9 = fn10 * sin_ijk * sin_jkl * 
-									//  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-									//   fbp->V3 * (6*SQR(cos_omega) - 1.50));
-									/* end  of torsion energy */
-
-
-									/* 4-body conjugation energy */
-									fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-									data->E_Con += e_con = fbp->p_cot1 * fn12 * 
-										(1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-
-									Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
-										(1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-
-									CEconj1 = Cconj * (BOA_ij - 1.5e0);
-									CEconj2 = Cconj * (BOA_jk - 1.5e0);
-									CEconj3 = Cconj * (BOA_kl - 1.5e0);
-
-									CEconj4 = -fbp->p_cot1 * fn12 * 
-										(SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-									CEconj5 = -fbp->p_cot1 * fn12 * 
-										(SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-									//CEconj4 = -fbp->p_cot1 * fn12 * 
-									//  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
-									//CEconj5 = -fbp->p_cot1 * fn12 * 
-									//  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
-									CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
-										cos_omega * sin_ijk * sin_jkl;
-									/* end 4-body conjugation energy */
-
-									//fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
-									//   workspace->orig_id[i], workspace->orig_id[j],
-									//       workspace->orig_id[k], workspace->orig_id[l], 
-									//    omega, cos_omega, cos2omega, cos3omega );
-									//fprintf(stdout, 
-									//    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-									//    CEtors2, CEtors3, CEtors4, CEtors5, 
-									//    CEtors6, CEtors7, CEtors8, CEtors9 );
-									//fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-									//    theta_ijk, theta_jkl, sin_ijk, 
-									//    sin_jkl, cos_jkl, tan_jkl_i );
-
-									/* forces */
-									bo_jk->Cdbopi += CEtors2;
-									workspace->CdDelta[j] += CEtors3;
-									workspace->CdDelta[k] += CEtors3;
-									bo_ij->Cdbo += (CEtors4 + CEconj1);
-									bo_jk->Cdbo += (CEtors5 + CEconj2);
-
-									bo_kl->Cdbo += (CEtors6 + CEconj3);
-
-									if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-										/* dcos_theta_ijk */
-										rvec_ScaledAdd( system->atoms[i].f, 
-												CEtors7 + CEconj4, p_ijk->dcos_dk );
-										rvec_ScaledAdd( system->atoms[j].f, 
-												CEtors7 + CEconj4, p_ijk->dcos_dj );
-										rvec_ScaledAdd( system->atoms[k].f, 
-												CEtors7 + CEconj4, p_ijk->dcos_di );
-
-										/* dcos_theta_jkl */
-										rvec_ScaledAdd( system->atoms[j].f, 
-												CEtors8 + CEconj5, p_jkl->dcos_di );
-										rvec_ScaledAdd( system->atoms[k].f, 
-												CEtors8 + CEconj5, p_jkl->dcos_dj );
-										rvec_ScaledAdd( system->atoms[l].f, 
-												CEtors8 + CEconj5, p_jkl->dcos_dk );
-
-										/* dcos_omega */
-										rvec_ScaledAdd( system->atoms[i].f, 
-												CEtors9 + CEconj6, dcos_omega_di );
-										rvec_ScaledAdd( system->atoms[j].f, 
-												CEtors9 + CEconj6, dcos_omega_dj );
-										rvec_ScaledAdd( system->atoms[k].f, 
-												CEtors9 + CEconj6, dcos_omega_dk );
-										rvec_ScaledAdd( system->atoms[l].f, 
-												CEtors9 + CEconj6, dcos_omega_dl );
-									}
-									else {
-										ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
-
-										/* dcos_theta_ijk */
-										rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-										rvec_Add( system->atoms[i].f, force );
-										rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-										rvec_Add( data->ext_press, ext_press );
-
-										rvec_ScaledAdd( system->atoms[j].f, 
-												CEtors7 + CEconj4, p_ijk->dcos_dj );
-
-										rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-										rvec_Add( system->atoms[k].f, force );
-										rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-										rvec_Add( data->ext_press, ext_press );
-
-
-										/* dcos_theta_jkl */
-										rvec_ScaledAdd( system->atoms[j].f, 
-												CEtors8 + CEconj5, p_jkl->dcos_di );
-
-										rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-										rvec_Add( system->atoms[k].f, force );
-										rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-										rvec_Add( data->ext_press, ext_press );
-
-										rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-										rvec_Add( system->atoms[l].f, force );
-										rvec_iMultiply( ext_press, rel_box_jl, force );
-										rvec_Add( data->ext_press, ext_press );
-
-
-										/* dcos_omega */				      
-										rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-										rvec_Add( system->atoms[i].f, force );
-										rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-										rvec_Add( data->ext_press, ext_press );
-
-										rvec_ScaledAdd( system->atoms[j].f, 
-												CEtors9 + CEconj6, dcos_omega_dj );
-
-										rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-										rvec_Add( system->atoms[k].f, force );
-										rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-										rvec_Add( data->ext_press, ext_press );
-
-										rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-										rvec_Add( system->atoms[l].f, force );
-										rvec_iMultiply( ext_press, rel_box_jl, force );
-										rvec_Add( data->ext_press, ext_press );
-
-
-										/* This part is intended for a fully-flexible box */
-										/* rvec_ScaledSum( temp_rvec, 
-										   CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
-										   CEtors9 + CEconj6, dcos_omega_di );
-										   rvec_OuterProduct( temp_rtensor, 
-										   temp_rvec, system->atoms[i].x );
-										   rtensor_Copy( total_rtensor, temp_rtensor );
-
-										   rvec_ScaledSum( temp_rvec, 
-										   CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
-										   CEtors8 + CEconj5, p_jkl->dcos_di );
-										   rvec_ScaledAdd( temp_rvec, 
-										   CEtors9 + CEconj6, dcos_omega_dj );
-										   rvec_OuterProduct( temp_rtensor, 
-										   temp_rvec, system->atoms[j].x );
-										   rtensor_Add( total_rtensor, temp_rtensor );
-
-										   rvec_ScaledSum( temp_rvec, 
-										   CEtors7 + CEconj4, p_ijk->dcos_di,      // k
-										   CEtors8 + CEconj5, p_jkl->dcos_dj );
-										   rvec_ScaledAdd( temp_rvec, 
-										   CEtors9 + CEconj6, dcos_omega_dk );
-										   rvec_OuterProduct( temp_rtensor, 
-										   temp_rvec, system->atoms[k].x );
-										   rtensor_Add( total_rtensor, temp_rtensor );
-
-										   rvec_ScaledSum( temp_rvec, 
-										   CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
-										   CEtors9 + CEconj6, dcos_omega_dl );
-										   rvec_OuterProduct( temp_rtensor, 
-										   temp_rvec, system->atoms[l].x );
-										   rtensor_Copy( total_rtensor, temp_rtensor );
-
-										   if( pbond_ij->imaginary || pbond_jk->imaginary || 
-										   pbond_kl->imaginary )
-										   rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
-										   else
-										   rtensor_Add( data->flex_bar.P, total_rtensor ); */
-									}
+    int i, j, k, l, pi, pj, pk, pl, pij, plk;
+    int type_i, type_j, type_k, type_l;
+    int start_j, end_j, start_k, end_k;
+    int start_pj, end_pj, start_pk, end_pk;
+    int num_frb_intrs = 0;
+
+    real Delta_j, Delta_k;
+    real r_ij, r_jk, r_kl, r_li;
+    real BOA_ij, BOA_jk, BOA_kl;
+
+    real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
+    real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
+    real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
+    real fn10, f11_DjDk, dfn11, fn12;
+
+    real theta_ijk, theta_jkl;
+    real sin_ijk, sin_jkl;
+    real cos_ijk, cos_jkl;
+    real tan_ijk_i, tan_jkl_i;
+
+    real omega, cos_omega, cos2omega, cos3omega;
+    rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
+
+    real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
+    real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
+    real Cconj, CEconj1, CEconj2, CEconj3;
+    real CEconj4, CEconj5, CEconj6;
+
+    real e_tor, e_con;
+    rvec dvec_li;
+    rvec force, ext_press;
+    ivec rel_box_jl;
+    // rtensor total_rtensor, temp_rtensor;
+
+    four_body_header *fbh;
+    four_body_parameters *fbp;
+    bond_data *pbond_ij, *pbond_jk, *pbond_kl;
+    bond_order_data *bo_ij, *bo_jk, *bo_kl;
+    three_body_interaction_data *p_ijk, *p_jkl;
+
+    real p_tor2 = system->reaxprm.gp.l[23];
+    real p_tor3 = system->reaxprm.gp.l[24];
+    real p_tor4 = system->reaxprm.gp.l[25];
+    real p_cot2 = system->reaxprm.gp.l[27];
+
+    list *bonds = (*lists) + BONDS;
+    list *thb_intrs = (*lists) + THREE_BODIES;
+
+
+    for( j = 0; j < system->N; ++j ) {
+        type_j = system->atoms[j].type;
+        Delta_j = workspace->Delta_boc[j];
+        start_j = Start_Index(j, bonds);
+        end_j = End_Index(j, bonds);
+
+
+        for( pk = start_j; pk < end_j; ++pk ) {
+            pbond_jk = &( bonds->select.bond_list[pk] );
+            k = pbond_jk->nbr;
+            bo_jk = &( pbond_jk->bo_data );
+            BOA_jk = bo_jk->BO - control->thb_cut;
+
+            /* see if there are any 3-body interactions involving j&k
+               where j is the central atom. Otherwise there is no point in
+               trying to form a 4-body interaction out of this neighborhood */    
+            if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
+                    Num_Entries(pk, thb_intrs) ) {
+                start_k = Start_Index(k, bonds);
+                end_k = End_Index(k, bonds);                   
+                pj = pbond_jk->sym_index; // pj points to j on k's list
+
+                /* do the same check as above: are there any 3-body interactions 
+                   involving k&j where k is the central atom */
+                if( Num_Entries(pj, thb_intrs) ) {
+                    type_k = system->atoms[k].type;
+                    Delta_k = workspace->Delta_boc[k];
+                    r_jk = pbond_jk->d;
+
+                    start_pk = Start_Index(pk, thb_intrs );
+                    end_pk = End_Index(pk, thb_intrs );
+                    start_pj = Start_Index(pj, thb_intrs );
+                    end_pj = End_Index(pj, thb_intrs );        
+
+                    exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
+                    exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
+                    exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
+                    exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
+                    exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
+                    f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+
+
+                    /* pick i up from j-k interaction where j is the centre atom */
+                    for( pi = start_pk; pi < end_pk; ++pi ) {
+                        p_ijk = &( thb_intrs->select.three_body_list[pi] );
+                        pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
+                        pbond_ij = &( bonds->select.bond_list[pij] );
+                        bo_ij = &( pbond_ij->bo_data );
+
+
+                        if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                            i = p_ijk->thb;
+                            type_i = system->atoms[i].type;
+                            r_ij = pbond_ij->d;
+                            BOA_ij = bo_ij->BO - control->thb_cut;
+
+                            theta_ijk = p_ijk->theta;
+                            sin_ijk = SIN( theta_ijk );
+                            cos_ijk = COS( theta_ijk );
+                            //tan_ijk_i = 1. / TAN( theta_ijk );
+                            if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                                tan_ijk_i = cos_ijk / MIN_SINE;
+                            else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                                tan_ijk_i = cos_ijk / -MIN_SINE;
+                            else tan_ijk_i = cos_ijk / sin_ijk;
+
+                            exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
+                            exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
+
+                            /* pick l up from j-k intr. where k is the centre */
+                            for( pl = start_pj; pl < end_pj; ++pl ) {
+                                p_jkl = &( thb_intrs->select.three_body_list[pl] );
+                                l = p_jkl->thb;
+                                plk = p_jkl->pthb; //pointer to l on k's bond_list!
+                                pbond_kl = &( bonds->select.bond_list[plk] );
+                                bo_kl = &( pbond_kl->bo_data );
+                                type_l = system->atoms[l].type;
+                                fbh = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm ) ]);
+                                fbp = &(system->reaxprm.fbp[ index_fbp (type_i,type_j,type_k,type_l,&system->reaxprm )].prm[0]);
+
+                                if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
+                                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                                    ++num_frb_intrs;
+                                    r_kl = pbond_kl->d;
+                                    BOA_kl = bo_kl->BO - control->thb_cut;
+
+                                    theta_jkl = p_jkl->theta;
+                                    sin_jkl = SIN( theta_jkl );
+                                    cos_jkl = COS( theta_jkl );
+                                    //tan_jkl_i = 1. / TAN( theta_jkl );
+                                    if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                        tan_jkl_i = cos_jkl / MIN_SINE;
+                                    else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                        tan_jkl_i = cos_jkl / -MIN_SINE;
+                                    else tan_jkl_i = cos_jkl /sin_jkl;
+
+                                    Sq_Distance_on_T3( system->atoms[l].x, system->atoms[i].x, 
+                                            &(system->box), dvec_li );
+                                    r_li = rvec_Norm( dvec_li );
+
+
+                                    /* omega and its derivative */
+                                    //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
+                                    omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
+                                            r_jk, pbond_kl->dvec, r_kl,
+                                            dvec_li, r_li, p_ijk, p_jkl,
+                                            dcos_omega_di, dcos_omega_dj,
+                                            dcos_omega_dk, dcos_omega_dl,
+                                            out_control);
+                                    cos_omega = COS( omega );
+                                    cos2omega = COS( 2. * omega );
+                                    cos3omega = COS( 3. * omega );
+                                    /* end omega calculations */
+
+                                    /* torsion energy */
+                                    exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
+                                    exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
+                                    exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
+                                    fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
+                                        (1.0 - exp_tor2_kl);
+
+                                    CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
+                                            fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                            fbp->V3 * (1.0 + cos3omega) );
+                                    //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
+                                    //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
+                                    //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
+
+                                    data->E_Tor += e_tor = fn10 * sin_ijk * sin_jkl * CV;
+
+                                    dfn11 = (-p_tor3 * exp_tor3_DjDk +
+                                            (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                            (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
+
+                                    CEtors1 = sin_ijk * sin_jkl * CV;
+
+                                    CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
+                                        (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
+                                        sin_ijk * sin_jkl; 
+
+                                    CEtors3 = CEtors2 * dfn11;
+
+                                    CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
+                                        (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+
+                                    CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
+                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
+
+                                    CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
+                                        (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
+
+                                    cmn = -fn10 * CV;
+                                    CEtors7 = cmn * sin_jkl * tan_ijk_i;
+                                    CEtors8 = cmn * sin_ijk * tan_jkl_i;
+                                    CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                        (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                         1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
+                                    //cmn = -fn10 * CV;
+                                    //CEtors7 = cmn * sin_jkl * cos_ijk;
+                                    //CEtors8 = cmn * sin_ijk * cos_jkl;
+                                    //CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                    //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                    //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
+                                    /* end  of torsion energy */
+
+
+                                    /* 4-body conjugation energy */
+                                    fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
+                                    data->E_Con += e_con = fbp->p_cot1 * fn12 * 
+                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+
+                                    Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
+                                        (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+
+                                    CEconj1 = Cconj * (BOA_ij - 1.5e0);
+                                    CEconj2 = Cconj * (BOA_jk - 1.5e0);
+                                    CEconj3 = Cconj * (BOA_kl - 1.5e0);
+
+                                    CEconj4 = -fbp->p_cot1 * fn12 * 
+                                        (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                    CEconj5 = -fbp->p_cot1 * fn12 * 
+                                        (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                    //CEconj4 = -fbp->p_cot1 * fn12 * 
+                                    //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
+                                    //CEconj5 = -fbp->p_cot1 * fn12 * 
+                                    //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
+                                    CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
+                                        cos_omega * sin_ijk * sin_jkl;
+                                    /* end 4-body conjugation energy */
+
+                                    //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
+                                    //   workspace->orig_id[i], workspace->orig_id[j],
+                                    //       workspace->orig_id[k], workspace->orig_id[l], 
+                                    //    omega, cos_omega, cos2omega, cos3omega );
+                                    //fprintf(stdout, 
+                                    //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                    //    CEtors2, CEtors3, CEtors4, CEtors5, 
+                                    //    CEtors6, CEtors7, CEtors8, CEtors9 );
+                                    //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                    //    theta_ijk, theta_jkl, sin_ijk, 
+                                    //    sin_jkl, cos_jkl, tan_jkl_i );
+
+                                    /* forces */
+                                    bo_jk->Cdbopi += CEtors2;
+                                    workspace->CdDelta[j] += CEtors3;
+                                    workspace->CdDelta[k] += CEtors3;
+                                    bo_ij->Cdbo += (CEtors4 + CEconj1);
+                                    bo_jk->Cdbo += (CEtors5 + CEconj2);
+
+                                    bo_kl->Cdbo += (CEtors6 + CEconj3);
+
+                                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                        /* dcos_theta_ijk */
+                                        rvec_ScaledAdd( system->atoms[i].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_di );
+
+                                        /* dcos_theta_jkl */
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_di );
+                                        rvec_ScaledAdd( system->atoms[k].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                        rvec_ScaledAdd( system->atoms[l].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_dk );
+
+                                        /* dcos_omega */
+                                        rvec_ScaledAdd( system->atoms[i].f, 
+                                                CEtors9 + CEconj6, dcos_omega_di );
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dj );
+                                        rvec_ScaledAdd( system->atoms[k].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dk );
+                                        rvec_ScaledAdd( system->atoms[l].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dl );
+                                    }
+                                    else {
+                                        ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
+
+                                        /* dcos_theta_ijk */
+                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                        rvec_Add( system->atoms[i].f, force );
+                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors7 + CEconj4, p_ijk->dcos_dj );
+
+                                        rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                        rvec_Add( system->atoms[k].f, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+
+                                        /* dcos_theta_jkl */
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors8 + CEconj5, p_jkl->dcos_di );
+
+                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                        rvec_Add( system->atoms[k].f, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+                                        rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                        rvec_Add( system->atoms[l].f, force );
+                                        rvec_iMultiply( ext_press, rel_box_jl, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+
+                                        /* dcos_omega */                      
+                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
+                                        rvec_Add( system->atoms[i].f, force );
+                                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+                                        rvec_ScaledAdd( system->atoms[j].f, 
+                                                CEtors9 + CEconj6, dcos_omega_dj );
+
+                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
+                                        rvec_Add( system->atoms[k].f, force );
+                                        rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+                                        rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
+                                        rvec_Add( system->atoms[l].f, force );
+                                        rvec_iMultiply( ext_press, rel_box_jl, force );
+                                        rvec_Add( data->ext_press, ext_press );
+
+
+                                        /* This part is intended for a fully-flexible box */
+                                        /* rvec_ScaledSum( temp_rvec, 
+                                           CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
+                                           CEtors9 + CEconj6, dcos_omega_di );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[i].x );
+                                           rtensor_Copy( total_rtensor, temp_rtensor );
+
+                                           rvec_ScaledSum( temp_rvec, 
+                                           CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
+                                           CEtors8 + CEconj5, p_jkl->dcos_di );
+                                           rvec_ScaledAdd( temp_rvec, 
+                                           CEtors9 + CEconj6, dcos_omega_dj );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[j].x );
+                                           rtensor_Add( total_rtensor, temp_rtensor );
+
+                                           rvec_ScaledSum( temp_rvec, 
+                                           CEtors7 + CEconj4, p_ijk->dcos_di,      // k
+                                           CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                           rvec_ScaledAdd( temp_rvec, 
+                                           CEtors9 + CEconj6, dcos_omega_dk );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[k].x );
+                                           rtensor_Add( total_rtensor, temp_rtensor );
+
+                                           rvec_ScaledSum( temp_rvec, 
+                                           CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
+                                           CEtors9 + CEconj6, dcos_omega_dl );
+                                           rvec_OuterProduct( temp_rtensor, 
+                                           temp_rvec, system->atoms[l].x );
+                                           rtensor_Copy( total_rtensor, temp_rtensor );
+
+                                           if( pbond_ij->imaginary || pbond_jk->imaginary || 
+                                           pbond_kl->imaginary )
+                                           rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
+                                           else
+                                           rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                    }
 
 #ifdef TEST_ENERGY
-									/*fprintf( out_control->etor, 
-									//"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-									//r_ij, r_jk, r_kl, 
-									"%12.8f%12.8f%12.8f%12.8f\n",
-									cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
-									// fprintf( out_control->etor, "%12.8f\n", dfn11 );
-									fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
-											fn10, cos_omega, CV );
-
-									fprintf( out_control->etor, 
-											"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-											CEtors2, CEtors3, CEtors4, CEtors5, 
-											CEtors6, CEtors7, CEtors8, CEtors9 );
-
-									/* fprintf( out_control->etor, 
-									   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-									   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
-
-									fprintf( out_control->etor, 
-											"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-											CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
-									/* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
-									   fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
-
-									fprintf( out_control->etor, 
-											//"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
-											"%6d%6d%6d%6d%12.8f%12.8f\n", 
-											workspace->orig_id[i], workspace->orig_id[j], 
-											workspace->orig_id[k], workspace->orig_id[l], 
-											e_tor, e_con );
-									//RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
-
-									fprintf( out_control->econ, 
-											"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-											workspace->orig_id[i], workspace->orig_id[j], 
-											workspace->orig_id[k], workspace->orig_id[l], 
-											RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-											e_con,data->E_Con );
-
-									/* fprintf( out_control->etor, 
-									   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",	   
-									   (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
-									   (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
-									   (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
-									   (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
-									   (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
-									   (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
-									   (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
-									   (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
-									   (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
-
-
-									/* fprintf( out_control->etor, 
-									   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-									   (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
-									   (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
-
-									fprintf( out_control->etor, 
-											"%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-											dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
-											dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
-											dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
-											dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
+                                    /*fprintf( out_control->etor, 
+                                    //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                    //r_ij, r_jk, r_kl, 
+                                    "%12.8f%12.8f%12.8f%12.8f\n",
+                                    cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
+                                    // fprintf( out_control->etor, "%12.8f\n", dfn11 );
+                                    fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
+                                            fn10, cos_omega, CV );
+
+                                    fprintf( out_control->etor, 
+                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                            CEtors2, CEtors3, CEtors4, CEtors5, 
+                                            CEtors6, CEtors7, CEtors8, CEtors9 );
+
+                                    /* fprintf( out_control->etor, 
+                                       "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                       htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
+
+                                    fprintf( out_control->etor, 
+                                            "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                            CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
+                                    /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
+                                       fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
+
+                                    fprintf( out_control->etor, 
+                                            //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
+                                            "%6d%6d%6d%6d%12.8f%12.8f\n", 
+                                            workspace->orig_id[i], workspace->orig_id[j], 
+                                            workspace->orig_id[k], workspace->orig_id[l], 
+                                            e_tor, e_con );
+                                    //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
+
+                                    fprintf( out_control->econ, 
+                                            "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
+                                            workspace->orig_id[i], workspace->orig_id[j], 
+                                            workspace->orig_id[k], workspace->orig_id[l], 
+                                            RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
+                                            e_con,data->E_Con );
+
+                                    /* fprintf( out_control->etor, 
+                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",       
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
+                                       (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
+
+
+                                    /* fprintf( out_control->etor, 
+                                       "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
+                                       (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
+
+                                    fprintf( out_control->etor, 
+                                            "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                            dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
+                                            dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
+                                            dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
+                                            dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
 #endif
 
 #ifdef TEST_FORCES
-									// Torsion Forces 
-									Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
-											workspace->f_tor, workspace->f_tor);
-									Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
-									Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
-									Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
-									Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
-									Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
-
-									rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
-									rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
-									rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
-
-									rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
-									rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
-									rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
-
-									rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
-									rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
-									rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
-									rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
-
-									// Conjugation Forces 
-									Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
-									Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
-									Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
-
-									rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
-									rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
-									rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
-
-									rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
-									rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
-									rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
-
-									rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
-									rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
-									rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
-									rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
+                                    // Torsion Forces 
+                                    Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
+                                            workspace->f_tor, workspace->f_tor);
+                                    Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
+                                    Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
+                                    Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
+                                    Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
+                                    Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
+
+                                    rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
+                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
+
+                                    rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
+                                    rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
+
+                                    rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
+                                    rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
+                                    rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
+                                    rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
+
+                                    // Conjugation Forces 
+                                    Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
+                                    Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
+                                    Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
+
+                                    rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
+                                    rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
+
+                                    rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
+                                    rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
+                                    rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
+
+                                    rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
+                                    rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
+                                    rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
+                                    rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
 #endif
-								} // pl check ends
-							} // pl loop ends
-						} // pi check ends
-					} // pi loop ends
-				} // k-j neighbor check ends
-			} // j<k && j-k neighbor check ends
-		} // pk loop ends
-	} // j loop
-
-	/* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
-	   data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+                                } // pl check ends
+                            } // pl loop ends
+                        } // pi check ends
+                    } // pi loop ends
+                } // k-j neighbor check ends
+            } // j<k && j-k neighbor check ends
+        } // pk loop ends
+    } // j loop
+
+    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
 
 #ifdef TEST_FORCES
-	fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
-	fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
-			data->E_Tor, data->E_Con );
+    fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
+    fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
+            data->E_Tor, data->E_Con );
 #endif
 }
 
@@ -671,692 +671,692 @@ void Four_Body_Interactions( reax_system *system, control_params *control,
 ////////////////////////////////////////////////////////////////////////
 
 GLOBAL void Four_Body_Interactions ( reax_atom *atoms, 
-		global_parameters g_params,
-		four_body_header *d_fbp,
-		control_params *control,
-		list p_bonds, list p_thb_intrs,
-		simulation_box *box,
-		simulation_data *data,
-		static_storage p_workspace,
-		int N, int num_atom_types, 
-		real *E_Tor, real *E_Con, rvec *aux_ext_press)
+        global_parameters g_params,
+        four_body_header *d_fbp,
+        control_params *control,
+        list p_bonds, list p_thb_intrs,
+        simulation_box *box,
+        simulation_data *data,
+        static_storage p_workspace,
+        int N, int num_atom_types, 
+        real *E_Tor, real *E_Con, rvec *aux_ext_press)
 {
-	/*
-	   extern __shared__ real _tor[];
-	   extern __shared__ real _con [];
-	   extern __shared__ rvec _press[];
-	   real *sh_tor, *sh_con; rvec *sh_press;
-	 */
-
-	int i, j, k, l, pi, pj, pk, pl, pij, plk;
-	int type_i, type_j, type_k, type_l;
-	int start_j, end_j, start_k, end_k;
-	int start_pj, end_pj, start_pk, end_pk;
-	int num_frb_intrs = 0;
-
-	real Delta_j, Delta_k;
-	real r_ij, r_jk, r_kl, r_li;
-	real BOA_ij, BOA_jk, BOA_kl;
-
-	real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
-	real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
-	real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
-	real fn10, f11_DjDk, dfn11, fn12;
-
-	real theta_ijk, theta_jkl;
-	real sin_ijk, sin_jkl;
-	real cos_ijk, cos_jkl;
-	real tan_ijk_i, tan_jkl_i;
-
-	real omega, cos_omega, cos2omega, cos3omega;
-	rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
-
-	real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
-	real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
-	real Cconj, CEconj1, CEconj2, CEconj3;
-	real CEconj4, CEconj5, CEconj6;
-
-	real e_tor, e_con;
-	rvec dvec_li;
-	rvec force, ext_press;
-	ivec rel_box_jl;
-	// rtensor total_rtensor, temp_rtensor;
-
-	four_body_header *fbh;
-	four_body_parameters *fbp;
-	bond_data *pbond_ij, *pbond_jk, *pbond_kl;
-	bond_order_data *bo_ij, *bo_jk, *bo_kl;
-	three_body_interaction_data *p_ijk, *p_jkl;
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= N) return;
-	//    j = blockIdx.x;
-
-	real p_tor2 = g_params.l[23];
-	real p_tor3 = g_params.l[24];
-	real p_tor4 = g_params.l[25];
-	real p_cot2 = g_params.l[27];
-
-	list *bonds = &p_bonds;
-	list *thb_intrs = &p_thb_intrs;
-	static_storage *workspace = &p_workspace;
-
-
-	//for( j = 0; j < system->N; ++j ) {
-	type_j = atoms[j].type;
-	Delta_j = workspace->Delta_boc[j];
-	start_j = Start_Index(j, bonds);
-	end_j = End_Index(j, bonds);
-
-	/*
-	   sh_tor = _tor;
-	   sh_con = sh_tor + blockDim.x;
-	   sh_press = (rvec *) (sh_tor + 2*blockDim.x);
-
-	   sh_tor[threadIdx.x] = 0;
-	   sh_con [threadIdx.x] = 0;
-	   rvec_MakeZero (sh_press [threadIdx.x] );
-	   pk = threadIdx.x + start_j;
-	 */
-
-	E_Tor [j] = 0;
-	E_Con [j] = 0;
-	rvec_MakeZero (aux_ext_press [j]);
-
-
-	for( pk = start_j; pk < end_j; ++pk ) 
-		//while (pk < end_j)
-	{
-		pbond_jk = &( bonds->select.bond_list[pk] );
-		k = pbond_jk->nbr;
-		bo_jk = &( pbond_jk->bo_data );
-		BOA_jk = bo_jk->BO - control->thb_cut;
-
-		/* see if there are any 3-body interactions involving j&k
-		   where j is the central atom. Otherwise there is no point in
-		   trying to form a 4-body interaction out of this neighborhood */	
-		if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
-				Num_Entries(pk, thb_intrs) ) {
-			start_k = Start_Index(k, bonds);
-			end_k = End_Index(k, bonds);	    	       
-			pj = pbond_jk->sym_index; // pj points to j on k's list
-
-			/* do the same check as above: are there any 3-body interactions 
-			   involving k&j where k is the central atom */
-			if( Num_Entries(pj, thb_intrs) ) {
-				type_k = atoms[k].type;
-				Delta_k = workspace->Delta_boc[k];
-				r_jk = pbond_jk->d;
-
-				start_pk = Start_Index(pk, thb_intrs );
-				end_pk = End_Index(pk, thb_intrs );
-				start_pj = Start_Index(pj, thb_intrs );
-				end_pj = End_Index(pj, thb_intrs );		
-
-				exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
-				exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
-				exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
-				exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
-				exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
-				f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
-
-
-				/* pick i up from j-k interaction where j is the centre atom */
-				for( pi = start_pk; pi < end_pk; ++pi ) {
-					p_ijk = &( thb_intrs->select.three_body_list[pi] );
-					pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
-					pbond_ij = &( bonds->select.bond_list[pij] );
-					bo_ij = &( pbond_ij->bo_data );
-
-
-					if( bo_ij->BO > control->thb_cut/*0*/ ) {
-						i = p_ijk->thb;
-						type_i = atoms[i].type;
-						r_ij = pbond_ij->d;
-						BOA_ij = bo_ij->BO - control->thb_cut;
-
-						theta_ijk = p_ijk->theta;
-						sin_ijk = SIN( theta_ijk );
-						cos_ijk = COS( theta_ijk );
-						//tan_ijk_i = 1. / TAN( theta_ijk );
-						if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
-							tan_ijk_i = cos_ijk / MIN_SINE;
-						else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
-							tan_ijk_i = cos_ijk / -MIN_SINE;
-						else tan_ijk_i = cos_ijk / sin_ijk;
-
-						exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
-						exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
-
-						/* pick l up from j-k intr. where k is the centre */
-						for( pl = start_pj; pl < end_pj; ++pl ) {
-							p_jkl = &( thb_intrs->select.three_body_list[pl] );
-							l = p_jkl->thb;
-							plk = p_jkl->pthb; //pointer to l on k's bond_list!
-							pbond_kl = &( bonds->select.bond_list[plk] );
-							bo_kl = &( pbond_kl->bo_data );
-							type_l = atoms[l].type;
-							fbh = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types) ]);
-							fbp = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
-
-							if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
-									bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
-								++num_frb_intrs;
-								r_kl = pbond_kl->d;
-								BOA_kl = bo_kl->BO - control->thb_cut;
-
-								theta_jkl = p_jkl->theta;
-								sin_jkl = SIN( theta_jkl );
-								cos_jkl = COS( theta_jkl );
-								//tan_jkl_i = 1. / TAN( theta_jkl );
-								if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
-									tan_jkl_i = cos_jkl / MIN_SINE;
-								else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
-									tan_jkl_i = cos_jkl / -MIN_SINE;
-								else tan_jkl_i = cos_jkl /sin_jkl;
-
-								Sq_Distance_on_T3( atoms[l].x, atoms[i].x, 
-										box, dvec_li );
-								r_li = rvec_Norm( dvec_li );
-
-
-								/* omega and its derivative */
-								//cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
-								omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
-										r_jk, pbond_kl->dvec, r_kl,
-										dvec_li, r_li, p_ijk, p_jkl,
-										dcos_omega_di, dcos_omega_dj,
-										dcos_omega_dk, dcos_omega_dl,
-										NULL); //TODO *check*
-								cos_omega = COS( omega );
-								cos2omega = COS( 2. * omega );
-								cos3omega = COS( 3. * omega );
-								/* end omega calculations */
-
-								/* torsion energy */
-								exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
-								exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
-								exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
-								fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
-									(1.0 - exp_tor2_kl);
-
-								CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
-										fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
-										fbp->V3 * (1.0 + cos3omega) );
-								//CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
-								//  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
-								//  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
-
-								//PERFORMANCE IMPACT
-								e_tor = fn10 * sin_ijk * sin_jkl * CV;
-								//atomicAdd (&data->E_Tor ,e_tor );
-								E_Tor [j] += e_tor;
-								//sh_tor [threadIdx.x] += e_tor;
-
-								dfn11 = (-p_tor3 * exp_tor3_DjDk +
-										(p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
-										(2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
-
-								CEtors1 = sin_ijk * sin_jkl * CV;
-
-								CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
-									(2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
-									sin_ijk * sin_jkl; 
-
-								CEtors3 = CEtors2 * dfn11;
-
-								CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
-									(1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
-
-								CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
-									(1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
-
-								CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
-									(1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
-
-								cmn = -fn10 * CV;
-								CEtors7 = cmn * sin_jkl * tan_ijk_i;
-								CEtors8 = cmn * sin_ijk * tan_jkl_i;
-								CEtors9 = fn10 * sin_ijk * sin_jkl * 
-									(0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-									 1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
-								//cmn = -fn10 * CV;
-								//CEtors7 = cmn * sin_jkl * cos_ijk;
-								//CEtors8 = cmn * sin_ijk * cos_jkl;
-								//CEtors9 = fn10 * sin_ijk * sin_jkl * 
-								//  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
-								//   fbp->V3 * (6*SQR(cos_omega) - 1.50));
-								/* end  of torsion energy */
-
-
-								/* 4-body conjugation energy */
-								fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
-								//PERFORMANCE IMPACT
-								e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-								//atomicAdd (&data->E_Con ,e_con );
-								E_Con [j] += e_con ;
-								//sh_con [threadIdx.x] += e_con;
-
-								Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
-									(1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
-
-								CEconj1 = Cconj * (BOA_ij - 1.5e0);
-								CEconj2 = Cconj * (BOA_jk - 1.5e0);
-								CEconj3 = Cconj * (BOA_kl - 1.5e0);
-
-								CEconj4 = -fbp->p_cot1 * fn12 * 
-									(SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
-								CEconj5 = -fbp->p_cot1 * fn12 * 
-									(SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
-								//CEconj4 = -fbp->p_cot1 * fn12 * 
-								//  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
-								//CEconj5 = -fbp->p_cot1 * fn12 * 
-								//  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
-								CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
-									cos_omega * sin_ijk * sin_jkl;
-								/* end 4-body conjugation energy */
-
-								//fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
-								//   workspace->orig_id[i], workspace->orig_id[j],
-								//       workspace->orig_id[k], workspace->orig_id[l], 
-								//    omega, cos_omega, cos2omega, cos3omega );
-								//fprintf(stdout, 
-								//    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								//    CEtors2, CEtors3, CEtors4, CEtors5, 
-								//    CEtors6, CEtors7, CEtors8, CEtors9 );
-								//fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								//    theta_ijk, theta_jkl, sin_ijk, 
-								//    sin_jkl, cos_jkl, tan_jkl_i );
-
-								/* forces */
-								//PERFORMANCE IMPACT
-								/*
-								   atomicAdd ( &bo_jk->Cdbopi, CEtors2 );
-								   atomicAdd ( &workspace->CdDelta[j], CEtors3 );
-								   atomicAdd ( &workspace->CdDelta[k], CEtors3 );
-								   atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) );
-								   atomicAdd ( &bo_jk->Cdbo, (CEtors5 + CEconj2) );
-								   atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) );
-								 */
-
-								//PERFORMANCE IMPACT
-								bo_jk->Cdbopi += CEtors2;
-								workspace->CdDelta[j] += CEtors3;
-								pbond_jk->CdDelta_jk += CEtors3;
-								bo_ij->Cdbo += CEtors4 + CEconj1;
-								bo_jk->Cdbo += CEtors5 + CEconj2;
-
-								//TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
-								atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 );
-								//TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
-
-								if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) {
-									/* dcos_theta_ijk */
-									//PERFORMANCE IMPACT
-									atomic_rvecScaledAdd (pbond_ij->i_f, 
-											CEtors7 + CEconj4, p_ijk->dcos_dk );
-									rvec_ScaledAdd( atoms[j].f, 
-											CEtors7 + CEconj4, p_ijk->dcos_dj );
-									atomic_rvecScaledAdd( pbond_jk->k_f, 
-											CEtors7 + CEconj4, p_ijk->dcos_di );
-
-
-									/* dcos_theta_jkl */
-									//PERFORMANCE IMPACT
-									rvec_ScaledAdd( atoms[j].f, 
-											CEtors8 + CEconj5, p_jkl->dcos_di );
-									atomic_rvecScaledAdd( pbond_jk->i_f, 
-											CEtors8 + CEconj5, p_jkl->dcos_dj );
-									atomic_rvecScaledAdd( pbond_kl->k_f, 
-											CEtors8 + CEconj5, p_jkl->dcos_dk );
-
-									/* dcos_omega */
-									//PERFORMANCE IMPACT
-									atomic_rvecScaledAdd( pbond_ij->i_f, 
-											CEtors9 + CEconj6, dcos_omega_di );
-									rvec_ScaledAdd( atoms[j].f, 
-											CEtors9 + CEconj6, dcos_omega_dj );
-									atomic_rvecScaledAdd( pbond_jk->i_f, 
-											CEtors9 + CEconj6, dcos_omega_dk );
-									atomic_rvecScaledAdd( pbond_kl->k_f, 
-											CEtors9 + CEconj6, dcos_omega_dl );
-								}
-								else {
-									ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
-
-									/* dcos_theta_ijk */
-									rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_ij->i_f, force );
-									rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-									//PERFORMANCE IMPACT
-									rvec_ScaledAdd( atoms[j].f, 
-											CEtors7 + CEconj4, p_ijk->dcos_dj );
-
-									rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_jk->i_f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									//PERFORMANCE IMPACT
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-
-									/* dcos_theta_jkl */
-									//PERFORMANCE IMPACT
-									rvec_ScaledAdd( atoms[j].f, 
-											CEtors8 + CEconj5, p_jkl->dcos_di );
-
-									rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_jk->i_f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-									rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_kl->k_f, force );
-									rvec_iMultiply( ext_press, rel_box_jl, force );
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-
-									/* dcos_omega */				      
-									rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_ij->i_f, force );
-									rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-									//PERFORMANCE IMPACT
-									rvec_ScaledAdd( atoms[j].f, 
-											CEtors9 + CEconj6, dcos_omega_dj );
-
-									rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_jk->i_f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-									rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
-									//PERFORMANCE IMPACT
-									atomic_rvecAdd( pbond_kl->k_f, force );
-									rvec_iMultiply( ext_press, rel_box_jl, force );
-									rvec_Add( aux_ext_press [j], ext_press );
-									//rvec_Add (sh_press [threadIdx.x], ext_press);
-
-
-									/* This part is intended for a fully-flexible box */
-									/* rvec_ScaledSum( temp_rvec, 
-									   CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
-									   CEtors9 + CEconj6, dcos_omega_di );
-									   rvec_OuterProduct( temp_rtensor, 
-									   temp_rvec, system->atoms[i].x );
-									   rtensor_Copy( total_rtensor, temp_rtensor );
-
-									   rvec_ScaledSum( temp_rvec, 
-									   CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
-									   CEtors8 + CEconj5, p_jkl->dcos_di );
-									   rvec_ScaledAdd( temp_rvec, 
-									   CEtors9 + CEconj6, dcos_omega_dj );
-									   rvec_OuterProduct( temp_rtensor, 
-									   temp_rvec, system->atoms[j].x );
-									   rtensor_Add( total_rtensor, temp_rtensor );
-
-									   rvec_ScaledSum( temp_rvec, 
-									   CEtors7 + CEconj4, p_ijk->dcos_di,      // k
-									   CEtors8 + CEconj5, p_jkl->dcos_dj );
-									   rvec_ScaledAdd( temp_rvec, 
-									   CEtors9 + CEconj6, dcos_omega_dk );
-									   rvec_OuterProduct( temp_rtensor, 
-									   temp_rvec, system->atoms[k].x );
-									   rtensor_Add( total_rtensor, temp_rtensor );
-
-									   rvec_ScaledSum( temp_rvec, 
-									   CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
-									   CEtors9 + CEconj6, dcos_omega_dl );
-									   rvec_OuterProduct( temp_rtensor, 
-									   temp_rvec, system->atoms[l].x );
-									   rtensor_Copy( total_rtensor, temp_rtensor );
-
-									   if( pbond_ij->imaginary || pbond_jk->imaginary || 
-									   pbond_kl->imaginary )
-									   rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
-									   else
-									   rtensor_Add( data->flex_bar.P, total_rtensor ); */
-								}
+    /*
+       extern __shared__ real _tor[];
+       extern __shared__ real _con [];
+       extern __shared__ rvec _press[];
+       real *sh_tor, *sh_con; rvec *sh_press;
+     */
+
+    int i, j, k, l, pi, pj, pk, pl, pij, plk;
+    int type_i, type_j, type_k, type_l;
+    int start_j, end_j, start_k, end_k;
+    int start_pj, end_pj, start_pk, end_pk;
+    int num_frb_intrs = 0;
+
+    real Delta_j, Delta_k;
+    real r_ij, r_jk, r_kl, r_li;
+    real BOA_ij, BOA_jk, BOA_kl;
+
+    real exp_tor2_ij, exp_tor2_jk, exp_tor2_kl;
+    real exp_tor1, exp_tor3_DjDk, exp_tor4_DjDk, exp_tor34_inv;
+    real exp_cot2_jk, exp_cot2_ij, exp_cot2_kl;
+    real fn10, f11_DjDk, dfn11, fn12;
+
+    real theta_ijk, theta_jkl;
+    real sin_ijk, sin_jkl;
+    real cos_ijk, cos_jkl;
+    real tan_ijk_i, tan_jkl_i;
+
+    real omega, cos_omega, cos2omega, cos3omega;
+    rvec dcos_omega_di, dcos_omega_dj, dcos_omega_dk, dcos_omega_dl;
+
+    real CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
+    real CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
+    real Cconj, CEconj1, CEconj2, CEconj3;
+    real CEconj4, CEconj5, CEconj6;
+
+    real e_tor, e_con;
+    rvec dvec_li;
+    rvec force, ext_press;
+    ivec rel_box_jl;
+    // rtensor total_rtensor, temp_rtensor;
+
+    four_body_header *fbh;
+    four_body_parameters *fbp;
+    bond_data *pbond_ij, *pbond_jk, *pbond_kl;
+    bond_order_data *bo_ij, *bo_jk, *bo_kl;
+    three_body_interaction_data *p_ijk, *p_jkl;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+    //    j = blockIdx.x;
+
+    real p_tor2 = g_params.l[23];
+    real p_tor3 = g_params.l[24];
+    real p_tor4 = g_params.l[25];
+    real p_cot2 = g_params.l[27];
+
+    list *bonds = &p_bonds;
+    list *thb_intrs = &p_thb_intrs;
+    static_storage *workspace = &p_workspace;
+
+
+    //for( j = 0; j < system->N; ++j ) {
+    type_j = atoms[j].type;
+    Delta_j = workspace->Delta_boc[j];
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+
+    /*
+       sh_tor = _tor;
+       sh_con = sh_tor + blockDim.x;
+       sh_press = (rvec *) (sh_tor + 2*blockDim.x);
+
+       sh_tor[threadIdx.x] = 0;
+       sh_con [threadIdx.x] = 0;
+       rvec_MakeZero (sh_press [threadIdx.x] );
+       pk = threadIdx.x + start_j;
+     */
+
+    E_Tor [j] = 0;
+    E_Con [j] = 0;
+    rvec_MakeZero (aux_ext_press [j]);
+
+
+    for( pk = start_j; pk < end_j; ++pk ) 
+        //while (pk < end_j)
+    {
+        pbond_jk = &( bonds->select.bond_list[pk] );
+        k = pbond_jk->nbr;
+        bo_jk = &( pbond_jk->bo_data );
+        BOA_jk = bo_jk->BO - control->thb_cut;
+
+        /* see if there are any 3-body interactions involving j&k
+           where j is the central atom. Otherwise there is no point in
+           trying to form a 4-body interaction out of this neighborhood */    
+        if( j < k && bo_jk->BO > control->thb_cut/*0*/ && 
+                Num_Entries(pk, thb_intrs) ) {
+            start_k = Start_Index(k, bonds);
+            end_k = End_Index(k, bonds);                   
+            pj = pbond_jk->sym_index; // pj points to j on k's list
+
+            /* do the same check as above: are there any 3-body interactions 
+               involving k&j where k is the central atom */
+            if( Num_Entries(pj, thb_intrs) ) {
+                type_k = atoms[k].type;
+                Delta_k = workspace->Delta_boc[k];
+                r_jk = pbond_jk->d;
+
+                start_pk = Start_Index(pk, thb_intrs );
+                end_pk = End_Index(pk, thb_intrs );
+                start_pj = Start_Index(pj, thb_intrs );
+                end_pj = End_Index(pj, thb_intrs );        
+
+                exp_tor2_jk = EXP( -p_tor2 * BOA_jk );
+                exp_cot2_jk = EXP( -p_cot2 * SQR(BOA_jk - 1.5) );
+                exp_tor3_DjDk = EXP( -p_tor3 * (Delta_j + Delta_k) );
+                exp_tor4_DjDk = EXP( p_tor4  * (Delta_j + Delta_k) );
+                exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DjDk + exp_tor4_DjDk);
+                f11_DjDk = (2.0 + exp_tor3_DjDk) * exp_tor34_inv;
+
+
+                /* pick i up from j-k interaction where j is the centre atom */
+                for( pi = start_pk; pi < end_pk; ++pi ) {
+                    p_ijk = &( thb_intrs->select.three_body_list[pi] );
+                    pij = p_ijk->pthb; // pij is pointer to i on j's bond_list
+                    pbond_ij = &( bonds->select.bond_list[pij] );
+                    bo_ij = &( pbond_ij->bo_data );
+
+
+                    if( bo_ij->BO > control->thb_cut/*0*/ ) {
+                        i = p_ijk->thb;
+                        type_i = atoms[i].type;
+                        r_ij = pbond_ij->d;
+                        BOA_ij = bo_ij->BO - control->thb_cut;
+
+                        theta_ijk = p_ijk->theta;
+                        sin_ijk = SIN( theta_ijk );
+                        cos_ijk = COS( theta_ijk );
+                        //tan_ijk_i = 1. / TAN( theta_ijk );
+                        if( sin_ijk >= 0 && sin_ijk <= MIN_SINE ) 
+                            tan_ijk_i = cos_ijk / MIN_SINE;
+                        else if( sin_ijk <= 0 && sin_ijk >= -MIN_SINE ) 
+                            tan_ijk_i = cos_ijk / -MIN_SINE;
+                        else tan_ijk_i = cos_ijk / sin_ijk;
+
+                        exp_tor2_ij = EXP( -p_tor2 * BOA_ij );
+                        exp_cot2_ij = EXP( -p_cot2 * SQR(BOA_ij -1.5) );
+
+                        /* pick l up from j-k intr. where k is the centre */
+                        for( pl = start_pj; pl < end_pj; ++pl ) {
+                            p_jkl = &( thb_intrs->select.three_body_list[pl] );
+                            l = p_jkl->thb;
+                            plk = p_jkl->pthb; //pointer to l on k's bond_list!
+                            pbond_kl = &( bonds->select.bond_list[plk] );
+                            bo_kl = &( pbond_kl->bo_data );
+                            type_l = atoms[l].type;
+                            fbh = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types) ]);
+                            fbp = &(d_fbp[ index_fbp (type_i,type_j,type_k,type_l,num_atom_types)].prm[0]);
+
+                            if( i != l && fbh->cnt && bo_kl->BO > control->thb_cut/*0*/ &&
+                                    bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/ ){
+                                ++num_frb_intrs;
+                                r_kl = pbond_kl->d;
+                                BOA_kl = bo_kl->BO - control->thb_cut;
+
+                                theta_jkl = p_jkl->theta;
+                                sin_jkl = SIN( theta_jkl );
+                                cos_jkl = COS( theta_jkl );
+                                //tan_jkl_i = 1. / TAN( theta_jkl );
+                                if( sin_jkl >= 0 && sin_jkl <= MIN_SINE ) 
+                                    tan_jkl_i = cos_jkl / MIN_SINE;
+                                else if( sin_jkl <= 0 && sin_jkl >= -MIN_SINE ) 
+                                    tan_jkl_i = cos_jkl / -MIN_SINE;
+                                else tan_jkl_i = cos_jkl /sin_jkl;
+
+                                Sq_Distance_on_T3( atoms[l].x, atoms[i].x, 
+                                        box, dvec_li );
+                                r_li = rvec_Norm( dvec_li );
+
+
+                                /* omega and its derivative */
+                                //cos_omega=Calculate_Omega(pbond_ij->dvec,r_ij,pbond_jk->dvec, 
+                                omega = Calculate_Omega(pbond_ij->dvec, r_ij, pbond_jk->dvec, 
+                                        r_jk, pbond_kl->dvec, r_kl,
+                                        dvec_li, r_li, p_ijk, p_jkl,
+                                        dcos_omega_di, dcos_omega_dj,
+                                        dcos_omega_dk, dcos_omega_dl,
+                                        NULL); //TODO *check*
+                                cos_omega = COS( omega );
+                                cos2omega = COS( 2. * omega );
+                                cos3omega = COS( 3. * omega );
+                                /* end omega calculations */
+
+                                /* torsion energy */
+                                exp_tor1 = EXP(fbp->p_tor1 * SQR(2.-bo_jk->BO_pi-f11_DjDk));
+                                exp_tor2_kl = EXP( -p_tor2 * BOA_kl );
+                                exp_cot2_kl = EXP( -p_cot2 * SQR(BOA_kl-1.5) );
+                                fn10 = (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk) * 
+                                    (1.0 - exp_tor2_kl);
+
+                                CV = 0.5 * ( fbp->V1 * (1.0 + cos_omega) + 
+                                        fbp->V2 * exp_tor1 * (1.0 - cos2omega) +
+                                        fbp->V3 * (1.0 + cos3omega) );
+                                //CV = 0.5 * fbp->V1 * (1.0 + cos_omega) + 
+                                //  fbp->V2 * exp_tor1 * (1.0 - SQR(cos_omega)) +
+                                //  fbp->V3 * (0.5 + 2.0*CUBE(cos_omega) - 1.5 * cos_omega);
+
+                                //PERFORMANCE IMPACT
+                                e_tor = fn10 * sin_ijk * sin_jkl * CV;
+                                //atomicAdd (&data->E_Tor ,e_tor );
+                                E_Tor [j] += e_tor;
+                                //sh_tor [threadIdx.x] += e_tor;
+
+                                dfn11 = (-p_tor3 * exp_tor3_DjDk +
+                                        (p_tor3 * exp_tor3_DjDk - p_tor4 * exp_tor4_DjDk) *
+                                        (2.+exp_tor3_DjDk) * exp_tor34_inv) * exp_tor34_inv;
+
+                                CEtors1 = sin_ijk * sin_jkl * CV;
+
+                                CEtors2 = -fn10 * 2.0 * fbp->p_tor1 * fbp->V2 * exp_tor1 * 
+                                    (2.0 - bo_jk->BO_pi - f11_DjDk) * (1.0 - SQR(cos_omega)) * 
+                                    sin_ijk * sin_jkl; 
+
+                                CEtors3 = CEtors2 * dfn11;
+
+                                CEtors4 = CEtors1 * p_tor2 * exp_tor2_ij * 
+                                    (1.0 - exp_tor2_jk) * (1.0 - exp_tor2_kl);
+
+                                CEtors5 = CEtors1 * p_tor2 * exp_tor2_jk * 
+                                    (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_kl);
+
+                                CEtors6 = CEtors1 * p_tor2 * exp_tor2_kl *
+                                    (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jk);
+
+                                cmn = -fn10 * CV;
+                                CEtors7 = cmn * sin_jkl * tan_ijk_i;
+                                CEtors8 = cmn * sin_ijk * tan_jkl_i;
+                                CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                    (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                     1.5 * fbp->V3 * (cos2omega + 2. * SQR(cos_omega)));
+                                //cmn = -fn10 * CV;
+                                //CEtors7 = cmn * sin_jkl * cos_ijk;
+                                //CEtors8 = cmn * sin_ijk * cos_jkl;
+                                //CEtors9 = fn10 * sin_ijk * sin_jkl * 
+                                //  (0.5 * fbp->V1 - 2.0 * fbp->V2 * exp_tor1 * cos_omega +
+                                //   fbp->V3 * (6*SQR(cos_omega) - 1.50));
+                                /* end  of torsion energy */
+
+
+                                /* 4-body conjugation energy */
+                                fn12 = exp_cot2_ij * exp_cot2_jk * exp_cot2_kl;
+                                //PERFORMANCE IMPACT
+                                e_con = fbp->p_cot1 * fn12 * (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+                                //atomicAdd (&data->E_Con ,e_con );
+                                E_Con [j] += e_con ;
+                                //sh_con [threadIdx.x] += e_con;
+
+                                Cconj = -2.0 * fn12 * fbp->p_cot1 * p_cot2 * 
+                                    (1. + (SQR(cos_omega)-1.) * sin_ijk*sin_jkl);
+
+                                CEconj1 = Cconj * (BOA_ij - 1.5e0);
+                                CEconj2 = Cconj * (BOA_jk - 1.5e0);
+                                CEconj3 = Cconj * (BOA_kl - 1.5e0);
+
+                                CEconj4 = -fbp->p_cot1 * fn12 * 
+                                    (SQR(cos_omega) - 1.0) * sin_jkl * tan_ijk_i;
+                                CEconj5 = -fbp->p_cot1 * fn12 * 
+                                    (SQR(cos_omega) - 1.0) * sin_ijk * tan_jkl_i;
+                                //CEconj4 = -fbp->p_cot1 * fn12 * 
+                                //  (SQR(cos_omega) - 1.0) * sin_jkl * cos_ijk;
+                                //CEconj5 = -fbp->p_cot1 * fn12 * 
+                                //  (SQR(cos_omega) - 1.0) * sin_ijk * cos_jkl;
+                                CEconj6 = 2.0 * fbp->p_cot1 * fn12 * 
+                                    cos_omega * sin_ijk * sin_jkl;
+                                /* end 4-body conjugation energy */
+
+                                //fprintf(stdout, "%6d %6d %6d %6d %7.3f %7.3f %7.3f %7.3f ",
+                                //   workspace->orig_id[i], workspace->orig_id[j],
+                                //       workspace->orig_id[k], workspace->orig_id[l], 
+                                //    omega, cos_omega, cos2omega, cos3omega );
+                                //fprintf(stdout, 
+                                //    "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                //    CEtors2, CEtors3, CEtors4, CEtors5, 
+                                //    CEtors6, CEtors7, CEtors8, CEtors9 );
+                                //fprintf(stdout, "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                //    theta_ijk, theta_jkl, sin_ijk, 
+                                //    sin_jkl, cos_jkl, tan_jkl_i );
+
+                                /* forces */
+                                //PERFORMANCE IMPACT
+                                /*
+                                   atomicAdd ( &bo_jk->Cdbopi, CEtors2 );
+                                   atomicAdd ( &workspace->CdDelta[j], CEtors3 );
+                                   atomicAdd ( &workspace->CdDelta[k], CEtors3 );
+                                   atomicAdd ( &bo_ij->Cdbo, (CEtors4 + CEconj1) );
+                                   atomicAdd ( &bo_jk->Cdbo, (CEtors5 + CEconj2) );
+                                   atomicAdd ( &bo_kl->Cdbo, (CEtors6 + CEconj3) );
+                                 */
+
+                                //PERFORMANCE IMPACT
+                                bo_jk->Cdbopi += CEtors2;
+                                workspace->CdDelta[j] += CEtors3;
+                                pbond_jk->CdDelta_jk += CEtors3;
+                                bo_ij->Cdbo += CEtors4 + CEconj1;
+                                bo_jk->Cdbo += CEtors5 + CEconj2;
+
+                                //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
+                                atomicAdd (&pbond_kl->Cdbo_kl, CEtors6 + CEconj3 );
+                                //TODO REMOVE THIS ATOMIC OPERATION IF POSSIBLE
+
+                                if( control->ensemble == NVE || control->ensemble == NVT ||control->ensemble == bNVT) {
+                                    /* dcos_theta_ijk */
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecScaledAdd (pbond_ij->i_f, 
+                                            CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    rvec_ScaledAdd( atoms[j].f, 
+                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
+                                    atomic_rvecScaledAdd( pbond_jk->k_f, 
+                                            CEtors7 + CEconj4, p_ijk->dcos_di );
+
+
+                                    /* dcos_theta_jkl */
+                                    //PERFORMANCE IMPACT
+                                    rvec_ScaledAdd( atoms[j].f, 
+                                            CEtors8 + CEconj5, p_jkl->dcos_di );
+                                    atomic_rvecScaledAdd( pbond_jk->i_f, 
+                                            CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    atomic_rvecScaledAdd( pbond_kl->k_f, 
+                                            CEtors8 + CEconj5, p_jkl->dcos_dk );
+
+                                    /* dcos_omega */
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecScaledAdd( pbond_ij->i_f, 
+                                            CEtors9 + CEconj6, dcos_omega_di );
+                                    rvec_ScaledAdd( atoms[j].f, 
+                                            CEtors9 + CEconj6, dcos_omega_dj );
+                                    atomic_rvecScaledAdd( pbond_jk->i_f, 
+                                            CEtors9 + CEconj6, dcos_omega_dk );
+                                    atomic_rvecScaledAdd( pbond_kl->k_f, 
+                                            CEtors9 + CEconj6, dcos_omega_dl );
+                                }
+                                else {
+                                    ivec_Sum(rel_box_jl, pbond_jk->rel_box, pbond_kl->rel_box);
+
+                                    /* dcos_theta_ijk */
+                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_dk );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_ij->i_f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+                                    //PERFORMANCE IMPACT
+                                    rvec_ScaledAdd( atoms[j].f, 
+                                            CEtors7 + CEconj4, p_ijk->dcos_dj );
+
+                                    rvec_Scale( force, CEtors7 + CEconj4, p_ijk->dcos_di );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_jk->i_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    //PERFORMANCE IMPACT
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+
+                                    /* dcos_theta_jkl */
+                                    //PERFORMANCE IMPACT
+                                    rvec_ScaledAdd( atoms[j].f, 
+                                            CEtors8 + CEconj5, p_jkl->dcos_di );
+
+                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_jk->i_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+                                    rvec_Scale( force, CEtors8 + CEconj5, p_jkl->dcos_dk );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_kl->k_f, force );
+                                    rvec_iMultiply( ext_press, rel_box_jl, force );
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+
+                                    /* dcos_omega */                      
+                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_di );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_ij->i_f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+                                    //PERFORMANCE IMPACT
+                                    rvec_ScaledAdd( atoms[j].f, 
+                                            CEtors9 + CEconj6, dcos_omega_dj );
+
+                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dk );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_jk->i_f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+                                    rvec_Scale( force, CEtors9 + CEconj6, dcos_omega_dl );
+                                    //PERFORMANCE IMPACT
+                                    atomic_rvecAdd( pbond_kl->k_f, force );
+                                    rvec_iMultiply( ext_press, rel_box_jl, force );
+                                    rvec_Add( aux_ext_press [j], ext_press );
+                                    //rvec_Add (sh_press [threadIdx.x], ext_press);
+
+
+                                    /* This part is intended for a fully-flexible box */
+                                    /* rvec_ScaledSum( temp_rvec, 
+                                       CEtors7 + CEconj4, p_ijk->dcos_dk,      // i     
+                                       CEtors9 + CEconj6, dcos_omega_di );
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       temp_rvec, system->atoms[i].x );
+                                       rtensor_Copy( total_rtensor, temp_rtensor );
+
+                                       rvec_ScaledSum( temp_rvec, 
+                                       CEtors7 + CEconj4, p_ijk->dcos_dj,      // j
+                                       CEtors8 + CEconj5, p_jkl->dcos_di );
+                                       rvec_ScaledAdd( temp_rvec, 
+                                       CEtors9 + CEconj6, dcos_omega_dj );
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       temp_rvec, system->atoms[j].x );
+                                       rtensor_Add( total_rtensor, temp_rtensor );
+
+                                       rvec_ScaledSum( temp_rvec, 
+                                       CEtors7 + CEconj4, p_ijk->dcos_di,      // k
+                                       CEtors8 + CEconj5, p_jkl->dcos_dj );
+                                       rvec_ScaledAdd( temp_rvec, 
+                                       CEtors9 + CEconj6, dcos_omega_dk );
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       temp_rvec, system->atoms[k].x );
+                                       rtensor_Add( total_rtensor, temp_rtensor );
+
+                                       rvec_ScaledSum( temp_rvec, 
+                                       CEtors8 + CEconj5, p_jkl->dcos_dk,      // l
+                                       CEtors9 + CEconj6, dcos_omega_dl );
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       temp_rvec, system->atoms[l].x );
+                                       rtensor_Copy( total_rtensor, temp_rtensor );
+
+                                       if( pbond_ij->imaginary || pbond_jk->imaginary || 
+                                       pbond_kl->imaginary )
+                                       rtensor_ScaledAdd( data->flex_bar.P, -1., total_rtensor );
+                                       else
+                                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                }
 
 #ifdef TEST_ENERGY
-								/*fprintf( out_control->etor, 
-								//"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								//r_ij, r_jk, r_kl, 
-								"%12.8f%12.8f%12.8f%12.8f\n",
-								cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
-								// fprintf( out_control->etor, "%12.8f\n", dfn11 );
-								/*
-								   fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
-								   fn10, cos_omega, CV );
-
-								   fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   CEtors2, CEtors3, CEtors4, CEtors5, 
-								   CEtors6, CEtors7, CEtors8, CEtors9 );
-								 */
-								//end
-
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
-
-								/*
-								   fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
-								   CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
-								 */
-								//end
-								/* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
-								   fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
-
-								/*
-
-								   fprintf( out_control->etor, 
-								//"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
-								"%6d%6d%6d%6d%12.8f%12.8f\n", 
-								workspace->orig_id[i], workspace->orig_id[j], 
-								workspace->orig_id[k], workspace->orig_id[l], 
-								e_tor, e_con );
-								//RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
-
-								fprintf( out_control->econ, 
-								"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-								workspace->orig_id[i], workspace->orig_id[j], 
-								workspace->orig_id[k], workspace->orig_id[l], 
-								RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
-								e_con,data->E_Con );
-								 */
-								//end
-
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",	   
-								   (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
-								   (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
-								   (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
-								   (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
-								   (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
-								   (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
-								   (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
-								   (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
-								   (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
-
-
-								/* fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-								   (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
-								   (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
-
-								/*
-								   fprintf( out_control->etor, 
-								   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
-								   dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
-								   dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
-								   dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
-								   dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
-								 */
-								//end
+                                /*fprintf( out_control->etor, 
+                                //"%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                //r_ij, r_jk, r_kl, 
+                                "%12.8f%12.8f%12.8f%12.8f\n",
+                                cos_ijk, cos_jkl, sin_ijk, sin_jkl );*/
+                                // fprintf( out_control->etor, "%12.8f\n", dfn11 );
+                                /*
+                                   fprintf( out_control->etor, "%12.8f%12.8f%12.8f\n", 
+                                   fn10, cos_omega, CV );
+
+                                   fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   CEtors2, CEtors3, CEtors4, CEtors5, 
+                                   CEtors6, CEtors7, CEtors8, CEtors9 );
+                                 */
+                                //end
+
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe ); */
+
+                                /*
+                                   fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f%12.8f%12.8f%12.8f\n",
+                                   CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6 );
+                                 */
+                                //end
+                                /* fprintf(out_control->etor,"%23.15e%23.15e%23.15e%23.15e\n",
+                                   fbp->V1, fbp->V2, fbp->V3, fbp->p_tor1 );*/
+
+                                /*
+
+                                   fprintf( out_control->etor, 
+                                //"%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e\n", 
+                                "%6d%6d%6d%6d%12.8f%12.8f\n", 
+                                workspace->orig_id[i], workspace->orig_id[j], 
+                                workspace->orig_id[k], workspace->orig_id[l], 
+                                e_tor, e_con );
+                                //RAD2DEG(omega), BOA_jk, e_tor, data->E_Tor );
+
+                                fprintf( out_control->econ, 
+                                "%6d%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
+                                workspace->orig_id[i], workspace->orig_id[j], 
+                                workspace->orig_id[k], workspace->orig_id[l], 
+                                RAD2DEG(omega), BOA_ij, BOA_jk, BOA_kl, 
+                                e_con,data->E_Con );
+                                 */
+                                //end
+
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",       
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_dk[0], 
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_dk[1], 
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_dk[2],
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_dj[0], 
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_dj[1], 
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_dj[2],
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_di[0], 
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_di[1], 
+                                   (CEtors7 + CEconj4)*p_ijk->dcos_di[2] ); */
+
+
+                                /* fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_di[0], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_di[1], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_di[2], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_dj[0], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_dj[1], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_dj[2], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_dk[0], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_dk[1], 
+                                   (CEtors8 + CEconj5)*p_jkl->dcos_dk[2] ); */
+
+                                /*
+                                   fprintf( out_control->etor, 
+                                   "%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n%12.8f%12.8f%12.8f\n",
+                                   dcos_omega_di[0], dcos_omega_di[1], dcos_omega_di[2], 
+                                   dcos_omega_dj[0], dcos_omega_dj[1], dcos_omega_dj[2], 
+                                   dcos_omega_dk[0], dcos_omega_dk[1], dcos_omega_dk[2],
+                                   dcos_omega_dl[0], dcos_omega_dl[1], dcos_omega_dl[2] );
+                                 */
+                                //end
 #endif
 
 #ifdef TEST_FORCES
-								/*
-								// Torsion Forces 
-								Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
-								workspace->f_tor, workspace->f_tor);
-								Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
-								Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
-								Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
-								Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
-								Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
-
-								rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
-								rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
-								rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
-
-								rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
-								rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
-								rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
-
-								rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
-								rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
-								rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
-								rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
-
-								// Conjugation Forces 
-								Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
-								Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
-								Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
-
-								rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
-								rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
-								rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
-
-								rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
-								rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
-								rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
-
-								rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
-								rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
-								rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
-								rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
-								 */
-								//end
+                                /*
+                                // Torsion Forces 
+                                Add_dBOpinpi2(system, lists, j, pk, CEtors2, 0., 
+                                workspace->f_tor, workspace->f_tor);
+                                Add_dDelta( system, lists, j, CEtors3, workspace->f_tor );
+                                Add_dDelta( system, lists, k, CEtors3, workspace->f_tor );
+                                Add_dBO( system, lists, j, pij, CEtors4, workspace->f_tor );
+                                Add_dBO( system, lists, j, pk, CEtors5, workspace->f_tor );
+                                Add_dBO( system, lists, k, plk, CEtors6, workspace->f_tor );
+
+                                rvec_ScaledAdd(workspace->f_tor[i], CEtors7, p_ijk->dcos_dk);
+                                rvec_ScaledAdd(workspace->f_tor[j], CEtors7, p_ijk->dcos_dj);
+                                rvec_ScaledAdd(workspace->f_tor[k], CEtors7, p_ijk->dcos_di);
+
+                                rvec_ScaledAdd(workspace->f_tor[j], CEtors8, p_jkl->dcos_di);
+                                rvec_ScaledAdd(workspace->f_tor[k], CEtors8, p_jkl->dcos_dj);
+                                rvec_ScaledAdd(workspace->f_tor[l], CEtors8, p_jkl->dcos_dk);
+
+                                rvec_ScaledAdd( workspace->f_tor[i], CEtors9, dcos_omega_di );
+                                rvec_ScaledAdd( workspace->f_tor[j], CEtors9, dcos_omega_dj );
+                                rvec_ScaledAdd( workspace->f_tor[k], CEtors9, dcos_omega_dk );
+                                rvec_ScaledAdd( workspace->f_tor[l], CEtors9, dcos_omega_dl );
+
+                                // Conjugation Forces 
+                                Add_dBO( system, lists, j, pij, CEconj1, workspace->f_con );
+                                Add_dBO( system, lists, j, pk, CEconj2, workspace->f_con );
+                                Add_dBO( system, lists, k, plk, CEconj3, workspace->f_con );
+
+                                rvec_ScaledAdd(workspace->f_con[i], CEconj4, p_ijk->dcos_dk);
+                                rvec_ScaledAdd(workspace->f_con[j], CEconj4, p_ijk->dcos_dj);
+                                rvec_ScaledAdd(workspace->f_con[k], CEconj4, p_ijk->dcos_di);
+
+                                rvec_ScaledAdd(workspace->f_con[j], CEconj5, p_jkl->dcos_di);
+                                rvec_ScaledAdd(workspace->f_con[k], CEconj5, p_jkl->dcos_dj);
+                                rvec_ScaledAdd(workspace->f_con[l], CEconj5, p_jkl->dcos_dk);
+
+                                rvec_ScaledAdd( workspace->f_con[i], CEconj6, dcos_omega_di );
+                                rvec_ScaledAdd( workspace->f_con[j], CEconj6, dcos_omega_dj );
+                                rvec_ScaledAdd( workspace->f_con[k], CEconj6, dcos_omega_dk );
+                                rvec_ScaledAdd( workspace->f_con[l], CEconj6, dcos_omega_dl );
+                                 */
+                                //end
 #endif
-							} // pl check ends
-						} // pl loop ends
-					} // pi check ends
-				} // pi loop ends
-			} // k-j neighbor check ends
-		} // j<k && j-k neighbor check ends
+                            } // pl check ends
+                        } // pl loop ends
+                    } // pi check ends
+                } // pi loop ends
+            } // k-j neighbor check ends
+        } // j<k && j-k neighbor check ends
 
 
-		//pk += blockDim.x;
+        //pk += blockDim.x;
 
 
 
-	} // pk loop ends
-	//} // j loop -- REMOVED FOR CUDA
+    } // pk loop ends
+    //} // j loop -- REMOVED FOR CUDA
 
-	/* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
-	   data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+    /* fprintf( stderr, "4body: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
 
 #ifdef TEST_FORCES
-	/*
-	   fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
-	   fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
-	   data->E_Tor, data->E_Con );
-	 */
+    /*
+       fprintf( stderr, "Number of torsion angles: %d\n", num_frb_intrs );
+       fprintf( stderr, "Torsion Energy: %g\t Conjugation Energy: %g\n", 
+       data->E_Tor, data->E_Con );
+     */
 #endif
 
-	/*
-	//do the reduction for the shared memory variables
-	// now do a reduce inside the warp for E_vdW, E_Ele and force.
-	if (threadIdx.x < 16) {
-	sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 16];
-	sh_con [threadIdx.x] += sh_con [threadIdx.x + 16];
-	rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 16]);
-	}
-	if (threadIdx.x < 8) {
-	sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 8];
-	sh_con [threadIdx.x] += sh_con [threadIdx.x + 8];
-	rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 8]);
-	}
-	if (threadIdx.x < 4) {
-	sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 4];
-	sh_con [threadIdx.x] += sh_con [threadIdx.x + 4];
-	rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 4]);
-	}
-	if (threadIdx.x < 2) {
-	sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 2];
-	sh_con [threadIdx.x] += sh_con [threadIdx.x + 2];
-	rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 2]);
-	}
-	if (threadIdx.x < 1) {
-	sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 1];
-	sh_con [threadIdx.x] += sh_con [threadIdx.x + 1];
-	rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 1]);
-	}
-
-	if (threadIdx.x == 0) {
-	E_Tor[j] = sh_tor [threadIdx.x];
-	E_Con[j] = sh_con [threadIdx.x];
-	rvec_Copy (aux_ext_press[j], sh_press[threadIdx.x]);
-	}
-	 */
+    /*
+    //do the reduction for the shared memory variables
+    // now do a reduce inside the warp for E_vdW, E_Ele and force.
+    if (threadIdx.x < 16) {
+    sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 16];
+    sh_con [threadIdx.x] += sh_con [threadIdx.x + 16];
+    rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 16]);
+    }
+    if (threadIdx.x < 8) {
+    sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 8];
+    sh_con [threadIdx.x] += sh_con [threadIdx.x + 8];
+    rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 8]);
+    }
+    if (threadIdx.x < 4) {
+    sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 4];
+    sh_con [threadIdx.x] += sh_con [threadIdx.x + 4];
+    rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 4]);
+    }
+    if (threadIdx.x < 2) {
+    sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 2];
+    sh_con [threadIdx.x] += sh_con [threadIdx.x + 2];
+    rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 2]);
+    }
+    if (threadIdx.x < 1) {
+    sh_tor [threadIdx.x] += sh_tor [threadIdx.x + 1];
+    sh_con [threadIdx.x] += sh_con [threadIdx.x + 1];
+    rvec_Add (sh_press [threadIdx.x], sh_press[threadIdx.x + 1]);
+    }
+
+    if (threadIdx.x == 0) {
+    E_Tor[j] = sh_tor [threadIdx.x];
+    E_Con[j] = sh_con [threadIdx.x];
+    rvec_Copy (aux_ext_press[j], sh_press[threadIdx.x]);
+    }
+     */
 
 }
 
 
 GLOBAL void Four_Body_Postprocess ( reax_atom *atoms, 
-		static_storage p_workspace, 
-		list p_bonds, int N )
+        static_storage p_workspace, // by-value copy; used below through the local pointer 'workspace'
+        list p_bonds, int N ) // by-value bond list; threads whose index is >= N return immediately
 {
-	int i, pj;
+    int i, pj; // i: index handled by this thread; pj: position inside the bond list
 
-	bond_data *pbond;
-	bond_data *sym_index_bond;
-	bond_order_data *bo_data;
+    bond_data *pbond; // bond-list entry at position pj
+    bond_data *sym_index_bond; // bond-list entry at position pbond->sym_index
+    bond_order_data *bo_data; // bo_data member of pbond
 
-	list *bonds = &p_bonds;
-	static_storage *workspace = &p_workspace;
+    list *bonds = &p_bonds;
+    static_storage *workspace = &p_workspace;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
+    i = blockIdx.x * blockDim.x + threadIdx.x; // flat 1-D global thread index
 
-	if ( i >= N) return;
+    if ( i >= N) return; // surplus threads past N have nothing to process
 
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ // every bond-list entry of i
 
-		pbond = &(bonds->select.bond_list[pj]);
-		bo_data = &pbond->bo_data;
-		sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+        pbond = &(bonds->select.bond_list[pj]);
+        bo_data = &pbond->bo_data;
+        sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); // NOTE(review): presumably the same bond as stored in the partner atom's list - confirm
 
-		workspace->CdDelta [i] += sym_index_bond->CdDelta_jk;
+        workspace->CdDelta [i] += sym_index_bond->CdDelta_jk; // fold the value parked on the sym_index entry into CdDelta[i]
 
-		//bo_data->Cdbo += sym_index_bond->Cdbo_kl;
-		bo_data->Cdbo += pbond->Cdbo_kl;
+        //bo_data->Cdbo += sym_index_bond->Cdbo_kl;
+        bo_data->Cdbo += pbond->Cdbo_kl; // uses this entry's own Cdbo_kl, not the sym_index entry's (see disabled line above)
 
-		//update f vector
-		rvec_Add (atoms [i].f, sym_index_bond->i_f ); 
-		rvec_Add (atoms [i].f, sym_index_bond->k_f );
-	}
+        //update f vector: add the i_f and k_f values stored on the sym_index entry to atoms[i].f
+        rvec_Add (atoms [i].f, sym_index_bond->i_f ); 
+        rvec_Add (atoms [i].f, sym_index_bond->k_f ); // NOTE(review): CdDelta_jk, Cdbo_kl, i_f and k_f look like the per-bond scratch fields written by the four-body kernel above - confirm
+    }
 }
diff --git a/PuReMD-GPU/src/grid.cu b/PuReMD-GPU/src/grid.cu
index 27447a1a..00e638f4 100644
--- a/PuReMD-GPU/src/grid.cu
+++ b/PuReMD-GPU/src/grid.cu
@@ -28,459 +28,459 @@
 
 int Estimate_GCell_Population( reax_system* system )
 {
-	int i, j, k, l;
-	int max_atoms;
-	grid *g;
-
-	g = &( system->g );
-	Reset_Grid( g );
-
-	for( l = 0; l < system->N; l++ ) {
-		i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
-		j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
-		k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
-		g->top[index_grid_3d (i, j, k, g)]++;
-		// fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
-		// l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
-		// i, j, k );
-	}
-
-	max_atoms = 0;
-	for( i = 0; i < g->ncell[0]; i++ )
-		for( j = 0; j < g->ncell[1]; j++ )
-			for( k = 0; k < g->ncell[2]; k++ )
-				if( max_atoms < g->top[index_grid_3d (i, j, k, g)] )
-					max_atoms = g->top[index_grid_3d (i, j, k, g)];  
-
-	return MAX(max_atoms*SAFE_ZONE, MIN_GCELL_POPL); 
+    int i, j, k, l;
+    int max_atoms;
+    grid *g;
+
+    g = &( system->g );
+    Reset_Grid( g );
+
+    for( l = 0; l < system->N; l++ ) {
+        i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
+        j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
+        k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
+        g->top[index_grid_3d (i, j, k, g)]++;
+        // fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
+        // l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
+        // i, j, k );
+    }
+
+    max_atoms = 0;
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ )
+                if( max_atoms < g->top[index_grid_3d (i, j, k, g)] )
+                    max_atoms = g->top[index_grid_3d (i, j, k, g)];  
+
+    return MAX(max_atoms*SAFE_ZONE, MIN_GCELL_POPL); 
 }
 
 
 void Allocate_Space_for_Grid( reax_system *system )
 {
-	int i, j, k, l;
-	grid *g = &(system->g);
-
-	int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
-
-	g = &(system->g);
-	g->max_nbrs = (2*g->spread[0]+1) * (2*g->spread[1]+1) * (2*g->spread[2]+1)+3; 
-
-	/* allocate space for the new grid */
-	g->top = (int*) calloc( total, sizeof( int ));
-	g->mark = (int*) calloc( total, sizeof( int ));
-	g->start = (int*) calloc( total, sizeof( int ));
-	g->end = (int*) calloc( total, sizeof( int ));
-	g->nbrs = (ivec*) calloc( total * g->max_nbrs, sizeof( ivec ));
-	g->nbrs_cp = (rvec*) calloc( total * g->max_nbrs, sizeof( rvec ));
-
-	for( i = 0; i < g->ncell[0]; i++ ) {
-		for( j = 0; j < g->ncell[1]; j++ ) {
-			for( k = 0; k < g->ncell[2]; k++ ) {
-				for( l = 0; l < g->max_nbrs; ++l ){ 
-					g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][0] = -1;
-					g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][1] = -1;
-					g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][2] = -1;
-
-					g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][0] = -1;
-					g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][1] = -1;
-					g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][2] = -1;
-				}
-			}
-		}
-	}
-
-	g->max_atoms = Estimate_GCell_Population( system );
-
-	g->atoms = (int*) calloc( total * g->max_atoms, sizeof( int ));
+    int i, j, k, l;
+    grid *g = &(system->g);
+
+    int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
+
+    g = &(system->g);
+    g->max_nbrs = (2*g->spread[0]+1) * (2*g->spread[1]+1) * (2*g->spread[2]+1)+3; 
+
+    /* allocate space for the new grid */
+    g->top = (int*) calloc( total, sizeof( int ));
+    g->mark = (int*) calloc( total, sizeof( int ));
+    g->start = (int*) calloc( total, sizeof( int ));
+    g->end = (int*) calloc( total, sizeof( int ));
+    g->nbrs = (ivec*) calloc( total * g->max_nbrs, sizeof( ivec ));
+    g->nbrs_cp = (rvec*) calloc( total * g->max_nbrs, sizeof( rvec ));
+
+    for( i = 0; i < g->ncell[0]; i++ ) {
+        for( j = 0; j < g->ncell[1]; j++ ) {
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                for( l = 0; l < g->max_nbrs; ++l ){ 
+                    g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][0] = -1;
+                    g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][1] = -1;
+                    g->nbrs[ index_grid_nbrs (i, j, k, l, g) ][2] = -1;
+
+                    g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][0] = -1;
+                    g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][1] = -1;
+                    g->nbrs_cp[ index_grid_nbrs (i, j, k, l, g) ][2] = -1;
+                }
+            }
+        }
+    }
+
+    g->max_atoms = Estimate_GCell_Population( system );
+
+    g->atoms = (int*) calloc( total * g->max_atoms, sizeof( int ));
 }
 
 
 void Deallocate_Grid_Space( grid *g )
 {
-	free( g->atoms );
-	free( g->top );
-	free( g->mark );
-	free( g->nbrs );
-	free( g->nbrs_cp );
+    free( g->atoms );
+    free( g->top );
+    free( g->mark );
+    free( g->nbrs );
+    free( g->nbrs_cp );
 }
 
 
 int Shift(int p, int dp, int dim, grid *g )
 {
-	int dim_len = 0;
-	int newp = p + dp;
-
-	switch( dim ) {
-		case 0: dim_len = g->ncell[0];
-			break;
-		case 1: dim_len = g->ncell[1];
-			break;
-		case 2: dim_len = g->ncell[2];
-	}
-
-	while( newp < 0 )        newp = newp + dim_len;
-	while( newp >= dim_len ) newp = newp - dim_len;
-	return newp;
+    int dim_len = 0;
+    int newp = p + dp;
+
+    switch( dim ) {
+        case 0: dim_len = g->ncell[0];
+            break;
+        case 1: dim_len = g->ncell[1];
+            break;
+        case 2: dim_len = g->ncell[2];
+    }
+
+    while( newp < 0 )        newp = newp + dim_len;
+    while( newp >= dim_len ) newp = newp - dim_len;
+    return newp;
 }
 
 
 /* finds the closest point between two grid cells denoted by c1 and c2.
    periodic boundary conditions are taken into consideration as well. */
 void Find_Closest_Point( grid *g, int c1x, int c1y, int c1z, 
-		int c2x, int c2y, int c2z, rvec closest_point )
+        int c2x, int c2y, int c2z, rvec closest_point )
 {
-	int  i, d;
-	ivec c1 = { c1x, c1y, c1z };
-	ivec c2 = { c2x, c2y, c2z };
-
-	for( i = 0; i < 3; i++ ) {
-		if( g->ncell[i] < 5 ) {
-			closest_point[i] = NEG_INF - 1.;
-			continue;
-		}
-
-		d = c2[i] - c1[i];
-		if( abs(d) <= g->ncell[i] / 2 ) {
-			if( d > 0 )
-				closest_point[i] = c2[i] * g->len[i];
-			else if ( d == 0 )
-				closest_point[i] = NEG_INF - 1.;
-			else
-				closest_point[i] = ( c2[i] + 1 ) * g->len[i];
-		}
-		else {
-			if( d > 0 )
-				closest_point[i] = ( c2[i] - g->ncell[i] + 1 ) * g->len[i];
-			else	
-				closest_point[i] = ( c2[i] + g->ncell[i] ) * g->len[i];
-		}
-	}
+    int  i, d;
+    ivec c1 = { c1x, c1y, c1z };
+    ivec c2 = { c2x, c2y, c2z };
+
+    for( i = 0; i < 3; i++ ) {
+        if( g->ncell[i] < 5 ) {
+            closest_point[i] = NEG_INF - 1.;
+            continue;
+        }
+
+        d = c2[i] - c1[i];
+        if( abs(d) <= g->ncell[i] / 2 ) {
+            if( d > 0 )
+                closest_point[i] = c2[i] * g->len[i];
+            else if ( d == 0 )
+                closest_point[i] = NEG_INF - 1.;
+            else
+                closest_point[i] = ( c2[i] + 1 ) * g->len[i];
+        }
+        else {
+            if( d > 0 )
+                closest_point[i] = ( c2[i] - g->ncell[i] + 1 ) * g->len[i];
+            else    
+                closest_point[i] = ( c2[i] + g->ncell[i] ) * g->len[i];
+        }
+    }
 }
 
 
 void Find_Neighbor_GridCells( grid *g )
 {
-	int i, j, k;
-	int di, dj, dk;
-	int x, y, z;
-	int stack_top;
-	ivec *nbrs_stack;
-	rvec *cp_stack;
-
-	/* pick up a cell in the grid */
-	for( i = 0; i < g->ncell[0]; i++ )
-		for( j = 0; j < g->ncell[1]; j++ )
-			for( k = 0; k < g->ncell[2]; k++ ) {
-				nbrs_stack = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
-				cp_stack = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
-				stack_top = 0;
-				//fprintf( stderr, "grid1: %d %d %d\n", i, j, k );
-
-				/* choose an unmarked neighbor cell*/
-				for( di = -g->spread[0]; di <= g->spread[0]; di++ ) {
-					x = Shift( i, di, 0, g );
-
-					for( dj = -g->spread[1]; dj <= g->spread[1]; dj++ ) {
-						y = Shift( j, dj, 1, g );
-
-						for( dk = -g->spread[2]; dk <= g->spread[2]; dk++ ) {
-							z = Shift( k, dk, 2, g );
-							//fprintf( stderr, "\tgrid2: %d %d %d\n", x, y, z );
-
-							if( !g->mark[ index_grid_3d (x, y, z, g) ] ) {
-								/*(di < 0 || // 9 combinations
-								  (di == 0 && dj < 0) || // 3 combinations
-								  (di == 0 && dj == 0 && dk < 0) ) )*/ 
-								/* put the neighbor cell into the stack and mark it */
-								nbrs_stack[stack_top][0] = x;
-								nbrs_stack[stack_top][1] = y;
-								nbrs_stack[stack_top][2] = z;
-								g->mark[ index_grid_3d(x,y,z,g) ] = 1;
-
-								Find_Closest_Point( g, i, j, k, x, y, z, cp_stack[stack_top] );
-								//fprintf( stderr, "\tcp: %lf %lf %lf\n", 
-								// cp_stack[stack_top][0], cp_stack[stack_top][1], 
-								// cp_stack[stack_top][2]);
-								stack_top++;
-							}
-						}
-					}
-				}
-
-				/*nbrs_stack[stack_top][0] = i;
-				  nbrs_stack[stack_top][1] = j;
-				  nbrs_stack[stack_top][2] = k;
-				  Find_Closest_Point( g, i, j, k, i, j, k, cp_stack[stack_top] );
-				  nbrs_stack[stack_top+1][0] = -1;
-				  nbrs_stack[stack_top+1][1] = -1;
-				  nbrs_stack[stack_top+1][2] = -1;
-				  Reset_Marks( g, nbrs_stack, stack_top+1 );*/
-				nbrs_stack[stack_top][0] = -1;
-				nbrs_stack[stack_top][1] = -1;
-				nbrs_stack[stack_top][2] = -1;
-				Reset_Marks( g, nbrs_stack, stack_top );
-			}
+    int i, j, k;
+    int di, dj, dk;
+    int x, y, z;
+    int stack_top;
+    ivec *nbrs_stack;
+    rvec *cp_stack;
+
+    /* pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs_stack = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
+                cp_stack = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
+                stack_top = 0;
+                //fprintf( stderr, "grid1: %d %d %d\n", i, j, k );
+
+                /* choose an unmarked neighbor cell*/
+                for( di = -g->spread[0]; di <= g->spread[0]; di++ ) {
+                    x = Shift( i, di, 0, g );
+
+                    for( dj = -g->spread[1]; dj <= g->spread[1]; dj++ ) {
+                        y = Shift( j, dj, 1, g );
+
+                        for( dk = -g->spread[2]; dk <= g->spread[2]; dk++ ) {
+                            z = Shift( k, dk, 2, g );
+                            //fprintf( stderr, "\tgrid2: %d %d %d\n", x, y, z );
+
+                            if( !g->mark[ index_grid_3d (x, y, z, g) ] ) {
+                                /*(di < 0 || // 9 combinations
+                                  (di == 0 && dj < 0) || // 3 combinations
+                                  (di == 0 && dj == 0 && dk < 0) ) )*/ 
+                                /* put the neighbor cell into the stack and mark it */
+                                nbrs_stack[stack_top][0] = x;
+                                nbrs_stack[stack_top][1] = y;
+                                nbrs_stack[stack_top][2] = z;
+                                g->mark[ index_grid_3d(x,y,z,g) ] = 1;
+
+                                Find_Closest_Point( g, i, j, k, x, y, z, cp_stack[stack_top] );
+                                //fprintf( stderr, "\tcp: %lf %lf %lf\n", 
+                                // cp_stack[stack_top][0], cp_stack[stack_top][1], 
+                                // cp_stack[stack_top][2]);
+                                stack_top++;
+                            }
+                        }
+                    }
+                }
+
+                /*nbrs_stack[stack_top][0] = i;
+                  nbrs_stack[stack_top][1] = j;
+                  nbrs_stack[stack_top][2] = k;
+                  Find_Closest_Point( g, i, j, k, i, j, k, cp_stack[stack_top] );
+                  nbrs_stack[stack_top+1][0] = -1;
+                  nbrs_stack[stack_top+1][1] = -1;
+                  nbrs_stack[stack_top+1][2] = -1;
+                  Reset_Marks( g, nbrs_stack, stack_top+1 );*/
+                nbrs_stack[stack_top][0] = -1;
+                nbrs_stack[stack_top][1] = -1;
+                nbrs_stack[stack_top][2] = -1;
+                Reset_Marks( g, nbrs_stack, stack_top );
+            }
 }
 
 
 
 void Setup_Grid( reax_system* system )
 {
-	int  d;
-	ivec ncell;
-	grid *g = &( system->g );
-	simulation_box *my_box = &( system->box );
+    int  d;
+    ivec ncell;
+    grid *g = &( system->g );
+    simulation_box *my_box = &( system->box );
 
-	/* determine number of grid cells in each direction */
-	ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
+    /* determine number of grid cells in each direction */
+    ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
 
-	for( d = 0; d < 3; ++d )
-		if( ncell[d] <= 0 )
-			ncell[d] = 1;
+    for( d = 0; d < 3; ++d )
+        if( ncell[d] <= 0 )
+            ncell[d] = 1;
 
-	/* find the number of grid cells */
-	g->total = ncell[0] * ncell[1] * ncell[2];
-	ivec_Copy( g->ncell, ncell );
+    /* find the number of grid cells */
+    g->total = ncell[0] * ncell[1] * ncell[2];
+    ivec_Copy( g->ncell, ncell );
 
-	/* compute cell lengths */
-	rvec_iDivide( g->len, my_box->box_norms, g->ncell );
-	rvec_Invert( g->inv_len, g->len );
+    /* compute cell lengths */
+    rvec_iDivide( g->len, my_box->box_norms, g->ncell );
+    rvec_Invert( g->inv_len, g->len );
 
-	Allocate_Space_for_Grid( system );
-	Find_Neighbor_GridCells( g );
+    Allocate_Space_for_Grid( system );
+    Find_Neighbor_GridCells( g );
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "setting up the grid: " );
-	fprintf( stderr, "ncell[%d %d %d] ", g->ncell[0], g->ncell[1], g->ncell[2] );
-	fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", g->len[0], g->len[1], g->len[2] );
-	fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms );
+    fprintf( stderr, "setting up the grid: " );
+    fprintf( stderr, "ncell[%d %d %d] ", g->ncell[0], g->ncell[1], g->ncell[2] );
+    fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", g->len[0], g->len[1], g->len[2] );
+    fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms );
 #endif
 }
 
 
 void Update_Grid( reax_system* system )
 {
-	int  d, i, j, k, x, y, z, itr;
-	ivec ncell;
-	ivec *nbrs;
-	rvec *nbrs_cp;
-	grid *g = &( system->g );
-	simulation_box *my_box = &( system->box );
-
-	/* determine number of grid cells in each direction */
-	ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
-
-	for( d = 0; d < 3; ++d )
-		if( ncell[d] == 0 )
-			ncell[d] = 1;
-
-	if( ivec_isEqual( ncell, g->ncell ) ) {/* ncell are unchanged */
-		/* update cell lengths */
-		rvec_iDivide( g->len, my_box->box_norms, g->ncell );
-		rvec_Invert( g->inv_len, g->len );
-
-		/* update closest point distances between gcells */
-		for( i = 0; i < g->ncell[0]; i++ )
-			for( j = 0; j < g->ncell[1]; j++ )
-				for( k = 0; k < g->ncell[2]; k++ ) {
-					nbrs = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
-					nbrs_cp = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
-					//fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-
-					itr = 0;
-					while( nbrs[itr][0] >= 0 ){
-						x = nbrs[itr][0];
-						y = nbrs[itr][1];
-						z = nbrs[itr][2];
-
-						Find_Closest_Point( g, i, j, k, x, y, z, nbrs_cp[itr] );
-						++itr;
-					}
-				}
-	}
-	else{  /* at least one of ncell has changed */
-		Deallocate_Grid_Space( g );    
-		/* update number of grid cells */
-		g->total = ncell[0] * ncell[1] * ncell[2];
-		ivec_Copy( g->ncell, ncell );
-		/* update cell lengths */
-		rvec_iDivide( g->len, my_box->box_norms, g->ncell );
-		rvec_Invert( g->inv_len, g->len );
-
-		Allocate_Space_for_Grid( system );
-		Find_Neighbor_GridCells( g );
+    int  d, i, j, k, x, y, z, itr;
+    ivec ncell;
+    ivec *nbrs;
+    rvec *nbrs_cp;
+    grid *g = &( system->g );
+    simulation_box *my_box = &( system->box );
+
+    /* determine number of grid cells in each direction */
+    ivec_rScale( ncell, 1. / g->cell_size, my_box->box_norms );
+
+    for( d = 0; d < 3; ++d )
+        if( ncell[d] == 0 )
+            ncell[d] = 1;
+
+    if( ivec_isEqual( ncell, g->ncell ) ) {/* ncell are unchanged */
+        /* update cell lengths */
+        rvec_iDivide( g->len, my_box->box_norms, g->ncell );
+        rvec_Invert( g->inv_len, g->len );
+
+        /* update closest point distances between gcells */
+        for( i = 0; i < g->ncell[0]; i++ )
+            for( j = 0; j < g->ncell[1]; j++ )
+                for( k = 0; k < g->ncell[2]; k++ ) {
+                    nbrs = &( g->nbrs[ index_grid_nbrs (i, j, k, 0, g) ] );
+                    nbrs_cp = &( g->nbrs_cp[ index_grid_nbrs (i, j, k, 0, g) ] );
+                    //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+
+                    itr = 0;
+                    while( nbrs[itr][0] >= 0 ){
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+
+                        Find_Closest_Point( g, i, j, k, x, y, z, nbrs_cp[itr] );
+                        ++itr;
+                    }
+                }
+    }
+    else{  /* at least one of ncell has changed */
+        Deallocate_Grid_Space( g );    
+        /* update number of grid cells */
+        g->total = ncell[0] * ncell[1] * ncell[2];
+        ivec_Copy( g->ncell, ncell );
+        /* update cell lengths */
+        rvec_iDivide( g->len, my_box->box_norms, g->ncell );
+        rvec_Invert( g->inv_len, g->len );
+
+        Allocate_Space_for_Grid( system );
+        Find_Neighbor_GridCells( g );
 #if defined(DEBUG_FOCUS)
-		fprintf( stderr, "updated grid: " );
-		fprintf( stderr, "ncell[%d %d %d] ", 
-				g->ncell[0], g->ncell[1], g->ncell[2] );
-		fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", 
-				g->len[0], g->len[1], g->len[2] );
-		fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms );
+        fprintf( stderr, "updated grid: " );
+        fprintf( stderr, "ncell[%d %d %d] ", 
+                g->ncell[0], g->ncell[1], g->ncell[2] );
+        fprintf( stderr, "len[%5.2f %5.2f %5.2f] ", 
+                g->len[0], g->len[1], g->len[2] );
+        fprintf( stderr, "g->max_atoms = %d\n", g->max_atoms );
 #endif
-	}
+    }
 }
 
 
 void Bin_Atoms( reax_system* system, static_storage *workspace )
 {
-	int i, j, k, l;
-	int max_atoms;
-	grid *g = &( system->g );
+    int i, j, k, l;
+    int max_atoms;
+    grid *g = &( system->g );
 
-	Reset_Grid( g );
+    Reset_Grid( g );
 
-	for( l = 0; l < system->N; l++ ) {
-		i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
-		j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
-		k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
+    for( l = 0; l < system->N; l++ ) {
+        i = (int)(system->atoms[l].x[0] * g->inv_len[0]);
+        j = (int)(system->atoms[l].x[1] * g->inv_len[1]);
+        k = (int)(system->atoms[l].x[2] * g->inv_len[2]);
 
 #ifdef __BNVT_FIX__
-		if (i >= g->ncell[0]) i = g->ncell[0]-1;
-		if (j >= g->ncell[1]) j = g->ncell[1]-1;
-		if (k >= g->ncell[2]) k = g->ncell[2]-1;
+        if (i >= g->ncell[0]) i = g->ncell[0]-1;
+        if (j >= g->ncell[1]) j = g->ncell[1]-1;
+        if (k >= g->ncell[2]) k = g->ncell[2]-1;
 #endif
 
-		g->atoms[ index_grid_atoms (i,j,k,g->top[ index_grid_3d (i,j,k,g) ], g) ] = l;
-		g->top[index_grid_3d (i,j,k,g) ]++;
+        g->atoms[ index_grid_atoms (i,j,k,g->top[ index_grid_3d (i,j,k,g) ], g) ] = l;
+        g->top[index_grid_3d (i,j,k,g) ]++;
 
-		//fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
-		//l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
-		//i, j, k );
-	}
+        //fprintf( stderr, "\tatom%-6d (%8.3f%8.3f%8.3f) --> (%3d%3d%3d)\n", 
+        //l, system->atoms[l].x[0], system->atoms[l].x[1], system->atoms[l].x[2],
+        //i, j, k );
+    }
 
-	max_atoms = 0;
-	for( i = 0; i < g->ncell[0]; i++ )
-		for( j = 0; j < g->ncell[1]; j++ )
-			for( k = 0; k < g->ncell[2]; k++ )
-				if( max_atoms < g->top[ index_grid_3d (i, j, k, g) ] )
-					max_atoms = g->top[ index_grid_3d (i, j, k, g) ];  
+    max_atoms = 0;
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ )
+                if( max_atoms < g->top[ index_grid_3d (i, j, k, g) ] )
+                    max_atoms = g->top[ index_grid_3d (i, j, k, g) ];  
 
-	/* check if current gcell->max_atoms is safe */
-	if( max_atoms >= g->max_atoms * SAFE_ZONE ) 
-		workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); 
+    /* check if current gcell->max_atoms is safe */
+    if( max_atoms >= g->max_atoms * SAFE_ZONE ) 
+        workspace->realloc.gcell_atoms = MAX(max_atoms*SAFE_ZONE,MIN_GCELL_POPL); 
 }
 
 void Cuda_Bin_Atoms (reax_system *system, static_storage *workspace )
 {
-	Cuda_Reset_Grid ( &system->d_g);
+    Cuda_Reset_Grid ( &system->d_g);
 
-	Bin_Atoms ( system, workspace );
+    Bin_Atoms ( system, workspace );
 
-	dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms;
+    dev_workspace->realloc.gcell_atoms = workspace->realloc.gcell_atoms;
 }
 
 void Cuda_Bin_Atoms_Sync (reax_system *system)
 {
-	copy_host_device (system->g.top, system->d_g.top, 
-			INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP);
+    copy_host_device (system->g.top, system->d_g.top, 
+            INT_SIZE * system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_TOP);
 
-	copy_host_device (system->g.atoms, system->d_g.atoms, 
-			INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS);
+    copy_host_device (system->g.atoms, system->d_g.atoms, 
+            INT_SIZE * system->g.max_atoms*system->g.ncell[0]*system->g.ncell[1]*system->g.ncell[2], cudaMemcpyHostToDevice, RES_GRID_ATOMS);
 }
 
 inline void reax_atom_Copy( reax_atom *dest, reax_atom *src )
 {
-	dest->type = src->type;
-	rvec_Copy( dest->x, src->x );
-	rvec_Copy( dest->v, src->v );
-	strcpy( dest->name, src->name );
+    dest->type = src->type;
+    rvec_Copy( dest->x, src->x );
+    rvec_Copy( dest->v, src->v );
+    strcpy( dest->name, src->name );
 }
 
 
 void Copy_Storage( reax_system *system, static_storage *workspace, 
-		int top, int old_id, int old_type, 
-		int *num_H, real *v, real *s, real *t, 
-		int *orig_id, rvec *f_old )
+        int top, int old_id, int old_type, 
+        int *num_H, real *v, real *s, real *t, 
+        int *orig_id, rvec *f_old )
 {
-	int i;
+    int i;
 
-	for( i = 0; i < RESTART+1; ++i )
-		v[ index_wkspace_sys (i,top, system) ] = workspace->v[ index_wkspace_sys (i,old_id, system) ];
+    for( i = 0; i < RESTART+1; ++i )
+        v[ index_wkspace_sys (i,top, system) ] = workspace->v[ index_wkspace_sys (i,old_id, system) ];
 
-	for( i = 0; i < 3; ++i ) {
-		s[ index_wkspace_sys (i,top, system) ] = workspace->s[ index_wkspace_sys (i,old_id, system) ];
-		t[ index_wkspace_sys (i,top, system) ] = workspace->t[ index_wkspace_sys (i,old_id, system) ];
-	}
+    for( i = 0; i < 3; ++i ) {
+        s[ index_wkspace_sys (i,top, system) ] = workspace->s[ index_wkspace_sys (i,old_id, system) ];
+        t[ index_wkspace_sys (i,top, system) ] = workspace->t[ index_wkspace_sys (i,old_id, system) ];
+    }
 
-	orig_id[top]  = workspace->orig_id[old_id];
+    orig_id[top]  = workspace->orig_id[old_id];
 
-	workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta;
-	workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi;
-	workspace->b_t[top] = -1.0;	      
+    workspace->Hdia_inv[top] = 1. / system->reaxprm.sbp[ old_type ].eta;
+    workspace->b_s[top] = -system->reaxprm.sbp[ old_type ].chi;
+    workspace->b_t[top] = -1.0;          
 
-	if( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom
-		workspace->hbond_index[top] = (*num_H)++;
-	else workspace->hbond_index[top] = -1;
+    if( system->reaxprm.sbp[ old_type ].p_hbond == 1 ) // H atom
+        workspace->hbond_index[top] = (*num_H)++;
+    else workspace->hbond_index[top] = -1;
 
-	rvec_Copy( f_old[top], workspace->f_old[old_id] );
+    rvec_Copy( f_old[top], workspace->f_old[old_id] );
 }
 
 
 void Free_Storage( static_storage *workspace )
 {
-	free( workspace->v );
-	free( workspace->s );
-	free( workspace->t );
-	free( workspace->orig_id );  
+    free( workspace->v );
+    free( workspace->s );
+    free( workspace->t );
+    free( workspace->orig_id );  
 }
 
 
 void Assign_New_Storage( static_storage *workspace, 
-		real *v, real *s, real *t, 
-		int *orig_id, rvec *f_old )
+        real *v, real *s, real *t, 
+        int *orig_id, rvec *f_old )
 {
-	workspace->v = v;
+    workspace->v = v;
 
-	workspace->s = s;
-	workspace->t = t;
+    workspace->s = s;
+    workspace->t = t;
 
-	workspace->orig_id = orig_id;
+    workspace->orig_id = orig_id;
 
-	workspace->f_old = f_old;
+    workspace->f_old = f_old;
 }
 
 
 void Cluster_Atoms( reax_system *system, static_storage *workspace )
 {
-	int         i, j, k, l, top, old_id, num_H = 0;
-	reax_atom  *old_atom;
-	grid       *g = &( system->g );
-	reax_atom  *new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
-	int        *orig_id = (int  *) calloc( system->N, sizeof( int ) );
-	real       *v;
-	real       *s, *t;
-	rvec       *f_old = (rvec*) calloc( system->N, sizeof(rvec) );
-
-	s = (real*) calloc( 3, sizeof( real ) * system->N );
-	t = (real*) calloc( 3, sizeof( real ) * system->N );
-	v = (real*) calloc( RESTART+1, sizeof( real ) * system->N );
-
-	top = 0;
-
-	for( i = 0; i < g->ncell[0]; i++ )
-		for( j = 0; j < g->ncell[1]; j++ )
-			for( k = 0; k < g->ncell[2]; k++ ) {
-				g->start[ index_grid_3d (i, j, k, g) ] = top;
-
-				for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l ) {
-					old_id   = g->atoms[ index_grid_atoms (i, j, k, l, g) ];
-					old_atom = &( system->atoms[old_id] );
-					// fprintf( stderr, "%d <-- %d\n", top, old_id );
-
-					reax_atom_Copy( &(new_atoms[top]), old_atom );
-					Copy_Storage( system, workspace, top, old_id, old_atom->type, 
-							&num_H, v, s, t, orig_id, f_old );
-					++top;
-				}
-
-				g->end[ index_grid_3d (i, j, k, g) ] = top;
-			}
-
-
-	free( system->atoms );
-	Free_Storage( workspace );
-
-	system->atoms = new_atoms;
-	Assign_New_Storage( workspace, v, s, t, orig_id, f_old );
+    int         i, j, k, l, top, old_id, num_H = 0;
+    reax_atom  *old_atom;
+    grid       *g = &( system->g );
+    reax_atom  *new_atoms = (reax_atom*) calloc( system->N, sizeof(reax_atom) );
+    int        *orig_id = (int  *) calloc( system->N, sizeof( int ) );
+    real       *v;
+    real       *s, *t;
+    rvec       *f_old = (rvec*) calloc( system->N, sizeof(rvec) );
+
+    s = (real*) calloc( 3, sizeof( real ) * system->N );
+    t = (real*) calloc( 3, sizeof( real ) * system->N );
+    v = (real*) calloc( RESTART+1, sizeof( real ) * system->N );
+
+    top = 0;
+
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                g->start[ index_grid_3d (i, j, k, g) ] = top;
+
+                for( l = 0; l < g->top[ index_grid_3d (i, j, k, g) ]; ++l ) {
+                    old_id   = g->atoms[ index_grid_atoms (i, j, k, l, g) ];
+                    old_atom = &( system->atoms[old_id] );
+                    // fprintf( stderr, "%d <-- %d\n", top, old_id );
+
+                    reax_atom_Copy( &(new_atoms[top]), old_atom );
+                    Copy_Storage( system, workspace, top, old_id, old_atom->type, 
+                            &num_H, v, s, t, orig_id, f_old );
+                    ++top;
+                }
+
+                g->end[ index_grid_3d (i, j, k, g) ] = top;
+            }
+
+
+    free( system->atoms );
+    Free_Storage( workspace );
+
+    system->atoms = new_atoms;
+    Assign_New_Storage( workspace, v, s, t, orig_id, f_old );
 }
diff --git a/PuReMD-GPU/src/helpers.cu b/PuReMD-GPU/src/helpers.cu
index 82c8e248..29ae31e3 100644
--- a/PuReMD-GPU/src/helpers.cu
+++ b/PuReMD-GPU/src/helpers.cu
@@ -24,12 +24,12 @@
 
 GLOBAL void compute_Inc_on_T3 (reax_atom *atoms, unsigned int N, simulation_box *box, real d1, real d2, real d3)
 {
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec dx;
-	dx[0] = d1;
-	dx[1] = d2;
-	dx[2] = d3;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec dx;
+    dx[0] = d1;
+    dx[1] = d2;
+    dx[2] = d3;
 
-	if (index < N )
-		Inc_on_T3( atoms[index].x, dx, box );
+    if (index < N )
+        Inc_on_T3( atoms[index].x, dx, box );
 }
diff --git a/PuReMD-GPU/src/init_md.cu b/PuReMD-GPU/src/init_md.cu
index 3c8ace27..e1912d3c 100644
--- a/PuReMD-GPU/src/init_md.cu
+++ b/PuReMD-GPU/src/init_md.cu
@@ -41,1321 +41,1321 @@
 #include "helpers.h"
 #include "reduction.h"
 
-#include	 "index_utils.h"
+#include     "index_utils.h"
 
 #include "validation.h"
 
 void Generate_Initial_Velocities(reax_system *system, real T )
 {
-	int i;
-	real scale, norm;
+    int i;
+    real scale, norm;
 
 
-	if( T <= 0.1 ) {
-		for (i=0; i < system->N; i++)
-			rvec_MakeZero( system->atoms[i].v );
+    if( T <= 0.1 ) {
+        for (i=0; i < system->N; i++)
+            rvec_MakeZero( system->atoms[i].v );
 #if defined(DEBUG)
-		fprintf( stderr, "no random velocities...\n" );
+        fprintf( stderr, "no random velocities...\n" );
 #endif
-	}
-	else {
-		for( i = 0; i < system->N; i++ ) {
-			rvec_Random( system->atoms[i].v );
-
-			norm = rvec_Norm_Sqr( system->atoms[i].v );
-			scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * 
-					norm / (3.0 * K_B * T) );
-
-			rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v );
-
-			/*
-			   fprintf( stderr, "v = %f %f %f\n", 
-			   system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-			   fprintf( stderr, "scale = %f\n", scale );
-			   fprintf( stderr, "v = %f %f %f\n",
-			   system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
-			 */
-		}
-	}
+    }
+    else {
+        for( i = 0; i < system->N; i++ ) {
+            rvec_Random( system->atoms[i].v );
+
+            norm = rvec_Norm_Sqr( system->atoms[i].v );
+            scale = SQRT( system->reaxprm.sbp[ system->atoms[i].type ].mass * 
+                    norm / (3.0 * K_B * T) );
+
+            rvec_Scale( system->atoms[i].v, 1.0/scale, system->atoms[i].v );
+
+            /*
+               fprintf( stderr, "v = %f %f %f\n", 
+               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
+               fprintf( stderr, "scale = %f\n", scale );
+               fprintf( stderr, "v = %f %f %f\n",
+               system->atoms[i].v[0],system->atoms[i].v[1],system->atoms[i].v[2]);
+             */
+        }
+    }
 }
 
 
 void Init_System( reax_system *system, control_params *control, 
-		simulation_data *data )
+        simulation_data *data )
 {
-	int i;
-	rvec dx;
-
-	if( !control->restart )
-		Reset_Atoms( system );
-
-	Compute_Total_Mass( system, data );
-
-	Compute_Center_of_Mass( system, data, stderr );
-
-	/* reposition atoms */
-	// just fit the atoms to the periodic box
-	if( control->reposition_atoms == 0 ) {
-		rvec_MakeZero( dx );
-	}
-	// put the center of mass to the center of the box
-	else if( control->reposition_atoms == 1 ) {
-		rvec_Scale( dx, 0.5, system->box.box_norms );
-		rvec_ScaledAdd( dx, -1., data->xcm );
-	}
-	// put the center of mass to the origin
-	else if( control->reposition_atoms == 2 ) {
-		rvec_Scale( dx, -1., data->xcm );
-	}
-	else {
-		fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
-		exit( UNKNOWN_OPTION );
-	}
-
-	for( i = 0; i < system->N; ++i ) {
-		Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-		/*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", 
-		  i, system->atoms[i].type, 
-		  system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/
-	}
-
-	/* Initialize velocities so that desired init T can be attained */
-	if( !control->restart || (control->restart && control->random_vel) )  {
-		Generate_Initial_Velocities( system, control->T_init );
-	}
-
-	Setup_Grid( system );
+    int i;
+    rvec dx;
+
+    if( !control->restart )
+        Reset_Atoms( system );
+
+    Compute_Total_Mass( system, data );
+
+    Compute_Center_of_Mass( system, data, stderr );
+
+    /* reposition atoms */
+    // just fit the atoms to the periodic box
+    if( control->reposition_atoms == 0 ) {
+        rvec_MakeZero( dx );
+    }
+    // put the center of mass to the center of the box
+    else if( control->reposition_atoms == 1 ) {
+        rvec_Scale( dx, 0.5, system->box.box_norms );
+        rvec_ScaledAdd( dx, -1., data->xcm );
+    }
+    // put the center of mass to the origin
+    else if( control->reposition_atoms == 2 ) {
+        rvec_Scale( dx, -1., data->xcm );
+    }
+    else {
+        fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
+        exit( UNKNOWN_OPTION );
+    }
+
+    for( i = 0; i < system->N; ++i ) {
+        Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
+        /*fprintf( stderr, "%6d%2d%8.3f%8.3f%8.3f\n", 
+          i, system->atoms[i].type, 
+          system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );*/
+    }
+
+    /* Initialize velocities so that desired init T can be attained */
+    if( !control->restart || (control->restart && control->random_vel) )  {
+        Generate_Initial_Velocities( system, control->T_init );
+    }
+
+    Setup_Grid( system );
 }
 
 
 void Cuda_Init_System( reax_system *system, control_params *control, 
-		simulation_data *data )
+        simulation_data *data )
 {
-	int i;
-	rvec dx;
-
-	if( !control->restart )
-		Cuda_Reset_Atoms( system );
-
-	Cuda_Compute_Total_Mass( system, data );
-
-	Cuda_Compute_Center_of_Mass( system, data, stderr );
-
-	/* reposition atoms */
-	// just fit the atoms to the periodic box
-	if( control->reposition_atoms == 0 ) {
-		rvec_MakeZero( dx );
-	}
-	// put the center of mass to the center of the box
-	else if( control->reposition_atoms == 1 ) {
-		rvec_Scale( dx, 0.5, system->box.box_norms );
-		rvec_ScaledAdd( dx, -1., data->xcm );
-	}
-	// put the center of mass to the origin
-	else if( control->reposition_atoms == 2 ) {
-		rvec_Scale( dx, -1., data->xcm );
-	}
-	else {
-		fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
-		exit( UNKNOWN_OPTION );
-	}
-
-	compute_Inc_on_T3 <<<BLOCKS_POW_2, BLOCK_SIZE>>>
-		(system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//copy back the atoms from device to the host
-	copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
-			cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
-
-	/* Initialize velocities so that desired init T can be attained */
-	if( !control->restart || (control->restart && control->random_vel) )  {
-		Generate_Initial_Velocities( system, control->T_init );
-	}
-
-	Setup_Grid( system );
+    int i;
+    rvec dx;
+
+    if( !control->restart )
+        Cuda_Reset_Atoms( system );
+
+    Cuda_Compute_Total_Mass( system, data );
+
+    Cuda_Compute_Center_of_Mass( system, data, stderr );
+
+    /* reposition atoms */
+    // just fit the atoms to the periodic box
+    if( control->reposition_atoms == 0 ) {
+        rvec_MakeZero( dx );
+    }
+    // put the center of mass to the center of the box
+    else if( control->reposition_atoms == 1 ) {
+        rvec_Scale( dx, 0.5, system->box.box_norms );
+        rvec_ScaledAdd( dx, -1., data->xcm );
+    }
+    // put the center of mass to the origin
+    else if( control->reposition_atoms == 2 ) {
+        rvec_Scale( dx, -1., data->xcm );
+    }
+    else {
+        fprintf( stderr, "UNKNOWN OPTION: reposition_atoms. Terminating...\n" );
+        exit( UNKNOWN_OPTION );
+    }
+
+    compute_Inc_on_T3 <<<BLOCKS_POW_2, BLOCK_SIZE>>>
+        (system->d_atoms, system->N, system->d_box, dx[0], dx[1], dx[2]);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //copy back the atoms from device to the host
+    copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
+            cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+
+    /* Initialize velocities so that desired init T can be attained */
+    if( !control->restart || (control->restart && control->random_vel) )  {
+        Generate_Initial_Velocities( system, control->T_init );
+    }
+
+    Setup_Grid( system );
 }
 
 
 
 void Init_Simulation_Data( reax_system *system, control_params *control, 
-		simulation_data *data, output_controls *out_control, 
-		evolve_function *Evolve )
+        simulation_data *data, output_controls *out_control, 
+        evolve_function *Evolve )
 {
 
-	Reset_Simulation_Data( data );
+    Reset_Simulation_Data( data );
 
-	if( !control->restart )  
-		data->step = data->prev_steps = 0;
+    if( !control->restart )  
+        data->step = data->prev_steps = 0;
 
-	switch( control->ensemble ) {
-		case NVE:
-			data->N_f = 3 * system->N;
-			*Evolve = Velocity_Verlet_NVE;
-			break;
+    switch( control->ensemble ) {
+        case NVE:
+            data->N_f = 3 * system->N;
+            *Evolve = Velocity_Verlet_NVE;
+            break;
 
 
-		case NVT:
-			data->N_f = 3 * system->N + 1;
-			//control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-			if( !control->restart || (control->restart && control->random_vel) ) {
-				data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-						data->N_f * K_B * control->T );
-				data->therm.v_xi = data->therm.G_xi * control->dt;
-				data->therm.v_xi_old = 0;
-				data->therm.xi = 0;
+        case NVT:
+            data->N_f = 3 * system->N + 1;
+            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
+            if( !control->restart || (control->restart && control->random_vel) ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->therm.v_xi_old = 0;
+                data->therm.xi = 0;
 #if defined(DEBUG_FOCUS)
-				fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
-						data->therm.G_xi, control->Tau_T, data->E_Kin, 
-						data->N_f, data->therm.v_xi );
+                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
+                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
+                        data->N_f, data->therm.v_xi );
 #endif
-			}
-
-			*Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
-			break;
-
-
-		case NPT: // Anisotropic NPT
-			fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-			exit( UNKNOWN_OPTION );
-			data->N_f = 3 * system->N + 9;
-			if( !control->restart ) {
-				data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-						data->N_f * K_B * control->T );
-				data->therm.v_xi = data->therm.G_xi * control->dt;
-				data->iso_bar.eps = 0.33333 * log(system->box.volume);
-				//data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
-				//Compute_Pressure( system, data, workspace );
-			}
-			*Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-			break;
-
-
-		case sNPT: // Semi-Isotropic NPT
-			data->N_f = 3 * system->N + 4;
-			*Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
-			break;
-
-
-		case iNPT: // Isotropic NPT
-			data->N_f = 3 * system->N + 2;
-			*Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-			break;
-
-		case bNVT: //berendensen NVT
-			data->N_f = 3 * system->N + 1; 
-			*Evolve = Velocity_Verlet_Berendsen_NVT;
-			break;
-
-		default:
-			break;
-	}
-
-	Compute_Kinetic_Energy( system, data );
-
-	/* init timing info for the host*/
-	data->timing.start = Get_Time( );
-	data->timing.total = data->timing.start;
-	data->timing.nbrs = 0;
-	data->timing.init_forces = 0;
-	data->timing.bonded = 0;
-	data->timing.nonb = 0;
-	data->timing.QEq = 0;
-	data->timing.matvecs = 0;
+            }
+
+            *Evolve = Velocity_Verlet_Nose_Hoover_NVT_Klein;
+            break;
+
+
+        case NPT: // Anisotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            data->N_f = 3 * system->N + 9;
+            if( !control->restart ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->iso_bar.eps = 0.33333 * log(system->box.volume);
+                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
+                //Compute_Pressure( system, data, workspace );
+            }
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+
+
+        case sNPT: // Semi-Isotropic NPT
+            data->N_f = 3 * system->N + 4;
+            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
+            break;
+
+
+        case iNPT: // Isotropic NPT
+            data->N_f = 3 * system->N + 2;
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+
+        case bNVT: // Berendsen NVT
+            data->N_f = 3 * system->N + 1; 
+            *Evolve = Velocity_Verlet_Berendsen_NVT;
+            break;
+
+        default:
+            break;
+    }
+
+    Compute_Kinetic_Energy( system, data );
+
+    /* init timing info for the host*/
+    data->timing.start = Get_Time( );
+    data->timing.total = data->timing.start;
+    data->timing.nbrs = 0;
+    data->timing.init_forces = 0;
+    data->timing.bonded = 0;
+    data->timing.nonb = 0;
+    data->timing.QEq = 0;
+    data->timing.matvecs = 0;
 }
 
 
 void Cuda_Init_Simulation_Data( reax_system *system, control_params *control, 
-		simulation_data *data, output_controls *out_control, 
-		evolve_function *Evolve )
+        simulation_data *data, output_controls *out_control, 
+        evolve_function *Evolve )
 {
 
-	Reset_Simulation_Data( data );
+    Reset_Simulation_Data( data );
 
-	if( !control->restart )  
-		data->step = data->prev_steps = 0;
+    if( !control->restart )  
+        data->step = data->prev_steps = 0;
 
-	switch( control->ensemble ) {
-		case NVE:
-			data->N_f = 3 * system->N;
-			*Evolve = Cuda_Velocity_Verlet_NVE;
-			break;
+    switch( control->ensemble ) {
+        case NVE:
+            data->N_f = 3 * system->N;
+            *Evolve = Cuda_Velocity_Verlet_NVE;
+            break;
 
 
-		case NVT:
-			data->N_f = 3 * system->N + 1;
-			//control->Tau_T = 100 * data->N_f * K_B * control->T_final;
-			if( !control->restart || (control->restart && control->random_vel) ) {
-				data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-						data->N_f * K_B * control->T );
-				data->therm.v_xi = data->therm.G_xi * control->dt;
-				data->therm.v_xi_old = 0;
-				data->therm.xi = 0;
+        case NVT:
+            data->N_f = 3 * system->N + 1;
+            //control->Tau_T = 100 * data->N_f * K_B * control->T_final;
+            if( !control->restart || (control->restart && control->random_vel) ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->therm.v_xi_old = 0;
+                data->therm.xi = 0;
 #if defined(DEBUG_FOCUS)
-				fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
-						data->therm.G_xi, control->Tau_T, data->E_Kin, 
-						data->N_f, data->therm.v_xi );
+                fprintf( stderr, "init_md: G_xi=%f Tau_T=%f E_kin=%f N_f=%f v_xi=%f\n",
+                        data->therm.G_xi, control->Tau_T, data->E_Kin, 
+                        data->N_f, data->therm.v_xi );
 #endif
-			}
-
-			*Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
-			break;
-
-
-		case NPT: // Anisotropic NPT
-			fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-			exit( UNKNOWN_OPTION );
-			data->N_f = 3 * system->N + 9;
-			if( !control->restart ) {
-				data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
-						data->N_f * K_B * control->T );
-				data->therm.v_xi = data->therm.G_xi * control->dt;
-				data->iso_bar.eps = 0.33333 * log(system->box.volume);
-				//data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
-				//Compute_Pressure( system, data, workspace );
-			}
-			*Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-			break;
-
-
-		case sNPT: // Semi-Isotropic NPT
-			fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-			exit( UNKNOWN_OPTION );
-			data->N_f = 3 * system->N + 4;
-			*Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
-			break;
-
-
-		case iNPT: // Isotropic NPT
-			fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
-			exit( UNKNOWN_OPTION );
-			data->N_f = 3 * system->N + 2;
-			*Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
-			break;
-
-		case bNVT: //berendensen NVT
-			data->N_f = 3 * system->N + 1; 
-			*Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
-			break;
-
-		default:
-			break;
-	}
-
-	Cuda_Compute_Kinetic_Energy (system, data);
+            }
+
+            *Evolve = Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein;
+            break;
+
+
+        case NPT: // Anisotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            data->N_f = 3 * system->N + 9;
+            if( !control->restart ) {
+                data->therm.G_xi = control->Tau_T * (2.0 * data->E_Kin - 
+                        data->N_f * K_B * control->T );
+                data->therm.v_xi = data->therm.G_xi * control->dt;
+                data->iso_bar.eps = 0.33333 * log(system->box.volume);
+                //data->inv_W = 1. / (data->N_f*K_B*control->T*SQR(control->Tau_P));
+                //Compute_Pressure( system, data, workspace );
+            }
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+
+
+        case sNPT: // Semi-Isotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            data->N_f = 3 * system->N + 4;
+            *Evolve = Velocity_Verlet_Berendsen_SemiIsotropic_NPT;
+            break;
+
+
+        case iNPT: // Isotropic NPT
+            fprintf( stderr, "THIS OPTION IS NOT YET IMPLEMENTED! TERMINATING...\n" );
+            exit( UNKNOWN_OPTION );
+            data->N_f = 3 * system->N + 2;
+            *Evolve = Velocity_Verlet_Berendsen_Isotropic_NPT;
+            break;
+
+        case bNVT: // Berendsen NVT
+            data->N_f = 3 * system->N + 1; 
+            *Evolve = Cuda_Velocity_Verlet_Berendsen_NVT;
+            break;
+
+        default:
+            break;
+    }
+
+    Cuda_Compute_Kinetic_Energy (system, data);
 
 #ifdef __BUILD_DEBUG__
-	real t_E_Kin = 0;
-	t_E_Kin = data->E_Kin;
+    real t_E_Kin = 0;
+    t_E_Kin = data->E_Kin;
 #endif
 
-	copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
-			REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-	data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
-	if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! 
-		data->therm.T = ALMOST_ZERO;
+    copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
+            REAL_SIZE, cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+    if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! 
+        data->therm.T = ALMOST_ZERO;
 
 #ifdef __BUILD_DEBUG__
-	if (check_zero (t_E_Kin, data->E_Kin)){
-		fprintf (stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin );
-		exit (1);
-	}
-	//validate_data ( system, data );
+    if (check_zero (t_E_Kin, data->E_Kin)){
+        fprintf (stderr, "SimulationData:E_Kin does not match between host and device (%f %f) \n", t_E_Kin, data->E_Kin );
+        exit (1);
+    }
+    //validate_data ( system, data );
 #endif
 
-	/* init timing info for the host*/
-	data->timing.start = Get_Time( );
-	data->timing.total = data->timing.start;
-	data->timing.nbrs = 0;
-	data->timing.init_forces = 0;
-	data->timing.bonded = 0;
-	data->timing.nonb = 0;
-	data->timing.QEq = 0;
-	data->timing.matvecs = 0;
-
-	/* init timing info for the device */
-	d_timing.start = Get_Time( );
-	d_timing.total = data->timing.start;
-	d_timing.nbrs = 0;
-	d_timing.init_forces = 0;
-	d_timing.bonded = 0;
-	d_timing.nonb = 0;
-	d_timing.QEq = 0;
-	d_timing.matvecs = 0;
+    /* init timing info for the host*/
+    data->timing.start = Get_Time( );
+    data->timing.total = data->timing.start;
+    data->timing.nbrs = 0;
+    data->timing.init_forces = 0;
+    data->timing.bonded = 0;
+    data->timing.nonb = 0;
+    data->timing.QEq = 0;
+    data->timing.matvecs = 0;
+
+    /* init timing info for the device */
+    d_timing.start = Get_Time( );
+    d_timing.total = data->timing.start;
+    d_timing.nbrs = 0;
+    d_timing.init_forces = 0;
+    d_timing.bonded = 0;
+    d_timing.nonb = 0;
+    d_timing.QEq = 0;
+    d_timing.matvecs = 0;
 }
 
 
 void Init_Workspace( reax_system *system, control_params *control, 
-		static_storage *workspace )
+        static_storage *workspace )
 {  
-	int i;
-
-	/* Allocate space for hydrogen bond list */
-	workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) );
-
-	/* bond order related storage  */
-	workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Deltap           = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
-	workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
-
-	workspace->Delta	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Delta_lp	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
-	workspace->dDelta_lp	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
-	workspace->nlp	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->nlp_temp	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->Clp	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->CdDelta	      = (real *) malloc( system->N * sizeof( real ) );
-	workspace->vlpex	      = (real *) malloc( system->N * sizeof( real ) );
-
-	/* QEq storage */
-	//workspace->H        = NULL;
-	//workspace->L        = NULL;
-	//workspace->U        = NULL;
-	//
-	workspace->H.start        = NULL;
-	workspace->L.start        = NULL;
-	workspace->U.start        = NULL;
-
-	workspace->H.entries 		= NULL;
-	workspace->L.entries 		= NULL;
-	workspace->U.entries		= NULL;
-
-	workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
-	workspace->w        = (real *) calloc( system->N, sizeof( real ) );
-	workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
-	workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
-	workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
-	workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
-	workspace->b_prc    = (real *) calloc( system->N * 2, sizeof( real ) );
-	workspace->b_prm    = (real *) calloc( system->N * 2, sizeof( real ) );
-	workspace->s_t      = (real *) calloc( system->N * 2, sizeof( real ) );
-	workspace->s        = (real *) calloc( 5 * system->N, sizeof( real ) );
-	workspace->t        = (real *) calloc( 5 * system->N, sizeof( real ) );
-	// workspace->s_old    = (real *) calloc( system->N, sizeof( real ) );
-	// workspace->t_old    = (real *) calloc( system->N, sizeof( real ) );
-	// workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
-	// workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
-
-	for( i = 0; i < system->N; ++i ) {
-		workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta;
-		workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-		workspace->b_t[i] = -1.0;
-
-		workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
-		workspace->b[i+system->N] = -1.0;
-	}
-
-	/* GMRES storage */
-	workspace->y  = (real *)  calloc( RESTART+1, sizeof( real ) );
-	workspace->z  = (real *)  calloc( RESTART+1, sizeof( real ) );
-	workspace->g  = (real *)  calloc( RESTART+1, sizeof( real ) );
-	workspace->hs = (real *)  calloc( RESTART+1, sizeof( real ) );
-	workspace->hc = (real *)  calloc( RESTART+1, sizeof( real ) );
-
-	workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) );
-	workspace->v  = (real *) calloc( (RESTART+1)*system->N, sizeof( real) );
-	workspace->h  = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) );
-
-	/* CG storage */
-	workspace->r = (real *) calloc( system->N, sizeof( real ) );
-	workspace->d = (real *) calloc( system->N, sizeof( real ) );
-	workspace->q = (real *) calloc( system->N, sizeof( real ) );
-	workspace->p = (real *) calloc( system->N, sizeof( real ) );
-
-
-	/* integrator storage */
-	workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
-
-
-	/* storage for analysis */
-	if( control->molec_anal || control->diffusion_coef )
-	{
-		workspace->mark = (int *) calloc( system->N, sizeof(int) );
-		workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
-	}
-	else 
-		workspace->mark = workspace->old_mark = NULL;
-
-	if( control->diffusion_coef )
-		workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
-	else workspace->x_old = NULL;
+    int i;
+
+    /* Allocate space for hydrogen bond list */
+    workspace->hbond_index = (int *) malloc( system->N * sizeof( int ) );
+
+    /* bond order related storage  */
+    workspace->total_bond_order = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Deltap           = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Deltap_boc       = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDeltap_self     = (rvec *) malloc( system->N * sizeof( rvec ) );
+
+    workspace->Delta          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_lp_temp    = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDelta_lp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->dDelta_lp_temp   = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_e          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Delta_boc        = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->nlp_temp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->Clp          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->CdDelta          = (real *) malloc( system->N * sizeof( real ) );
+    workspace->vlpex          = (real *) malloc( system->N * sizeof( real ) );
+
+    /* QEq storage */
+    //workspace->H        = NULL;
+    //workspace->L        = NULL;
+    //workspace->U        = NULL;
+    //
+    workspace->H.start        = NULL;
+    workspace->L.start        = NULL;
+    workspace->U.start        = NULL;
+
+    workspace->H.entries         = NULL;
+    workspace->L.entries         = NULL;
+    workspace->U.entries        = NULL;
+
+    workspace->droptol  = (real *) calloc( system->N, sizeof( real ) );
+    workspace->w        = (real *) calloc( system->N, sizeof( real ) );
+    workspace->Hdia_inv = (real *) calloc( system->N, sizeof( real ) );
+    workspace->b        = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->b_s      = (real *) calloc( system->N, sizeof( real ) );
+    workspace->b_t      = (real *) calloc( system->N, sizeof( real ) );
+    workspace->b_prc    = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->b_prm    = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->s_t      = (real *) calloc( system->N * 2, sizeof( real ) );
+    workspace->s        = (real *) calloc( 5 * system->N, sizeof( real ) );
+    workspace->t        = (real *) calloc( 5 * system->N, sizeof( real ) );
+    // workspace->s_old    = (real *) calloc( system->N, sizeof( real ) );
+    // workspace->t_old    = (real *) calloc( system->N, sizeof( real ) );
+    // workspace->s_oldest = (real *) calloc( system->N, sizeof( real ) );
+    // workspace->t_oldest = (real *) calloc( system->N, sizeof( real ) );
+
+    for( i = 0; i < system->N; ++i ) {
+        workspace->Hdia_inv[i] = 1./system->reaxprm.sbp[system->atoms[i].type].eta;
+        workspace->b_s[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+        workspace->b_t[i] = -1.0;
+
+        workspace->b[i] = -system->reaxprm.sbp[ system->atoms[i].type ].chi;
+        workspace->b[i+system->N] = -1.0;
+    }
+
+    /* GMRES storage */
+    workspace->y  = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->z  = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->g  = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->hs = (real *)  calloc( RESTART+1, sizeof( real ) );
+    workspace->hc = (real *)  calloc( RESTART+1, sizeof( real ) );
+
+    workspace->rn = (real *) calloc( (RESTART+1)*system->N*2, sizeof( real) );
+    workspace->v  = (real *) calloc( (RESTART+1)*system->N, sizeof( real) );
+    workspace->h  = (real *) calloc( (RESTART+1)*(RESTART+1), sizeof( real) );
+
+    /* CG storage */
+    workspace->r = (real *) calloc( system->N, sizeof( real ) );
+    workspace->d = (real *) calloc( system->N, sizeof( real ) );
+    workspace->q = (real *) calloc( system->N, sizeof( real ) );
+    workspace->p = (real *) calloc( system->N, sizeof( real ) );
+
+
+    /* integrator storage */
+    workspace->a = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_old = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->v_const = (rvec *) malloc( system->N * sizeof( rvec ) );
+
+
+    /* storage for analysis */
+    if( control->molec_anal || control->diffusion_coef )
+    {
+        workspace->mark = (int *) calloc( system->N, sizeof(int) );
+        workspace->old_mark = (int *) calloc( system->N, sizeof(int) );
+    }
+    else 
+        workspace->mark = workspace->old_mark = NULL;
+
+    if( control->diffusion_coef )
+        workspace->x_old = (rvec *) calloc( system->N, sizeof( rvec ) );
+    else workspace->x_old = NULL;
 
 
 #ifdef TEST_FORCES
-	workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) );
-	workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->dDelta = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_ele = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_vdw = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_bo = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_be = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_lp = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_ov = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_un = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_ang = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_coa = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_pen = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_hb = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_tor = (rvec *) malloc( system->N * sizeof( rvec ) );
+    workspace->f_con = (rvec *) malloc( system->N * sizeof( rvec ) );
 #endif
 
-	workspace->realloc.num_far = -1;
-	workspace->realloc.Htop = -1;
-	workspace->realloc.hbonds = -1;
-	workspace->realloc.bonds = -1;
-	workspace->realloc.num_3body = -1;
-	workspace->realloc.gcell_atoms = -1;
+    workspace->realloc.num_far = -1;
+    workspace->realloc.Htop = -1;
+    workspace->realloc.hbonds = -1;
+    workspace->realloc.bonds = -1;
+    workspace->realloc.num_3body = -1;
+    workspace->realloc.gcell_atoms = -1;
 
-	Reset_Workspace( system, workspace );
+    Reset_Workspace( system, workspace );
 }
 
 void compare_far_neighbors (int *test, int *start, int *end, far_neighbor_data *data, list *slist, int N)
 {
-	int index = 0;
-	int count = 0;
-	int jicount = 0;
-	int end_index, gpu_index, gpu_end, k;
-	far_neighbor_data gpu, cpu;
-
-	/*
-	   for (int i = 0; i < N ; i++ )
-	   {
-	   if (test[i] != start[i]) {
-	   fprintf (stderr, "start index does not match \n");
-	   exit (0);
-	   }
-
-	   if (test[i+1] != (end[i]) ){
-	   fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]);
-	   exit (0);
-	   }
-	   }
-	 */
-
-
-	for (int i = 0; i < N; i++){
-		index = Start_Index (i, slist);
-		//fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
-
-
-		for (int j = start[i]; j < end[i]; j++){
-			gpu = data[j];
-
-			if (i < data[j].nbr) continue;
-			/*
-			   if (i < data[j].nbr) {
-			//fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j);
-			int src = data[j].nbr;
-			int dest = i;
-			int x;
-
-
-			for (x = start[src]; x < end[src]; x++) {
-			if (data[x].nbr != dest) continue;
-
-			gpu = data[x];
-			cpu = data[j];
-
-			if (  (gpu.d != cpu.d) ||
-			(cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-			(cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-			fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, 
-			data[j].d, 
-			data[j].rel_box[0],
-			data[j].rel_box[1],
-			data[j].rel_box[2],
-			data[j].dvec[0], 
-			data[j].dvec[1], 
-			data[j].dvec[2] 
-			);
-			fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr,
-			data[x].d,
-			data[x].rel_box[0],
-			data[x].rel_box[1],
-			data[x].rel_box[2],
-			data[x].dvec[0],
-			data[x].dvec[1],
-			data[x].dvec[2]
-			);
-			jicount++;
-			}
-			break;
-			}
-
-			if (x >= end[src]) {
-			fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
-			exit (0);
-			}
-
-			continue;
-			}
-			 */
-
-			cpu = slist->select.far_nbr_list[index];
-			//if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){
-			//if ( (gpu->d != cpu->d) ){
-			if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ||
-					(cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-					(cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-				//if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) ||
-				//		(gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) {
-				//if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){
-
-				fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
-				fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index);
-
-				/*
-				   fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
-				   fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
-				   fprintf (stdout, "d %f ,  %f \n", slist->select.far_nbr_list[index].d, data[j].d);
-				   fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", 
-				   cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
-				   gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
-
-				   fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", 
-				   cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
-				   gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
-
-				 */
-				count ++;
-			}
-
-			//fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
-			index ++;
-			}
-
-			if (index != End_Index (i, slist))
-			{
-				fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, slist), End_Index(i, slist),
-						start[i], end[i]);
-				exit (10);
-			}
-			}
-
-			fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d  reverse %d \n", count, jicount);
-
-			/*
-			   for (int i = 0; i < N; i++) 
-			   {
-			   index = Start_Index (i, slist);
-			   end_index = End_Index (i, slist);
-
-			   gpu_index = start[i];
-			   gpu_end = end[i];
-			   for (int j = index; j < end_index; j++) 
-			   {
-			   far_neighbor_data *cpu = &slist->select.far_nbr_list[j];
-			   far_neighbor_data *gpu;
-
-			   for (k = gpu_index; k < gpu_end; k++) {
-			   gpu = &data[k];
-			   if (gpu->nbr == cpu->nbr) break;
-			   }
-
-			   if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); }
-
-			   if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) ||
-			   ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) ||
-			   ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) {
-
-			   fprintf (stderr, "Far neighbors does not match atom: %d \n", i );
-			   fprintf (stderr, "neighbor %d ,  %d \n",  cpu->nbr, gpu->nbr);
-			   fprintf (stderr, "d %d ,  %d \n", cpu->d, gpu->d);
-			   fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", 
-			   cpu->dvec[0], cpu->dvec[1], cpu->dvec[2],
-			   gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] );
-
-			   fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", 
-			   cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2],
-			   gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] );
-			   fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end );
-
-			   exit (1);
-			   }
-			   }
-			   }
-
-			 */
-		}
-
-		int Estimate_Device_Matrix (reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace, 
-				list **lists, output_controls *out_control )
-		{
-			int *indices, *Htop;
-			list *far_nbrs = dev_lists + FAR_NBRS;
-			int max_sparse_entries = 0;
-			real t1, t2;
-
-			indices = (int *) scratch;
-			cuda_memset ( indices, 0, INT_SIZE * system->N, RES_SCRATCH );
-
-			t1 = Get_Time ();
-
-			Estimate_Sparse_Matrix_Entries <<<BLOCKS, BLOCK_SIZE>>>
-				( system->d_atoms, (control_params *)control->d_control, 
-				  (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, 
-				  *far_nbrs, system->N, indices );
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-			t2 = Get_Timing_Info ( t1 );
-
-			//fprintf (stderr, " Time to estimate sparse matrix entries --- > %f \n", t2 );
-
-			Htop = (int *) malloc (INT_SIZE * (system->N + 1));
-			memset (Htop, 0, INT_SIZE * (system->N + 1));
-			copy_host_device (Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-			for (int i = 0; i < system->N; i++) 
-			{
-				if (max_sparse_entries < Htop[i]) {
-					max_sparse_entries = Htop[i];
-				}    
-			}
+    int index = 0;
+    int count = 0;
+    int jicount = 0;
+    int end_index, gpu_index, gpu_end, k;
+    far_neighbor_data gpu, cpu;
+
+    /*
+       for (int i = 0; i < N ; i++ )
+       {
+       if (test[i] != start[i]) {
+       fprintf (stderr, "start index does not match \n");
+       exit (0);
+       }
+
+       if (test[i+1] != (end[i]) ){
+       fprintf (stderr, "end index does not match for atom %d (cpu: %d gpu: %d) \n", i, test[i+1], end[i]);
+       exit (0);
+       }
+       }
+     */
+
+
+    for (int i = 0; i < N; i++){
+        index = Start_Index (i, slist);
+        //fprintf (stderr, "GPU : Neighbors of atom --> %d (start: %d , end: %d )\n", i, start[i], end[i]);
+
+
+        for (int j = start[i]; j < end[i]; j++){
+            gpu = data[j];
+
+            if (i < data[j].nbr) continue;
+            /*
+               if (i < data[j].nbr) {
+            //fprintf (stderr, " atom %d and neighbor %d @ index %d\n", i, data[j].nbr, j);
+            int src = data[j].nbr;
+            int dest = i;
+            int x;
+
+
+            for (x = start[src]; x < end[src]; x++) {
+            if (data[x].nbr != dest) continue;
+
+            gpu = data[x];
+            cpu = data[j];
+
+            if (  (gpu.d != cpu.d) ||
+            (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+            (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+            fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, 
+            data[j].d, 
+            data[j].rel_box[0],
+            data[j].rel_box[1],
+            data[j].rel_box[2],
+            data[j].dvec[0], 
+            data[j].dvec[1], 
+            data[j].dvec[2] 
+            );
+            fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr,
+            data[x].d,
+            data[x].rel_box[0],
+            data[x].rel_box[1],
+            data[x].rel_box[2],
+            data[x].dvec[0],
+            data[x].dvec[1],
+            data[x].dvec[2]
+            );
+            jicount++;
+            }
+            break;
+            }
+
+            if (x >= end[src]) {
+            fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
+            exit (0);
+            }
+
+            continue;
+            }
+             */
+
+            cpu = slist->select.far_nbr_list[index];
+            //if ( (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ){
+            //if ( (gpu->d != cpu->d) ){
+            if (  (gpu.nbr != cpu.nbr) || (gpu.d != cpu.d) ||
+                    (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+                    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+                //if ( (gpu.dvec[0] != i) || (gpu.dvec[1] != i) ||(gpu.dvec[2] != i) ||
+                //        (gpu.rel_box[0] != i) || (gpu.rel_box[1] != i) ||(gpu.rel_box[2] != i) ) {
+                //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE - RVEC_SIZE - INT_SIZE )){
+
+                fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d ) (%d %d %d) \n", i, start[i], end[i], j, gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
+                fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, slist), End_Index (i, slist), index);
+
+                /*
+                   fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
+                   fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
+                   fprintf (stdout, "d %f ,  %f \n", slist->select.far_nbr_list[index].d, data[j].d);
+                   fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", 
+                   cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
+                   gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
+
+                   fprintf (stdout, "ivec (%d %d %d) (%d %d %d) \n", 
+                   cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
+                   gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
+
+                 */
+                count ++;
+            }
+
+            //fprintf (stderr, "GPU (neighbor %d , d %d )\n", gpu->nbr, gpu->d);
+            index ++;
+            }
+
+            if (index != End_Index (i, slist))
+            {
+                fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, slist), End_Index(i, slist),
+                        start[i], end[i]);
+                exit (10);
+            }
+            }
+
+            fprintf (stderr, "Far neighbors MATCH between CPU and GPU -->%d  reverse %d \n", count, jicount);
+
+            /*
+               for (int i = 0; i < N; i++) 
+               {
+               index = Start_Index (i, slist);
+               end_index = End_Index (i, slist);
+
+               gpu_index = start[i];
+               gpu_end = end[i];
+               for (int j = index; j < end_index; j++) 
+               {
+               far_neighbor_data *cpu = &slist->select.far_nbr_list[j];
+               far_neighbor_data *gpu;
+
+               for (k = gpu_index; k < gpu_end; k++) {
+               gpu = &data[k];
+               if (gpu->nbr == cpu->nbr) break;
+               }
+
+               if (k == gpu_end) { fprintf (stderr, " could not find neighbor for atom %d \n", i); exit (1); }
+
+               if ( (gpu->nbr != cpu->nbr) || (gpu->d != cpu->d) ||
+               ((cpu->dvec[0] || gpu->dvec[0]) || (cpu->dvec[1] || gpu->dvec[1]) || (cpu->dvec[2] || gpu->dvec[2])) ||
+               ((cpu->rel_box[0] || gpu->rel_box[0]) || (cpu->rel_box[1] || gpu->rel_box[1]) || (cpu->rel_box[2] || gpu->rel_box[2])) ) {
+
+               fprintf (stderr, "Far neighbors does not match atom: %d \n", i );
+               fprintf (stderr, "neighbor %d ,  %d \n",  cpu->nbr, gpu->nbr);
+               fprintf (stderr, "d %d ,  %d \n", cpu->d, gpu->d);
+               fprintf (stderr, "dvec (%f %f %f) (%f %f %f) \n", 
+               cpu->dvec[0], cpu->dvec[1], cpu->dvec[2],
+               gpu->dvec[0], gpu->dvec[1], gpu->dvec[2] );
+
+               fprintf (stderr, "ivec (%d %d %d) (%d %d %d) \n", 
+               cpu->rel_box[0], cpu->rel_box[1], cpu->rel_box[2],
+               gpu->rel_box[0], gpu->rel_box[1], gpu->rel_box[2] );
+               fprintf (stderr, "GPU start %d GPU End %d \n", gpu_index, gpu_end );
+
+               exit (1);
+               }
+               }
+               }
+
+             */
+        }
+
+        /* Returns the largest of the system->N values that Estimate_Sparse_Matrix_Entries
+         * leaves in the scratch buffer, multiplied by SAFE_ZONE.
+         * The kernel is launched over the device far-neighbor list (dev_lists + FAR_NBRS).
+         * workspace, lists and out_control are accepted but not used here. */
+        int Estimate_Device_Matrix (reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace, 
+                list **lists, output_controls *out_control )
+        {
+            int *indices, *Htop;
+            list *far_nbrs = dev_lists + FAR_NBRS;
+            int max_sparse_entries = 0;
+            real t1, t2;
+
+            indices = (int *) scratch;
+            cuda_memset ( indices, 0, INT_SIZE * system->N, RES_SCRATCH );
+
+            t1 = Get_Time ();
+            Estimate_Sparse_Matrix_Entries <<<BLOCKS, BLOCK_SIZE>>>
+                ( system->d_atoms, (control_params *)control->d_control, 
+                  (simulation_data *)data->d_simulation_data, (simulation_box *)system->d_box, 
+                  *far_nbrs, system->N, indices );
+            cudaThreadSynchronize ();
+            cudaCheckError ();
+            t2 = Get_Timing_Info ( t1 );
+
+            /* host copy of the estimates; only needed until the maximum is known */
+            Htop = (int *) malloc (INT_SIZE * (system->N + 1));
+            memset (Htop, 0, INT_SIZE * (system->N + 1));
+            copy_host_device (Htop, indices, system->N * INT_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+
+            for (int i = 0; i < system->N; i++) 
+                if (max_sparse_entries < Htop[i])
+                    max_sparse_entries = Htop[i];
+
+            free (Htop); /* previously leaked on every call */
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, " Max sparse entries for this run are ---> %d \n", max_sparse_entries );
+            fprintf (stderr, " Max sparse entries for this run are ---> %d \n", max_sparse_entries );
 #endif
 
-			return max_sparse_entries * SAFE_ZONE;
-			//return max_sparse_entries;
-		}
+            return max_sparse_entries * SAFE_ZONE;
+            //return max_sparse_entries;
+        }
 
-		void Allocate_Device_Matrix (reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace, 
-				list **lists, output_controls *out_control )
-		{
+        /* Stores the estimate from Estimate_Device_Matrix in system->max_sparse_matrix_entries
+         * and initializes dev_workspace->H with system->N times that many entries. */
+        void Allocate_Device_Matrix (reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace, 
+                list **lists, output_controls *out_control )
+        {
 
-			//Allocate space for the sparse Matrix entries here. 
-			system->max_sparse_matrix_entries = 
-				Estimate_Device_Matrix (system, control, data, workspace, lists, out_control );
-			dev_workspace->H.n = system->N ;
-			dev_workspace->H.m = system->N * system->max_sparse_matrix_entries;
-			Cuda_Init_Sparse_Matrix (&dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N );
+            system->max_sparse_matrix_entries = Estimate_Device_Matrix (system, control, data, workspace, lists, out_control );
+            dev_workspace->H.n = system->N ;
+            dev_workspace->H.m = system->N * system->max_sparse_matrix_entries;
+            Cuda_Init_Sparse_Matrix (&dev_workspace->H, system->max_sparse_matrix_entries * system->N, system->N );
 
 #ifdef __CUDA_MEM__
-			fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", 
-					system->max_sparse_matrix_entries * system->N * sizeof(sparse_matrix_entry) / (1024*1024) );
+            fprintf( stderr, "Device memory allocated: sparse matrix= %ld (MB)\n", 
+                    (long) (sizeof(sparse_matrix_entry) * system->max_sparse_matrix_entries * system->N / (1024*1024)) );
 #endif
-		}
+        }
 
-		void Cuda_Init_Lists( reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace, 
-				list **lists, output_controls *out_control )
-		{
-			int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
-			int *hb_top, *bond_top;
+        /* Builds the device-side lists used by the simulation: estimates, allocates and
+         * generates the far-neighbor list, sizes the sparse matrix, then allocates the
+         * hydrogen-bond list (only when control->hb_cut > 0) and the bond list from the
+         * estimates produced by Cuda_Estimate_Storage_Sizes.  Exits if Make_List fails. */
+        void Cuda_Init_Lists( reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace, 
+                list **lists, output_controls *out_control )
+        {
+            int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
+            int *hb_top, *bond_top;
 
-			real t_start, t_elapsed;
 
-			grid *g = &( system->g );
-			int *d_indices = (int *) scratch;
-			int total = g->ncell[0] * g->ncell[1] * g->ncell[2];
+            grid *g = &( system->g );
+            int *d_indices = (int *) scratch;
 
-			cuda_memset ( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
+            cuda_memset ( d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
 
 #ifdef __BUILD_DEBUG__
-			for (int i = 0; i < g->max_nbrs; i ++) {
-				if ((g->nbrs[i][0] >= g->ncell[0]) ||
-						(g->nbrs[i][1] >= g->ncell[1]) ||
-						(g->nbrs[i][2] >= g->ncell[2]) ) {
-					fprintf (stderr, " Grid Incorrectly built.... \n");
-					exit (1);
-				}
-
-			}
+            for (int i = 0; i < g->max_nbrs; i ++) {
+                if ((g->nbrs[i][0] >= g->ncell[0]) ||
+                        (g->nbrs[i][1] >= g->ncell[1]) ||
+                        (g->nbrs[i][2] >= g->ncell[2]) ) {
+                    fprintf (stderr, " Grid Incorrectly built.... \n");
+                    exit (1);
+                }
+            }
 #endif
 
-			dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
-			dim3 threadsperblock (system->g.max_atoms);
+            dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
+            dim3 threadsperblock (system->g.max_atoms);
 
 #ifdef __BUILD_DEBUG__
-			fprintf (stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
-			fprintf (stderr, "Estimate Num  Neighbors with threads per block as %d \n", system->d_g.max_atoms);
-			fprintf (stderr, "Max nbrs %d \n", system->d_g.max_nbrs);
+            fprintf (stderr, "Blocks per grid (%d %d %d)\n", system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
+            fprintf (stderr, "Estimate Num  Neighbors with threads per block as %d \n", system->d_g.max_atoms);
+            fprintf (stderr, "Max nbrs %d \n", system->d_g.max_nbrs);
 #endif 
 
 
-			//First Bin atoms and they sync the host and the device for the grid.
-			//This will copy the atoms from host to device.
-			Cuda_Bin_Atoms (system, workspace);
-			Sync_Host_Device (&system->g, &system->d_g, cudaMemcpyHostToDevice );
+            //First Bin atoms and then sync the host and the device for the grid.
+            //This will copy the atoms from host to device.
+            Cuda_Bin_Atoms (system, workspace);
+            Sync_Host_Device (&system->g, &system->d_g, cudaMemcpyHostToDevice );
 
-			Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
-				(system->d_atoms, system->d_g, system->d_box, 
-				 (control_params *)control->d_control, d_indices);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
+            Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
+                (system->d_atoms, system->d_g, system->d_box, 
+                 (control_params *)control->d_control, d_indices);
+            cudaThreadSynchronize ();
+            cudaCheckError ();
 
-			int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
-			memset (nbrs_indices , 0, INT_SIZE * (system->N + 1));
+            int *nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
+            memset (nbrs_indices , 0, INT_SIZE * (system->N + 1));
 
-			nbrs_indices [0] = 0;
-			copy_host_device (&nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
+            nbrs_indices [0] = 0;
+            copy_host_device (&nbrs_indices [1], d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
 
-			for (int i = 1; i <= system->N; i++)
-				nbrs_indices [i] += nbrs_indices [i-1];
+            for (int i = 1; i <= system->N; i++)
+                nbrs_indices [i] += nbrs_indices [i-1];
 
-			num_nbrs = nbrs_indices [system->N] ;
-			system->num_nbrs = num_nbrs;
+            num_nbrs = nbrs_indices [system->N] ;
+            system->num_nbrs = num_nbrs;
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]);
-			fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs);
+            fprintf (stderr, "Total neighbors %d \n", nbrs_indices[system->N]);
+            fprintf (stderr, "Corrected Total neighbors %d \n", num_nbrs);
 #endif
 
 
-			list *far_nbrs = (dev_lists + FAR_NBRS);
-			if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE) ) {
-				fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-				exit( INIT_ERR );
-			}
+            list *far_nbrs = (dev_lists + FAR_NBRS);
+            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, far_nbrs, TYP_DEVICE) ) {
+                fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+                exit( INIT_ERR );
+            }
 
 #ifdef __CUDA_MEM__
-			fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", 
-					num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
+            fprintf( stderr, "Device memory allocated: far_nbrs = %ld (MB)\n", 
+                    (long) (num_nbrs * sizeof(far_neighbor_data) / (1024*1024)) );
 #endif
 
-			copy_host_device (nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
-			copy_host_device (nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
-			Cuda_Generate_Neighbor_Lists (system, workspace, control, false);
+            copy_host_device (nbrs_indices, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
+            copy_host_device (nbrs_indices, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyHostToDevice, __LINE__ );
+            Cuda_Generate_Neighbor_Lists (system, workspace, control, false);
 
 #ifdef __BUILD_DEBUG__
 
-			int *end = (int *)malloc (sizeof (int) * system->N);
-			int *start = (int *) malloc (sizeof (int) * system->N );
+            int *end = (int *)malloc (sizeof (int) * system->N);
+            int *start = (int *) malloc (sizeof (int) * system->N );
 
-			copy_host_device (start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0);
-			copy_host_device (end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0);
+            copy_host_device (start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0);
+            copy_host_device (end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 0);
 
-			far_neighbor_data *far_data = (far_neighbor_data *) 
-				malloc (FAR_NEIGHBOR_SIZE * num_nbrs);
-			copy_host_device (far_data, far_nbrs->select.far_nbr_list, 
-					FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0);
+            far_neighbor_data *far_data = (far_neighbor_data *) 
+                malloc (FAR_NEIGHBOR_SIZE * num_nbrs);
+            copy_host_device (far_data, far_nbrs->select.far_nbr_list, 
+                    FAR_NEIGHBOR_SIZE * num_nbrs, cudaMemcpyDeviceToHost, 0);
 
-			compare_far_neighbors (nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N);
+            compare_far_neighbors (nbrs_indices, start, end, far_data, *lists + FAR_NBRS, system->N);
 
-			free (start);
-			free (end);
+            free (start);
+            free (end);
+            free (far_data); /* host copy of the neighbor list was previously leaked */
 #endif
 
-			int *output, size;
-			size = INT_SIZE * 2 * system->N + 2;
-			output = (int *) malloc (size);
-			Cuda_Estimate_Storage_Sizes (system, control, output);
+            int *output, size;
+            size = INT_SIZE * (2 * system->N + 2); /* 2 leading ints + hb_top[N] + bond_top[N] */
+            output = (int *) malloc (size);
+            Cuda_Estimate_Storage_Sizes (system, control, output);
 
-			Htop = output[0];
-			num_3body  = output[1];
-			hb_top = &output[ 2 ]; 
-			bond_top = &output[ 2 + system->N ];
+            Htop = output[0];
+            num_3body  = output[1];
+            hb_top = &output[ 2 ]; 
+            bond_top = &output[ 2 + system->N ];
 
 #ifdef __DEBUG_CUDA__
-			int max_hbonds = 0;
-			int min_hbonds = 1000;
-			int max_bonds = 0;
-			int min_bonds = 1000;
-			for (int i = 0; i < system->N; i++) {
-				if ( max_hbonds < hb_top[i])
-					max_hbonds = hb_top[i];
-				if (min_hbonds > hb_top[i])
-					min_hbonds = hb_top[i];
-
-				if (max_bonds < bond_top [i])
-					max_bonds = bond_top[i];
-				if (min_bonds > bond_top[i])
-					min_bonds = bond_top[i];
-			}
-
-			fprintf (stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds );
-			fprintf (stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds );
-			fprintf (stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body );
+            int max_hbonds = 0;
+            int min_hbonds = 1000;
+            int max_bonds = 0;
+            int min_bonds = 1000;
+            for (int i = 0; i < system->N; i++) {
+                if ( max_hbonds < hb_top[i])
+                    max_hbonds = hb_top[i];
+                if (min_hbonds > hb_top[i])
+                    min_hbonds = hb_top[i];
+                if (max_bonds < bond_top [i])
+                    max_bonds = bond_top[i];
+                if (min_bonds > bond_top[i])
+                    min_bonds = bond_top[i];
+            }
+
+            fprintf (stderr, "Max Hbonds %d min Hbonds %d \n", max_hbonds, min_hbonds );
+            fprintf (stderr, "Max bonds %d min bonds %d \n", max_bonds, min_bonds );
+            fprintf (stderr, "Device HTop --> %d and num_3body --> %d \n", Htop, num_3body );
 #endif
 
-			Allocate_Device_Matrix (system, control, data, workspace, lists, out_control );
+            Allocate_Device_Matrix (system, control, data, workspace, lists, out_control );
 
-			dev_workspace->num_H = 0;
+            dev_workspace->num_H = 0;
 
-			if( control->hb_cut > 0 ) {
+            if( control->hb_cut > 0 ) {
 
-				int *hbond_index = (int *) malloc ( INT_SIZE * system->N );
-				// init H indexes 
-				num_hbonds = 0;
-				for( i = 0; i < system->N; ++i )
-					if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || 
-							system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2  ) // H atom
-						//hbond_index[i] = workspace->num_H++;
-						hbond_index[i] = num_hbonds ++;
-					else 
-						hbond_index[i] = -1;
+                int *hbond_index = (int *) malloc ( INT_SIZE * system->N );
+                // init H indexes 
+                num_hbonds = 0;
+                for( i = 0; i < system->N; ++i )
+                    if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 || 
+                            system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 2  ) // H atom
+                        hbond_index[i] = num_hbonds ++;
+                    else 
+                        hbond_index[i] = -1;
 
-				copy_host_device (hbond_index, dev_workspace->hbond_index, 
-						system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX );
-				dev_workspace->num_H = num_hbonds;
+                copy_host_device (hbond_index, dev_workspace->hbond_index, 
+                        system->N * INT_SIZE, cudaMemcpyHostToDevice, RES_STORAGE_HBOND_INDEX );
+                dev_workspace->num_H = num_hbonds;
 
 #ifdef __DEBUG_CUDA__
-				fprintf (stderr, "Device num_H --> %d \n", dev_workspace->num_H );
+                fprintf (stderr, "Device num_H --> %d \n", dev_workspace->num_H );
 #endif
 
-				Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index, 
-						hb_top, (dev_lists+HBONDS) );
-				num_hbonds = hb_top[system->N-1];
-				system->num_hbonds = num_hbonds;
+                Cuda_Allocate_HBond_List( system->N, dev_workspace->num_H, dev_workspace->hbond_index, 
+                        hb_top, (dev_lists+HBONDS) );
+                num_hbonds = hb_top[system->N-1];
+                system->num_hbonds = num_hbonds;
 
 #ifdef __CUDA_MEM__
-				fprintf (stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", 
-						sizeof (hbond_data) * num_hbonds / (1024*1024));
+                fprintf (stderr, "Device memory allocated: Hydrogen Bonds list: %ld (MB) \n", 
+                        (long) (sizeof (hbond_data) * num_hbonds / (1024*1024)));
 #endif
 
 #ifdef __DEBUG_CUDA__
-				fprintf (stderr, "Device Total number of HBonds --> %d \n", num_hbonds );
+                fprintf (stderr, "Device Total number of HBonds --> %d \n", num_hbonds );
 #endif
 
-				free (hbond_index);
-			}
+                free (hbond_index);
+            }
 
-			// bonds list 
-			Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS );
-			num_bonds = bond_top[system->N-1];
-			system->num_bonds = num_bonds;
+            // bonds list 
+            Cuda_Allocate_Bond_List( system->N, bond_top, dev_lists+BONDS );
+            num_bonds = bond_top[system->N-1];
+            system->num_bonds = num_bonds;
 
 #ifdef __CUDA_MEM__
-			fprintf (stderr, "Device memory allocated: Bonds list: %ld (MB) \n", 
-					sizeof (bond_data) * num_bonds / (1024*1024));
+            fprintf (stderr, "Device memory allocated: Bonds list: %ld (MB) \n", 
+                    (long) (sizeof (bond_data) * num_bonds / (1024*1024)));
 #endif
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, "Device Total Bonds --> %d \n", num_bonds );
+            fprintf (stderr, "Device Total Bonds --> %d \n", num_bonds );
 #endif
 
-			//	system->max_thb_intrs = num_3body;
-			// 3bodies list 
-			//if(!Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
-			//  fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-			//  exit( INIT_ERR );
-			//}
+            //    system->max_thb_intrs = num_3body;
+            // 3bodies list 
+            //if(!Make_List(num_bonds, num_bonds * MAX_THREE_BODIES, TYP_THREE_BODY, dev_lists + THREE_BODIES, TYP_DEVICE)) {
+            //  fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+            //  exit( INIT_ERR );
+            //}
 
-			//fprintf( stderr, "***memory allocated: three_body = %ldMB\n", 
-			//   num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) );
-			//fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data));
+            //fprintf( stderr, "***memory allocated: three_body = %ldMB\n", 
+            //   num_bonds * MAX_THREE_BODIES *sizeof(three_body_interaction_data) / (1024*1024) );
+            //fprintf (stderr, "size of (three_body_interaction_data) : %d \n", sizeof (three_body_interaction_data));
 
-			//Free local resources
-			free (output);
-			free (nbrs_indices);
-		}
+            //Free local resources
+            free (output);
+            free (nbrs_indices);
+        }
 
 
-		void Init_Lists( reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace, 
-				list **lists, output_controls *out_control )
-		{
-			int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
-			int *hb_top, *bond_top;
+        void Init_Lists( reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace, 
+                list **lists, output_controls *out_control )
+        {
+            int i, num_nbrs, num_hbonds, num_bonds, num_3body, Htop;
+            int *hb_top, *bond_top;
 
-			real t_start, t_elapsed;
+            real t_start, t_elapsed;
 
-			num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
+            num_nbrs = Estimate_NumNeighbors( system, control, workspace, lists );
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
+            fprintf (stderr, "Serial NumNeighbors ---> %d \n", num_nbrs);
 #endif
 
-			if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) {
-				fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
-				exit( INIT_ERR );
-			}
+            if( !Make_List(system->N, num_nbrs, TYP_FAR_NEIGHBOR, (*lists)+FAR_NBRS) ) {
+                fprintf(stderr, "Problem in initializing far nbrs list. Terminating!\n");
+                exit( INIT_ERR );
+            }
 #if defined(DEBUG_FOCUS)
-			fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
-					num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
+            fprintf( stderr, "memory allocated: far_nbrs = %ldMB\n", 
+                    num_nbrs * sizeof(far_neighbor_data) / (1024*1024) );
 #endif
 
-			t_start = Get_Time ();
-			Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control);
-			t_elapsed = Get_Timing_Info ( t_start );
+            t_start = Get_Time ();
+            Generate_Neighbor_Lists(system,control,data,workspace,lists,out_control);
+            t_elapsed = Get_Timing_Info ( t_start );
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed );
+            fprintf (stderr, " Timing Generate Neighbors %lf \n", t_elapsed );
 #endif
 
-			Htop = 0;
-			hb_top = (int*) calloc( system->N, sizeof(int) );
-			bond_top = (int*) calloc( system->N, sizeof(int) );
-			num_3body = 0;
-			Estimate_Storage_Sizes( system, control, lists, 
-					&Htop, hb_top, bond_top, &num_3body );
+            Htop = 0;
+            hb_top = (int*) calloc( system->N, sizeof(int) );
+            bond_top = (int*) calloc( system->N, sizeof(int) );
+            num_3body = 0;
+            Estimate_Storage_Sizes( system, control, lists, 
+                    &Htop, hb_top, bond_top, &num_3body );
 
-			Allocate_Matrix( &(workspace->H), system->N, Htop );
+            Allocate_Matrix( &(workspace->H), system->N, Htop );
 #if defined(DEBUG_FOCUS)
-			fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
-			fprintf( stderr, "memory allocated: H = %ldMB\n", 
-					Htop * sizeof(sparse_matrix_entry) / (1024*1024) );
+            fprintf( stderr, "estimated storage - Htop: %d\n", Htop );
+            fprintf( stderr, "memory allocated: H = %ldMB\n", 
+                    Htop * sizeof(sparse_matrix_entry) / (1024*1024) );
 #endif
 
-			workspace->num_H = 0;
-			if( control->hb_cut > 0 ) {
-				/* init H indexes */
-				for( i = 0; i < system->N; ++i )
-					if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
-						workspace->hbond_index[i] = workspace->num_H++;
-					else workspace->hbond_index[i] = -1;
+            workspace->num_H = 0;
+            if( control->hb_cut > 0 ) {
+                /* init H indexes */
+                for( i = 0; i < system->N; ++i )
+                    if( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 ) // H atom
+                        workspace->hbond_index[i] = workspace->num_H++;
+                    else workspace->hbond_index[i] = -1;
 
-				Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, 
-						hb_top, (*lists)+HBONDS );
-				num_hbonds = hb_top[system->N-1];
+                Allocate_HBond_List( system->N, workspace->num_H, workspace->hbond_index, 
+                        hb_top, (*lists)+HBONDS );
+                num_hbonds = hb_top[system->N-1];
 
 #ifdef __DEBUG_CUDA__
-				fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds );
+                fprintf( stderr, "Serial num_hbonds: %d\n", num_hbonds );
 #endif
 
 #if defined(DEBUG_FOCUS)
-				fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
-				fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
-						num_hbonds * sizeof(hbond_data) / (1024*1024) );
+                fprintf( stderr, "estimated storage - num_hbonds: %d\n", num_hbonds );
+                fprintf( stderr, "memory allocated: hbonds = %ldMB\n", 
+                        num_hbonds * sizeof(hbond_data) / (1024*1024) );
 #endif
-			}
+            }
 
-			/* bonds list */
-			Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
-			num_bonds = bond_top[system->N-1];
+            /* bonds list */
+            Allocate_Bond_List( system->N, bond_top, (*lists)+BONDS );
+            num_bonds = bond_top[system->N-1];
 #if defined(DEBUG_FOCUS)
-			fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
-			fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
-					num_bonds * sizeof(bond_data) / (1024*1024) );
+            fprintf( stderr, "estimated storage - num_bonds: %d\n", num_bonds );
+            fprintf( stderr, "memory allocated: bonds = %ldMB\n", 
+                    num_bonds * sizeof(bond_data) / (1024*1024) );
 #endif
 
 #ifdef __DEBUG_CUDA__
-			fprintf (stderr, " host num_3body : %d \n", num_3body);
-			fprintf (stderr, " host num_bonds : %d \n", num_bonds);
+            fprintf (stderr, " host num_3body : %d \n", num_3body);
+            fprintf (stderr, " host num_bonds : %d \n", num_bonds);
 #endif
 
-			/* 3bodies list */
-			if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) {
-				fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
-				exit( INIT_ERR );
-			}
+            /* 3bodies list */
+            if(!Make_List(num_bonds, num_3body, TYP_THREE_BODY, (*lists)+THREE_BODIES)) {
+                fprintf( stderr, "Problem in initializing angles list. Terminating!\n" );
+                exit( INIT_ERR );
+            }
 #if defined(DEBUG_FOCUS)
-			fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
-			fprintf( stderr, "memory allocated: 3-body = %ldMB\n", 
-					num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
+            fprintf( stderr, "estimated storage - num_3body: %d\n", num_3body );
+            fprintf( stderr, "memory allocated: 3-body = %ldMB\n", 
+                    num_3body * sizeof(three_body_interaction_data) / (1024*1024) );
 #endif
 #ifdef TEST_FORCES
-			if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) {
-				fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
-				exit( INIT_ERR );
-			}
-
-			if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
-				fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
-				exit( INIT_ERR );
-			}
+            if(!Make_List( system->N, num_bonds * 8, TYP_DDELTA, (*lists) + DDELTA )) {
+                fprintf( stderr, "Problem in initializing dDelta list. Terminating!\n" );
+                exit( INIT_ERR );
+            }
+
+            if( !Make_List( num_bonds, num_bonds*MAX_BONDS*3, TYP_DBO, (*lists)+DBO ) ) {
+                fprintf( stderr, "Problem in initializing dBO list. Terminating!\n" );
+                exit( INIT_ERR );
+            }
 #endif
 
-			free( hb_top );
-			free( bond_top );
-		}
-
-
-		void Init_Out_Controls(reax_system *system, control_params *control, 
-				static_storage *workspace, output_controls *out_control)
-		{
-			char temp[1000];
-
-			/* Init trajectory file */
-			if( out_control->write_steps > 0 ) { 
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".trj" );
-				out_control->trj = fopen( temp, "w" );
-				out_control->write_header( system, control, workspace, out_control );
-			}
-
-			if( out_control->energy_update_freq > 0 ) {
-				/* Init out file */
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".out" );
-				out_control->out = fopen( temp, "w" );
-				fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n",
-						"step", "total energy", "poten. energy", "kin. energy", 
-						"temp.", "target", "volume", "press.", "target" );
-				fflush( out_control->out );
-
-				/* Init potentials file */
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".pot" );
-				out_control->pot = fopen( temp, "w" );
-				fprintf( out_control->pot, 
-						"%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-						"step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", 
-						"etor", "econj", "evdw","ecoul", "epol" );
-				fflush( out_control->pot );
-
-				/* Init log file */
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".log" );
-				out_control->log = fopen( temp, "w" );
-				fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", 
-						"step", "total", "neighbors", "init", "bonded", 
-						"nonbonded", "QEq", "matvec" );
-			}
-
-			/* Init pressure file */
-			if( control->ensemble == NPT || 
-					control->ensemble == iNPT || 
-					control->ensemble == sNPT ) {
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".prs" );
-				out_control->prs = fopen( temp, "w" );
-				fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n",
-						"step", "norm_x", "norm_y", "norm_z", 
-						"press_x", "press_y", "press_z", "target_p", "volume" );
-				fflush( out_control->prs );
-			}
-
-			/* Init molecular analysis file */
-			if( control->molec_anal ) {
-				sprintf( temp, "%s.mol", control->sim_name );
-				out_control->mol = fopen( temp, "w" );
-				if( control->num_ignored ) {
-					sprintf( temp, "%s.ign", control->sim_name );
-					out_control->ign = fopen( temp, "w" );
-				} 
-			}
-
-			/* Init electric dipole moment analysis file */
-			if( control->dipole_anal ) {
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".dpl" );
-				out_control->dpl = fopen( temp, "w" );
-				fprintf( out_control->dpl, 
-						"Step      Molecule Count  Avg. Dipole Moment Norm\n" );
-				fflush( out_control->dpl );
-			}
-
-			/* Init diffusion coef analysis file */
-			if( control->diffusion_coef ) {
-				strcpy( temp, control->sim_name );
-				strcat( temp, ".drft" );
-				out_control->drft = fopen( temp, "w" );
-				fprintf( out_control->drft, "Step     Type Count   Avg Squared Disp\n" );
-				fflush( out_control->drft );
-			}
+            free( hb_top );
+            free( bond_top );
+        }
+
+
+        void Init_Out_Controls(reax_system *system, control_params *control, 
+                static_storage *workspace, output_controls *out_control)
+        {
+            char temp[1000];
+
+            /* Init trajectory file */
+            if( out_control->write_steps > 0 ) { 
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".trj" );
+                out_control->trj = fopen( temp, "w" );
+                out_control->write_header( system, control, workspace, out_control );
+            }
+
+            if( out_control->energy_update_freq > 0 ) {
+                /* Init out file */
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".out" );
+                out_control->out = fopen( temp, "w" );
+                fprintf( out_control->out, "%-6s%16s%16s%16s%11s%11s%13s%13s%13s\n",
+                        "step", "total energy", "poten. energy", "kin. energy", 
+                        "temp.", "target", "volume", "press.", "target" );
+                fflush( out_control->out );
+
+                /* Init potentials file */
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".pot" );
+                out_control->pot = fopen( temp, "w" );
+                fprintf( out_control->pot, 
+                        "%-6s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s%13s\n",
+                        "step", "ebond", "eatom", "elp", "eang", "ecoa", "ehb", 
+                        "etor", "econj", "evdw","ecoul", "epol" );
+                fflush( out_control->pot );
+
+                /* Init log file */
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".log" );
+                out_control->log = fopen( temp, "w" );
+                fprintf( out_control->log, "%-6s%10s%10s%10s%10s%10s%10s%10s\n", 
+                        "step", "total", "neighbors", "init", "bonded", 
+                        "nonbonded", "QEq", "matvec" );
+            }
+
+            /* Init pressure file */
+            if( control->ensemble == NPT || 
+                    control->ensemble == iNPT || 
+                    control->ensemble == sNPT ) {
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".prs" );
+                out_control->prs = fopen( temp, "w" );
+                fprintf( out_control->prs, "%-6s%13s%13s%13s%13s%13s%13s%13s%13s\n",
+                        "step", "norm_x", "norm_y", "norm_z", 
+                        "press_x", "press_y", "press_z", "target_p", "volume" );
+                fflush( out_control->prs );
+            }
+
+            /* Init molecular analysis file */
+            if( control->molec_anal ) {
+                sprintf( temp, "%s.mol", control->sim_name );
+                out_control->mol = fopen( temp, "w" );
+                if( control->num_ignored ) {
+                    sprintf( temp, "%s.ign", control->sim_name );
+                    out_control->ign = fopen( temp, "w" );
+                } 
+            }
+
+            /* Init electric dipole moment analysis file */
+            if( control->dipole_anal ) {
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".dpl" );
+                out_control->dpl = fopen( temp, "w" );
+                fprintf( out_control->dpl, 
+                        "Step      Molecule Count  Avg. Dipole Moment Norm\n" );
+                fflush( out_control->dpl );
+            }
+
+            /* Init diffusion coef analysis file */
+            if( control->diffusion_coef ) {
+                strcpy( temp, control->sim_name );
+                strcat( temp, ".drft" );
+                out_control->drft = fopen( temp, "w" );
+                fprintf( out_control->drft, "Step     Type Count   Avg Squared Disp\n" );
+                fflush( out_control->drft );
+            }
 
 
 #ifdef TEST_ENERGY
-			/* open bond energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".ebond" );
-			out_control->ebond = fopen( temp, "w" );
-
-			/* open lone-pair energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".elp" );
-			out_control->elp = fopen( temp, "w" );
-
-			/* open overcoordination energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".eov" );
-			out_control->eov = fopen( temp, "w" );
-
-			/* open undercoordination energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".eun" );
-			out_control->eun = fopen( temp, "w" );
-
-			/* open angle energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".eval" );
-			out_control->eval = fopen( temp, "w" );
-
-			/* open penalty energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".epen" );
-			out_control->epen = fopen( temp, "w" );
-
-			/* open coalition energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".ecoa" );
-			out_control->ecoa = fopen( temp, "w" );
-
-			/* open hydrogen bond energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".ehb" );
-			out_control->ehb = fopen( temp, "w" );
-
-			/* open torsion energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".etor" );
-			out_control->etor = fopen( temp, "w" );
-
-			/* open conjugation energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".econ" );
-			out_control->econ = fopen( temp, "w" );
-
-			/* open vdWaals energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".evdw" );
-			out_control->evdw = fopen( temp, "w" );
-
-			/* open coulomb energy file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".ecou" );
-			out_control->ecou = fopen( temp, "w" );
+            /* open bond energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".ebond" );
+            out_control->ebond = fopen( temp, "w" );
+
+            /* open lone-pair energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".elp" );
+            out_control->elp = fopen( temp, "w" );
+
+            /* open overcoordination energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".eov" );
+            out_control->eov = fopen( temp, "w" );
+
+            /* open undercoordination energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".eun" );
+            out_control->eun = fopen( temp, "w" );
+
+            /* open angle energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".eval" );
+            out_control->eval = fopen( temp, "w" );
+
+            /* open penalty energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".epen" );
+            out_control->epen = fopen( temp, "w" );
+
+            /* open coalition energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".ecoa" );
+            out_control->ecoa = fopen( temp, "w" );
+
+            /* open hydrogen bond energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".ehb" );
+            out_control->ehb = fopen( temp, "w" );
+
+            /* open torsion energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".etor" );
+            out_control->etor = fopen( temp, "w" );
+
+            /* open conjugation energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".econ" );
+            out_control->econ = fopen( temp, "w" );
+
+            /* open vdWaals energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".evdw" );
+            out_control->evdw = fopen( temp, "w" );
+
+            /* open coulomb energy file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".ecou" );
+            out_control->ecou = fopen( temp, "w" );
 #endif
 
 
 #ifdef TEST_FORCES
-			/* open bond orders file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".fbo" );
-			out_control->fbo = fopen( temp, "w" );
-
-			/* open bond orders derivatives file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".fdbo" );
-			out_control->fdbo = fopen( temp, "w" );
-
-			/* open bond forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".fbond" );
-			out_control->fbond = fopen( temp, "w" );
-
-			/* open lone-pair forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".flp" );
-			out_control->flp = fopen( temp, "w" );
-
-			/* open overcoordination forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".fatom" );
-			out_control->fatom = fopen( temp, "w" );
-
-			/* open angle forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".f3body" );
-			out_control->f3body = fopen( temp, "w" );
-
-			/* open hydrogen bond forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".fhb" );
-			out_control->fhb = fopen( temp, "w" );
-
-			/* open torsion forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".f4body" );
-			out_control->f4body = fopen( temp, "w" );
-
-			/* open nonbonded forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".fnonb" );
-			out_control->fnonb = fopen( temp, "w" );
-
-			/* open total force file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".ftot" );
-			out_control->ftot = fopen( temp, "w" );
-
-			/* open coulomb forces file */
-			strcpy( temp, control->sim_name );
-			strcat( temp, ".ftot2" );
-			out_control->ftot2 = fopen( temp, "w" );
+            /* open bond orders file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".fbo" );
+            out_control->fbo = fopen( temp, "w" );
+
+            /* open bond orders derivatives file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".fdbo" );
+            out_control->fdbo = fopen( temp, "w" );
+
+            /* open bond forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".fbond" );
+            out_control->fbond = fopen( temp, "w" );
+
+            /* open lone-pair forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".flp" );
+            out_control->flp = fopen( temp, "w" );
+
+            /* open overcoordination forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".fatom" );
+            out_control->fatom = fopen( temp, "w" );
+
+            /* open angle forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".f3body" );
+            out_control->f3body = fopen( temp, "w" );
+
+            /* open hydrogen bond forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".fhb" );
+            out_control->fhb = fopen( temp, "w" );
+
+            /* open torsion forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".f4body" );
+            out_control->f4body = fopen( temp, "w" );
+
+            /* open nonbonded forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".fnonb" );
+            out_control->fnonb = fopen( temp, "w" );
+
+            /* open total force file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".ftot" );
+            out_control->ftot = fopen( temp, "w" );
+
+            /* open coulomb forces file */
+            strcpy( temp, control->sim_name );
+            strcat( temp, ".ftot2" );
+            out_control->ftot2 = fopen( temp, "w" );
 #endif
 
 
-			/* Error handling */
-			/* if ( out_control->out == NULL || out_control->pot == NULL || 
-			   out_control->log == NULL || out_control->mol == NULL || 
-			   out_control->dpl == NULL || out_control->drft == NULL ||       
-			   out_control->pdb == NULL )
-			   {
-			   fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
-			   exit( CANNOT_OPEN_OUTFILE );
-			   }*/
-		}
+            /* Error handling */
+            /* if ( out_control->out == NULL || out_control->pot == NULL || 
+               out_control->log == NULL || out_control->mol == NULL || 
+               out_control->dpl == NULL || out_control->drft == NULL ||       
+               out_control->pdb == NULL )
+               {
+               fprintf( stderr, "FILE OPEN ERROR. TERMINATING..." );
+               exit( CANNOT_OPEN_OUTFILE );
+               }*/
+        }
 
 
-		void Initialize(reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace, list **lists, 
-				output_controls *out_control, evolve_function *Evolve)
-		{
-			Randomize();
+        void Initialize(reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace, list **lists, 
+                output_controls *out_control, evolve_function *Evolve)
+        {
+            Randomize();
 
-			Init_System( system, control, data );
+            Init_System( system, control, data );
 
-			Init_Simulation_Data( system, control, data, out_control, Evolve );
+            Init_Simulation_Data( system, control, data, out_control, Evolve );
 
-			Init_Workspace( system, control, workspace );
+            Init_Workspace( system, control, workspace );
 
-			Init_Lists( system, control, data, workspace, lists, out_control );
+            Init_Lists( system, control, data, workspace, lists, out_control );
 
-			Init_Out_Controls( system, control, workspace, out_control );
+            Init_Out_Controls( system, control, workspace, out_control );
 
-			/* These are done in forces.c, only forces.c can see all those functions */
-			Init_Bonded_Force_Functions( control );
+            /* These are done in forces.c, only forces.c can see all those functions */
+            Init_Bonded_Force_Functions( control );
 #ifdef TEST_FORCES
-			Init_Force_Test_Functions( );
+            Init_Force_Test_Functions( );
 #endif
 
-			if( control->tabulate )
-				Make_LR_Lookup_Table( system, control );
+            if( control->tabulate )
+                Make_LR_Lookup_Table( system, control );
 
 #if defined(DEBUG_FOCUS)
-			fprintf( stderr, "data structures have been initialized...\n" ); 
+            fprintf( stderr, "data structures have been initialized...\n" ); 
 #endif
-		}
+        }
 
-		void Cuda_Initialize(reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace, list **lists, 
-				output_controls *out_control, evolve_function *Evolve)
-		{
-			Randomize ();
+        void Cuda_Initialize(reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace, list **lists, 
+                output_controls *out_control, evolve_function *Evolve)
+        {
+            Randomize ();
 
-			Cuda_Init_Scratch ();
+            Cuda_Init_Scratch ();
 
-			//System
-			Cuda_Init_System (system);
-			Sync_Host_Device ( system, cudaMemcpyHostToDevice );
-			Cuda_Init_System (system, control, data );
+            //System
+            Cuda_Init_System (system);
+            Sync_Host_Device ( system, cudaMemcpyHostToDevice );
+            Cuda_Init_System (system, control, data );
 
-			//Simulation Data
-			copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
-					cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
-			Cuda_Init_Simulation_Data (data);
-			//Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
-			Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
-			Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+            //Simulation Data
+            copy_host_device (system->atoms, system->d_atoms, REAX_ATOM_SIZE * system->N , 
+                    cudaMemcpyHostToDevice, RES_SYSTEM_ATOMS );
+            Cuda_Init_Simulation_Data (data);
+            //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+            Cuda_Init_Simulation_Data( system, control, data, out_control, Evolve );
+            Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
 
-			//static storage
-			Cuda_Init_Workspace_System ( system, dev_workspace );
-			Cuda_Init_Workspace ( system, control, dev_workspace );
-			Cuda_Init_Workspace_Device (workspace);
+            //static storage
+            Cuda_Init_Workspace_System ( system, dev_workspace );
+            Cuda_Init_Workspace ( system, control, dev_workspace );
+            Cuda_Init_Workspace_Device (workspace);
 
-			//control
-			Cuda_Init_Control (control);
+            //control
+            Cuda_Init_Control (control);
 
-			//Grid
-			Cuda_Init_Grid (&system->g, &system->d_g );
+            //Grid
+            Cuda_Init_Grid (&system->g, &system->d_g );
 
-			//lists
-			Cuda_Init_Lists (system, control, data, workspace, lists, out_control );
+            //lists
+            Cuda_Init_Lists (system, control, data, workspace, lists, out_control );
 
-			Init_Out_Controls( system, control, workspace, out_control );
+            Init_Out_Controls( system, control, workspace, out_control );
 
-			if( control->tabulate ) {
-				real start, end;
-				start = Get_Time ();
-				Make_LR_Lookup_Table( system, control );
-				copy_LR_table_to_device (system, control );
-				end = Get_Timing_Info ( start );
+            if( control->tabulate ) {
+                real start, end;
+                start = Get_Time ();
+                Make_LR_Lookup_Table( system, control );
+                copy_LR_table_to_device (system, control );
+                end = Get_Timing_Info ( start );
 
 #ifdef __DEBUG_CUDA__
-				fprintf (stderr, "Done copying the LR table to the device ---> %f \n", end );
+                fprintf (stderr, "Done copying the LR table to the device ---> %f \n", end );
 #endif
-			}
-		}
+            }
+        }
diff --git a/PuReMD-GPU/src/integrate.cu b/PuReMD-GPU/src/integrate.cu
index 5d56d622..d0790286 100644
--- a/PuReMD-GPU/src/integrate.cu
+++ b/PuReMD-GPU/src/integrate.cu
@@ -38,49 +38,49 @@
 
 
 void Velocity_Verlet_NVE(reax_system* system, control_params* control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, steps, renbr;
-	real inv_m, dt, dt_sqr;
-	rvec dx;
-
-	dt = control->dt;
-	dt_sqr = SQR(dt);
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
+    int i, steps, renbr;
+    real inv_m, dt, dt_sqr;
+    rvec dx;
+
+    dt = control->dt;
+    dt_sqr = SQR(dt);
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "step%d: ", data->step );
+    fprintf( stderr, "step%d: ", data->step );
 #endif
 
-	for( i = 0; i < system->N; i++ ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+    for( i = 0; i < system->N; i++ ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-		rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-				0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
-		Inc_on_T3( system->atoms[i].x, dx, &( system->box ) );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+        Inc_on_T3( system->atoms[i].x, dx, &( system->box ) );
 
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-	}
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+    }
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet1 - ");
+    fprintf( stderr, "verlet1 - ");
 #endif
 
-	Reallocate( system, workspace, lists, renbr );
-	Reset( system, control, data, workspace, lists );
-	if( renbr )
-		Generate_Neighbor_Lists( system, control, data, workspace, 
-				lists, out_control );  
-	Compute_Forces( system, control, data, workspace, lists, out_control );
-
-	for( i = 0; i < system->N; i++ ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-	}
+    Reallocate( system, workspace, lists, renbr );
+    Reset( system, control, data, workspace, lists );
+    if( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace, 
+                lists, out_control );  
+    Compute_Forces( system, control, data, workspace, lists, out_control );
+
+    for( i = 0; i < system->N; i++ ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+    }
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet2\n");
+    fprintf( stderr, "verlet2\n");
 #endif
 }
 
@@ -89,209 +89,209 @@ void Velocity_Verlet_NVE(reax_system* system, control_params* control,
 ///////////////////////////////////////////////////////////////////
 
 GLOBAL void Cuda_Velocity_Verlet_NVE_atoms1 (reax_atom *atoms, 
-		single_body_parameters *sbp, 
-		simulation_box *box,
-		int N, real dt)
+        single_body_parameters *sbp, 
+        simulation_box *box,
+        int N, real dt)
 {
-	real inv_m, dt_sqr;
-	rvec dx;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	dt_sqr = SQR(dt);
-	//for( i = 0; i < system->N; i++ ) {
-	inv_m = 1.0 / sbp[atoms[i].type].mass;
-
-	rvec_ScaledSum( dx, dt, atoms[i].v, 
-			0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f );
-	Inc_on_T3( atoms[i].x, dx, box );
-
-	rvec_ScaledAdd( atoms[i].v, 
-			0.5 * dt * -F_CONV * inv_m, atoms[i].f );
-	//}
+    real inv_m, dt_sqr;
+    rvec dx;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    dt_sqr = SQR(dt);
+    //for( i = 0; i < system->N; i++ ) {
+    inv_m = 1.0 / sbp[atoms[i].type].mass;
+
+    rvec_ScaledSum( dx, dt, atoms[i].v, 
+            0.5 * dt_sqr * -F_CONV * inv_m, atoms[i].f );
+    Inc_on_T3( atoms[i].x, dx, box );
+
+    rvec_ScaledAdd( atoms[i].v, 
+            0.5 * dt * -F_CONV * inv_m, atoms[i].f );
+    //}
 }
 
 GLOBAL void Cuda_Velocity_Verlet_NVE_atoms2 (reax_atom *atoms, single_body_parameters *sbp, int N, real dt)
 {
-	real inv_m;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	//for( i = 0; i < system->N; i++ ) {
-	inv_m = 1.0 / sbp[atoms[i].type].mass;
-	rvec_ScaledAdd( atoms[i].v, 
-			0.5 * dt * -F_CONV * inv_m, atoms[i].f );
-	//}
+    real inv_m;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    //for( i = 0; i < system->N; i++ ) {
+    inv_m = 1.0 / sbp[atoms[i].type].mass;
+    rvec_ScaledAdd( atoms[i].v, 
+            0.5 * dt * -F_CONV * inv_m, atoms[i].f );
+    //}
 }
 
 void Cuda_Velocity_Verlet_NVE(reax_system* system, control_params* control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, steps, renbr;
-	real inv_m, dt, dt_sqr;
-	rvec dx;
-	int blocks, block_size;
+    int i, steps, renbr;
+    real inv_m, dt, dt_sqr;
+    rvec dx;
+    int blocks, block_size;
 
-	dt = control->dt;
-	dt_sqr = SQR(dt);
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
+    dt = control->dt;
+    dt_sqr = SQR(dt);
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "step%d: ", data->step );
+    fprintf( stderr, "step%d: ", data->step );
 #endif
 
-	compute_blocks (&blocks, &block_size, system->N);
-	Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>>
-		(system->d_atoms, system->reaxprm.d_sbp, 
-		 (simulation_box *)system->d_box, system->N, dt);
-	cudaThreadSynchronize ();
+    compute_blocks (&blocks, &block_size, system->N);
+    Cuda_Velocity_Verlet_NVE_atoms1 <<<blocks, block_size>>>
+        (system->d_atoms, system->reaxprm.d_sbp, 
+         (simulation_box *)system->d_box, system->N, dt);
+    cudaThreadSynchronize ();
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet1 - ");
+    fprintf( stderr, "verlet1 - ");
 #endif
 
-	Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
-	Cuda_Reset( system, control, data, workspace, lists );
+    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
+    Cuda_Reset( system, control, data, workspace, lists );
 
-	if( renbr ) {
-		Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true);
-	}
+    if( renbr ) {
+        Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true);
+    }
 
-	Cuda_Compute_Forces( system, control, data, workspace, lists, out_control );
+    Cuda_Compute_Forces( system, control, data, workspace, lists, out_control );
 
-	Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>>
-		(system->d_atoms, system->reaxprm.d_sbp, system->N, dt);
-	cudaThreadSynchronize ();
+    Cuda_Velocity_Verlet_NVE_atoms2<<<blocks, block_size>>>
+        (system->d_atoms, system->reaxprm.d_sbp, system->N, dt);
+    cudaThreadSynchronize ();
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet2\n");
+    fprintf( stderr, "verlet2\n");
 #endif
 }
 
 void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
-		control_params* control, 
-		simulation_data *data, 
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params* control, 
+        simulation_data *data, 
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i, itr, steps, renbr;
-	real inv_m, coef_v, dt, dt_sqr;
-	real E_kin_new, G_xi_new, v_xi_new, v_xi_old;
-	rvec dx;
-	thermostat *therm;
-
-	dt = control->dt;
-	dt_sqr = SQR( dt );
-	therm = &( data->therm );
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
+    int i, itr, steps, renbr;
+    real inv_m, coef_v, dt, dt_sqr;
+    real E_kin_new, G_xi_new, v_xi_new, v_xi_old;
+    rvec dx;
+    thermostat *therm;
+
+    dt = control->dt;
+    dt_sqr = SQR( dt );
+    therm = &( data->therm );
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "step%d: ", data->step );
+    fprintf( stderr, "step%d: ", data->step );
 #endif
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " Entering Velocity_Verlet_Nose_Hoover_NVT_Klein:  coef to update velocity --> %6.10f\n", therm->v_xi_old);
+    fprintf (stderr, " Entering Velocity_Verlet_Nose_Hoover_NVT_Klein:  coef to update velocity --> %6.10f\n", therm->v_xi_old);
 #endif
 
-	/* Compute x(t + dt) and copy old forces */
-	for (i=0; i < system->N; i++) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+    /* Compute x(t + dt) and copy old forces */
+    for (i=0; i < system->N; i++) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-		rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v,
-				0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
+        rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, system->atoms[i].v,
+                0.5 * dt_sqr * inv_m * -F_CONV, system->atoms[i].f );
 
-		Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
+        Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
 
-		rvec_Copy( workspace->f_old[i], system->atoms[i].f );
-	}
-	/* Compute xi(t + dt) */
-	therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+        rvec_Copy( workspace->f_old[i], system->atoms[i].f );
+    }
+    /* Compute xi(t + dt) */
+    therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "verlet1 - " );
+    fprintf( stderr, "verlet1 - " );
 #endif
 
-	Reallocate( system, workspace, lists, renbr );
-	Reset( system, control, data, workspace, lists );
+    Reallocate( system, workspace, lists, renbr );
+    Reset( system, control, data, workspace, lists );
 
-	if( renbr )
-		Generate_Neighbor_Lists( system, control, data, workspace, 
-				lists, out_control );
+    if( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace, 
+                lists, out_control );
 
-	/* Calculate Forces at time (t + dt) */
-	Compute_Forces( system,control,data, workspace, lists, out_control );
+    /* Calculate Forces at time (t + dt) */
+    Compute_Forces( system,control,data, workspace, lists, out_control );
 
-	/* Compute iteration constants for each atom's velocity */
-	for( i = 0; i < system->N; ++i ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+    /* Compute iteration constants for each atom's velocity */
+    for( i = 0; i < system->N; ++i ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
 
-		rvec_Scale( workspace->v_const[i], 
-				1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v );
-		rvec_ScaledAdd( workspace->v_const[i], 
-				0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
-		rvec_ScaledAdd( workspace->v_const[i], 
-				0.5 * dt * inv_m * -F_CONV, system->atoms[i].f );
+        rvec_Scale( workspace->v_const[i], 
+                1.0 - 0.5 * dt * therm->v_xi, system->atoms[i].v );
+        rvec_ScaledAdd( workspace->v_const[i], 
+                0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
+        rvec_ScaledAdd( workspace->v_const[i], 
+                0.5 * dt * inv_m * -F_CONV, system->atoms[i].f );
 #if defined(DEBUG)
-		fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n", 
-				i, inv_m, 1.0 - 0.5 * dt * therm->v_xi, 
-				0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0], 
-				workspace->v_const[i][1], workspace->v_const[i][2] );  
+        fprintf( stderr, "atom%d: inv_m=%f, C1=%f, C2=%f, v_const=%f %f %f\n", 
+                i, inv_m, 1.0 - 0.5 * dt * therm->v_xi, 
+                0.5 * dt * inv_m * -F_CONV, workspace->v_const[i][0], 
+                workspace->v_const[i][1], workspace->v_const[i][2] );  
 #endif
-	}
+    }
 
 
-	v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
-	E_kin_new = G_xi_new = v_xi_old = 0;
-	itr = 0;
-	do {
-		itr++;      
+    v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+    E_kin_new = G_xi_new = v_xi_old = 0;
+    itr = 0;
+    do {
+        itr++;      
 
-		/* new values become old in this iteration */
-		v_xi_old = v_xi_new;
-		coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-		E_kin_new = 0;
+        /* new values become old in this iteration */
+        v_xi_old = v_xi_new;
+        coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+        E_kin_new = 0;
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, " *********** coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
-		//print_sys_atoms (system);
+        fprintf (stderr, " *********** coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
+        //print_sys_atoms (system);
 #endif
 
-		for( i = 0; i < system->N; ++i ) {
-			rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
+        for( i = 0; i < system->N; ++i ) {
+            rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
 
-			E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * 
-					rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+            E_kin_new += ( 0.5*system->reaxprm.sbp[system->atoms[i].type].mass * 
+                    rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
 #if defined(DEBUG)
-			fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n", 
-					itr, i, coef_v, v_xi_old );
+            fprintf( stderr, "itr%d-atom%d: coef_v = %f, v_xi_old = %f\n", 
+                    itr, i, coef_v, v_xi_old );
 #endif
-		}
+        }
 
-		G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
-				data->N_f * K_B * control->T );
-		v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
+                data->N_f * K_B * control->T );
+        v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
 #if defined(DEBUG)
-		fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
-				itr, G_xi_new, v_xi_new, v_xi_old );
+        fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
+                itr, G_xi_new, v_xi_new, v_xi_old );
 #endif
-	}
-	while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
+    }
+    while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
+    fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
 #endif
 
 
 #ifndef __BUILD_DEBUG__
-	therm->v_xi_old = therm->v_xi;
-	therm->v_xi = v_xi_new;
-	therm->G_xi = G_xi_new;  
+    therm->v_xi_old = therm->v_xi;
+    therm->v_xi = v_xi_new;
+    therm->G_xi = G_xi_new;  
 #endif 
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr,"vel scale\n" );
+    fprintf( stderr,"vel scale\n" );
 #endif 
 }
 
@@ -303,200 +303,200 @@ void Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
 ///////////////////////////////////////////////////////////////////
 
 GLOBAL void Compute_X_t_dt (real dt, real dt_sqr, thermostat p_therm,
-		reax_atom *atoms, single_body_parameters *sbp, 
-		simulation_box *box,
-		static_storage p_workspace, int N)
+        reax_atom *atoms, single_body_parameters *sbp, 
+        simulation_box *box,
+        static_storage p_workspace, int N)
 {
 
-	real inv_m;
-	rvec dx;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real inv_m;
+    rvec dx;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (i >= N) return;
+    if (i >= N) return;
 
-	static_storage *workspace = &p_workspace;
-	thermostat *therm = &p_therm;
+    static_storage *workspace = &p_workspace;
+    thermostat *therm = &p_therm;
 
-	/* Compute x(t + dt) and copy old forces */
-	//for (i=0; i < system->N; i++) {
-	inv_m = 1.0 / sbp[atoms[i].type].mass;
+    /* Compute x(t + dt) and copy old forces */
+    //for (i=0; i < system->N; i++) {
+    inv_m = 1.0 / sbp[atoms[i].type].mass;
 
-	rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v,
-			0.5 * dt_sqr * inv_m * -F_CONV, atoms[i].f );
+    rvec_ScaledSum( dx, dt - 0.5 * dt_sqr * therm->v_xi, atoms[i].v,
+            0.5 * dt_sqr * inv_m * -F_CONV, atoms[i].f );
 
-	Inc_on_T3( atoms[i].x, dx, box );
+    Inc_on_T3( atoms[i].x, dx, box );
 
-	rvec_Copy( workspace->f_old[i], atoms[i].f );
-	//}
+    rvec_Copy( workspace->f_old[i], atoms[i].f );
+    //}
 
 }
 
 GLOBAL void Update_Velocity (reax_atom *atoms, single_body_parameters *sbp, 
-		static_storage p_workspace, real dt, thermostat p_therm, 
-		int N)
+        static_storage p_workspace, real dt, thermostat p_therm, 
+        int N)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	real inv_m;
-	static_storage *workspace = &p_workspace;
-	thermostat *therm = &p_therm;
-
-	//for( i = 0; i < system->N; ++i ) {
-	inv_m = 1.0 / sbp[atoms[i].type].mass;
-
-	rvec_Scale( workspace->v_const[i], 
-			1.0 - 0.5 * dt * therm->v_xi, atoms[i].v );
-	rvec_ScaledAdd( workspace->v_const[i], 
-			0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
-	rvec_ScaledAdd( workspace->v_const[i], 
-			0.5 * dt * inv_m * -F_CONV, atoms[i].f );
-	//}
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    real inv_m;
+    static_storage *workspace = &p_workspace;
+    thermostat *therm = &p_therm;
+
+    //for( i = 0; i < system->N; ++i ) {
+    inv_m = 1.0 / sbp[atoms[i].type].mass;
+
+    rvec_Scale( workspace->v_const[i], 
+            1.0 - 0.5 * dt * therm->v_xi, atoms[i].v );
+    rvec_ScaledAdd( workspace->v_const[i], 
+            0.5 * dt * inv_m * -F_CONV, workspace->f_old[i] );
+    rvec_ScaledAdd( workspace->v_const[i], 
+            0.5 * dt * inv_m * -F_CONV, atoms[i].f );
+    //}
 }
 
 GLOBAL void E_Kin_Reduction (reax_atom *atoms, static_storage p_workspace,
-		single_body_parameters *sbp, 
-		real *per_block_results, real coef_v, const size_t n)
+        single_body_parameters *sbp, 
+        real *per_block_results, real coef_v, const size_t n)
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-	static_storage *workspace = &p_workspace;
-
-	if(i < n)
-	{
-		rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] );
-		x = ( 0.5 * sbp[atoms[i].type].mass * 
-				rvec_Dot( atoms[i].v, atoms[i].v ) );
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{   
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}   
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sdata[0];
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+    static_storage *workspace = &p_workspace;
+
+    if(i < n)
+    {
+        rvec_Scale( atoms[i].v, coef_v, workspace->v_const[i] );
+        x = ( 0.5 * sbp[atoms[i].type].mass * 
+                rvec_Dot( atoms[i].v, atoms[i].v ) );
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {   
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }   
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 }
 
 
 void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system, 
-		control_params* control, 
-		simulation_data *data, 
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params* control, 
+        simulation_data *data, 
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i, itr, steps, renbr;
-	real inv_m, coef_v, dt, dt_sqr;
-	real E_kin_new, G_xi_new, v_xi_new, v_xi_old;
-	rvec dx;
-	thermostat *therm;
+    int i, itr, steps, renbr;
+    real inv_m, coef_v, dt, dt_sqr;
+    real E_kin_new, G_xi_new, v_xi_new, v_xi_old;
+    rvec dx;
+    thermostat *therm;
 
-	real *results = (real *)scratch;
+    real *results = (real *)scratch;
 
-	dt = control->dt;
-	dt_sqr = SQR( dt );
-	therm = &( data->therm );
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
+    dt = control->dt;
+    dt_sqr = SQR( dt );
+    therm = &( data->therm );
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old);
+    fprintf (stderr, " Device: Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein --> coef to update velocity --> %6.10f\n", therm->v_xi_old);
 #endif
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "step%d: ", data->step );
+    fprintf( stderr, "step%d: ", data->step );
 #endif
 
-	Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>>
-		(dt, dt_sqr, data->therm, system->d_atoms, 
-		 system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Compute_X_t_dt <<< BLOCKS, BLOCK_SIZE >>>
+        (dt, dt_sqr, data->therm, system->d_atoms, 
+         system->reaxprm.d_sbp, system->d_box, *dev_workspace, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	/* Compute xi(t + dt) */
-	therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
+    /* Compute xi(t + dt) */
+    therm->xi += ( therm->v_xi * dt + 0.5 * dt_sqr * therm->G_xi );
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "verlet1 - " );
+    fprintf( stderr, "verlet1 - " );
 #endif
 
-	Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
-	Cuda_Reset( system, control, data, workspace, lists );
+    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
+    Cuda_Reset( system, control, data, workspace, lists );
 
-	if( renbr ) {
-		//generate_neighbor_lists here
-		Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true);
-	}
+    if( renbr ) {
+        //generate_neighbor_lists here
+        Cuda_Generate_Neighbor_Lists (system, dev_workspace, control, true);
+    }
 
-	/* Calculate Forces at time (t + dt) */
-	Cuda_Compute_Forces( system,control,data, workspace, lists, out_control );
+    /* Calculate Forces at time (t + dt) */
+    Cuda_Compute_Forces( system,control,data, workspace, lists, out_control );
 
-	/* Compute iteration constants for each atom's velocity */
-	Update_Velocity <<< BLOCKS, BLOCK_SIZE >>>
-		(system->d_atoms, system->reaxprm.d_sbp, *dev_workspace,
-		 dt, *therm, system->N );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    /* Compute iteration constants for each atom's velocity */
+    Update_Velocity <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, system->reaxprm.d_sbp, *dev_workspace,
+         dt, *therm, system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
 
-	v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
-	E_kin_new = G_xi_new = v_xi_old = 0;
-	itr = 0;
-	do {
-		itr++;      
+    v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+    E_kin_new = G_xi_new = v_xi_old = 0;
+    itr = 0;
+    do {
+        itr++;      
 
-		/* new values become old in this iteration */
-		v_xi_old = v_xi_new;
-		coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-		E_kin_new = 0;
+        /* new values become old in this iteration */
+        v_xi_old = v_xi_new;
+        coef_v = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+        E_kin_new = 0;
 
-		/*reduction for the E_Kin_new here*/
+        /*reduction for the E_Kin_new here*/
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
+        fprintf (stderr, " Device: coef to update velocity --> %6.10f, %6.10f, %6.10f\n", coef_v, dt, therm->v_xi_old);
 #endif
-		cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH );
-		E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
-			(system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, 
-			 results, coef_v, system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
-			(results, results + BLOCKS_POW_2, BLOCKS_POW_2);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); 
-
-		G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
-				data->N_f * K_B * control->T );
-		v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+        cuda_memset (results, 0, 2 * BLOCK_SIZE * REAL_SIZE, RES_SCRATCH );
+        E_Kin_Reduction <<< BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>>
+            (system->d_atoms, *dev_workspace, system->reaxprm.d_sbp, 
+             results, coef_v, system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+            (results, results + BLOCKS_POW_2, BLOCKS_POW_2);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        copy_host_device (&E_kin_new, results + BLOCKS_POW_2, REAL_SIZE, cudaMemcpyDeviceToHost, RES_SCRATCH ); 
+
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin_new - 
+                data->N_f * K_B * control->T );
+        v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
 #if defined(DEBUG)
-		fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
-				itr, G_xi_new, v_xi_new, v_xi_old );
+        fprintf( stderr, "itr%d: G_xi_new = %f, v_xi_new = %f, v_xi_old = %f\n",
+                itr, G_xi_new, v_xi_new, v_xi_old );
 #endif
-	}
-	while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
+    }
+    while( fabs(v_xi_new - v_xi_old ) > 1e-5 );
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
+    fprintf (stderr, " Iteration Count in NVE --> %d \n", itr );
 #endif
 
-	therm->v_xi_old = therm->v_xi;
-	therm->v_xi = v_xi_new;
-	therm->G_xi = G_xi_new;  
+    therm->v_xi_old = therm->v_xi;
+    therm->v_xi = v_xi_new;
+    therm->G_xi = G_xi_new;  
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr,"vel scale\n" );
+    fprintf( stderr,"vel scale\n" );
 #endif 
 }
 
@@ -509,109 +509,109 @@ void Cuda_Velocity_Verlet_Nose_Hoover_NVT_Klein(reax_system* system,
    All box dimensions are scaled by the same amount, 
    there is no change in the angles between axes. */
 void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system, 
-		control_params* control, 
-		simulation_data *data,
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params* control, 
+        simulation_data *data,
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i, steps, renbr;
-	real inv_m, dt, lambda, mu;
-	rvec dx;
+    int i, steps, renbr;
+    real inv_m, dt, lambda, mu;
+    rvec dx;
 
-	dt = control->dt;
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)
-	//fprintf( out_control->prs, 
-	//         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
-	//control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
-	fprintf( stderr, "step %d: ", data->step );
+    //fprintf( out_control->prs, 
+    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
+    //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
+    fprintf( stderr, "step %d: ", data->step );
 #endif
 
-	/* velocity verlet, 1st part */
-	for( i = 0; i < system->N; i++ ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-		/* Compute x(t + dt) */
-		rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-				0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
-		Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-		/* Compute v(t + dt/2) */
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
-		/*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
-		  workspace->orig_id[i], 
-		  system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-		  0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
-		  0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
-		  0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
-	}
+    /* velocity verlet, 1st part */
+    for( i = 0; i < system->N; i++ ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+        /* Compute x(t + dt) */
+        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+                0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
+        Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
+        /* Compute v(t + dt/2) */
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
+        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
+          workspace->orig_id[i], 
+          system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
+    }
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet1 - " );
+    fprintf( stderr, "verlet1 - " );
 #endif
 
-	Reallocate( system, workspace, lists, renbr );  
-	Reset( system, control, data, workspace, lists );
-	if( renbr ) {
-		Update_Grid( system );
-		Generate_Neighbor_Lists( system, control, data, workspace,
-				lists, out_control );
-	}
-	Compute_Forces( system, control, data, workspace, lists, out_control );
-
-	/* velocity verlet, 2nd part */
-	for( i = 0; i < system->N; i++ ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-		/* Compute v(t + dt) */
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-		/* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
-		   workspace->orig_id[i], 
-		   system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-		   0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
-		   0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
-		   0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
-	}
-	//Compute_Kinetic_Energy( system, data );   
-	Compute_Pressure_Isotropic( system, control, data, out_control );
+    Reallocate( system, workspace, lists, renbr );  
+    Reset( system, control, data, workspace, lists );
+    if( renbr ) {
+        Update_Grid( system );
+        Generate_Neighbor_Lists( system, control, data, workspace,
+                lists, out_control );
+    }
+    Compute_Forces( system, control, data, workspace, lists, out_control );
+
+    /* velocity verlet, 2nd part */
+    for( i = 0; i < system->N; i++ ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+        /* Compute v(t + dt) */
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
+           workspace->orig_id[i], 
+           system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
+    }
+    //Compute_Kinetic_Energy( system, data );   
+    Compute_Pressure_Isotropic( system, control, data, out_control );
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet2 - " );
+    fprintf( stderr, "verlet2 - " );
 #endif
 
-	/* pressure scaler */
-	mu = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]),
-			1.0 / 3 );
-	if( mu < MIN_dV ) 
-		mu = MIN_dV;
-	else if( mu > MAX_dV )
-		mu = MAX_dV;
-
-	/* temperature scaler */
-	lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-	if( lambda < MIN_dT )
-		lambda = MIN_dT;
-	else if (lambda > MAX_dT )
-		lambda = MAX_dT;
-	lambda = SQRT( lambda );
-
-	/* Scale velocities and positions at t+dt */
-	for( i = 0; i < system->N; ++i ) {
-		rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
-		/* IMPORTANT: What Adri does with scaling positions first to 
-		   unit coordinates and then back to cartesian coordinates essentially 
-		   is scaling the coordinates with mu^2. However, this causes unphysical 
-		   modifications on the system because box dimensions
-		   are being scaled with mu! We need to discuss this with Adri! */
-		rvec_Scale( system->atoms[i].x, mu, system->atoms[i].x );
-	}
-	//Compute_Kinetic_Energy( system, data );
+    /* pressure scaler */
+    mu = POW( 1.0 + (dt / control->Tau_P[0]) * (data->iso_bar.P - control->P[0]),
+            1.0 / 3 );
+    if( mu < MIN_dV ) 
+        mu = MIN_dV;
+    else if( mu > MAX_dV )
+        mu = MAX_dV;
+
+    /* temperature scaler */
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if( lambda < MIN_dT )
+        lambda = MIN_dT;
+    else if (lambda > MAX_dT )
+        lambda = MAX_dT;
+    lambda = SQRT( lambda );
+
+    /* Scale velocities and positions at t+dt */
+    for( i = 0; i < system->N; ++i ) {
+        rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
+        /* IMPORTANT: What Adri does with scaling positions first to 
+           unit coordinates and then back to cartesian coordinates essentially 
+           is scaling the coordinates with mu^2. However, this causes unphysical 
+           modifications on the system because box dimensions
+           are being scaled with mu! We need to discuss this with Adri! */
+        rvec_Scale( system->atoms[i].x, mu, system->atoms[i].x );
+    }
+    //Compute_Kinetic_Energy( system, data );
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "scaling - " );
+    fprintf( stderr, "scaling - " );
 #endif
 
-	Update_Box_Isotropic( &(system->box), mu );
+    Update_Box_Isotropic( &(system->box), mu );
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "updated box\n" );
+    fprintf( stderr, "updated box\n" );
 #endif
 }
 
@@ -620,112 +620,112 @@ void Velocity_Verlet_Berendsen_Isotropic_NPT( reax_system* system,
    All box dimensions are scaled by the same amount, 
    there is no change in the angles between axes. */
 void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system, 
-		control_params* control, 
-		simulation_data *data,
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params* control, 
+        simulation_data *data,
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i, d, steps, renbr;
-	real dt, inv_m, lambda;
-	rvec dx, mu;
+    int i, d, steps, renbr;
+    real dt, inv_m, lambda;
+    rvec dx, mu;
 
-	dt = control->dt;
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
 #if defined(DEBUG_FOCUS)
-	//fprintf( out_control->prs, 
-	//         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
-	//control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
-	fprintf( stderr, "step %d: ", data->step );
+    //fprintf( out_control->prs, 
+    //         "tau_t: %g  tau_p: %g  dt/tau_t: %g  dt/tau_p: %g\n", 
+    //control->Tau_T, control->Tau_P, dt / control->Tau_T, dt / control->Tau_P );
+    fprintf( stderr, "step %d: ", data->step );
 #endif
 
-	/* velocity verlet, 1st part */
-	for( i = 0; i < system->N; i++ ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; 
-		/* Compute x(t + dt) */
-		rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-				0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
-		Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-		/* Compute v(t + dt/2) */
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
-		/*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
-		  workspace->orig_id[i], 
-		  system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
-		  0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
-		  0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
-		  0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
-	}
+    /* velocity verlet, 1st part */
+    for( i = 0; i < system->N; i++ ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass; 
+        /* Compute x(t + dt) */
+        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+                0.5 * -F_CONV * inv_m * SQR(dt), system->atoms[i].f );
+        Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
+        /* Compute v(t + dt/2) */
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * -F_CONV * inv_m * dt, system->atoms[i].f );
+        /*fprintf( stderr, "%6d   %15.8f %15.8f %15.8f   %15.8f %15.8f %15.8f\n", 
+          workspace->orig_id[i], 
+          system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2],
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[0], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[1], 
+          0.5 * SQR(dt) * -F_CONV * inv_m * system->atoms[i].f[2] ); */
+    }
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "verlet1 - " );
+    fprintf( stderr, "verlet1 - " );
 #endif
 
-	Reallocate( system, workspace, lists, renbr ); 
-	Reset( system, control, data, workspace, lists );
-	if( renbr ) {
-		Update_Grid( system );
-		Generate_Neighbor_Lists( system, control, data, workspace, 
-				lists, out_control );
-	}
-	Compute_Forces( system, control, data, workspace, lists, out_control );
-
-	/* velocity verlet, 2nd part */
-	for( i = 0; i < system->N; i++ ) {
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-		/* Compute v(t + dt) */
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-		/* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
-		   workspace->orig_id[i], 
-		   system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-		   0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
-		   0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
-		   0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
-	}
-	//Compute_Kinetic_Energy( system, data );   
-	Compute_Pressure_Isotropic( system, control, data, out_control );
+    Reallocate( system, workspace, lists, renbr ); 
+    Reset( system, control, data, workspace, lists );
+    if( renbr ) {
+        Update_Grid( system );
+        Generate_Neighbor_Lists( system, control, data, workspace, 
+                lists, out_control );
+    }
+    Compute_Forces( system, control, data, workspace, lists, out_control );
+
+    /* velocity verlet, 2nd part */
+    for( i = 0; i < system->N; i++ ) {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+        /* Compute v(t + dt) */
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        /* fprintf( stderr, "%6d   %15f %15f %15f   %15.8f %15.8f %15.8f\n", 
+           workspace->orig_id[i], 
+           system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[0], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[1], 
+           0.5 * dt * -F_CONV * inv_m * system->atoms[i].f[2] );*/
+    }
+    //Compute_Kinetic_Energy( system, data );   
+    Compute_Pressure_Isotropic( system, control, data, out_control );
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "verlet2 - " );
+    fprintf( stderr, "verlet2 - " );
 #endif
 
-	/* pressure scaler */
-	for( d = 0; d < 3; ++d ){
-		mu[d] = POW( 1.0+(dt/control->Tau_P[d])*(data->tot_press[d]-control->P[d]),
-				1.0 / 3 );
-		if( mu[d] < MIN_dV ) 
-			mu[d] = MIN_dV;
-		else if( mu[d] > MAX_dV )
-			mu[d] = MAX_dV;
-	}
-
-	/* temperature scaler */
-	lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-	if( lambda < MIN_dT )
-		lambda = MIN_dT;
-	else if (lambda > MAX_dT )
-		lambda = MAX_dT;
-	lambda = SQRT( lambda );
-
-	/* Scale velocities and positions at t+dt */
-	for( i = 0; i < system->N; ++i ) {
-		rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
-		/* IMPORTANT: What Adri does with scaling positions first to 
-		   unit coordinates and then back to cartesian coordinates essentially 
-		   is scaling the coordinates with mu^2. However, this causes unphysical 
-		   modifications on the system because box dimensions
-		   are being scaled with mu! We need to discuss this with Adri! */
-		for( d = 0; d < 3; ++d )
-			system->atoms[i].x[d] = system->atoms[i].x[d] * mu[d];
-	}
-	//Compute_Kinetic_Energy( system, data );
+    /* pressure scaler */
+    for( d = 0; d < 3; ++d ){
+        mu[d] = POW( 1.0+(dt/control->Tau_P[d])*(data->tot_press[d]-control->P[d]),
+                1.0 / 3 );
+        if( mu[d] < MIN_dV ) 
+            mu[d] = MIN_dV;
+        else if( mu[d] > MAX_dV )
+            mu[d] = MAX_dV;
+    }
+
+    /* temperature scaler */
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if( lambda < MIN_dT )
+        lambda = MIN_dT;
+    else if (lambda > MAX_dT )
+        lambda = MAX_dT;
+    lambda = SQRT( lambda );
+
+    /* Scale velocities and positions at t+dt */
+    for( i = 0; i < system->N; ++i ) {
+        rvec_Scale( system->atoms[i].v, lambda, system->atoms[i].v );
+        /* IMPORTANT: What Adri does with scaling positions first to 
+           unit coordinates and then back to cartesian coordinates essentially 
+           is scaling the coordinates with mu^2. However, this causes unphysical 
+           modifications on the system because box dimensions
+           are being scaled with mu! We need to discuss this with Adri! */
+        for( d = 0; d < 3; ++d )
+            system->atoms[i].x[d] = system->atoms[i].x[d] * mu[d];
+    }
+    //Compute_Kinetic_Energy( system, data );
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "scaling - " );
+    fprintf( stderr, "scaling - " );
 #endif
 
-	Update_Box_SemiIsotropic( &(system->box), mu );
+    Update_Box_SemiIsotropic( &(system->box), mu );
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "updated box & grid\n" );
+    fprintf( stderr, "updated box & grid\n" );
 #endif
 }
 
@@ -741,243 +741,243 @@ void Velocity_Verlet_Berendsen_SemiIsotropic_NPT( reax_system* system,
 #ifdef ANISOTROPIC
 
 void Velocity_Verlet_Nose_Hoover_NVT(reax_system* system, 
-		control_params* control, 
-		simulation_data *data,
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params* control, 
+        simulation_data *data,
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i;
-	real inv_m;
-	real dt = control->dt;
-	real dt_sqr = SQR(dt);
-	rvec dx;
-
-	for (i=0; i < system->N; i++)
-	{
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-
-		// Compute x(t + dt)
-		rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-				0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
-		Inc_on_T3_Gen( system->atoms[i].x, dx, &(system->box) );
-
-		// Compute v(t + dt/2)
-		rvec_ScaledAdd( system->atoms[i].v, 
-				-0.5 * dt * data->therm.xi, system->atoms[i].v );
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-	}
-
-	// Compute zeta(t + dt/2), E_Kininetic(t + dt/2)
-	// IMPORTANT: What will be the initial value of zeta? and what is g?
-	data->therm.xi += 0.5 * dt * control->Tau_T  * 
-		( 2.0 * data->E_Kin - data->N_f * K_B * control->T );
-
-	Reset( system, control, data, workspace );
-	fprintf(out_control->log,"reset-"); fflush( out_control->log );
-
-	Generate_Neighbor_Lists( system, control, data, workspace, 
-			lists, out_control );
-	fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
-
-	/* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
-	   fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
-
-	Compute_Forces( system, control, data, workspace, lists, out_control );
-	fprintf(out_control->log,"forces\n"); fflush( out_control->log );
-
-	//Compute_Kinetic_Energy( system, data );
-
-	for( i = 0; i < system->N; i++ )
-	{
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-
-		// compute v(t + dt)
-		rvec_ScaledAdd( system->atoms[i].v, 
-				-0.5 * dt * data->therm.xi, system->atoms[i].v );
-		rvec_ScaledAdd( system->atoms[i].v, 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-	}
-
-	// Compute zeta(t + dt)
-	data->therm.xi += 0.5*dt * control->Tau_T  * ( 2.0 * data->E_Kin - 
-			data->N_f * K_B * control->T );
-
-	fprintf( out_control->log,"Xi: %8.3f %8.3f %8.3f\n", 
-			data->therm.xi, data->E_Kin, data->N_f * K_B * control->T ); 
-	fflush( out_control->log );
+    int i;
+    real inv_m;
+    real dt = control->dt;
+    real dt_sqr = SQR(dt);
+    rvec dx;
+
+    for (i=0; i < system->N; i++)
+    {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+
+        // Compute x(t + dt)
+        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+                0.5 * dt_sqr * -F_CONV * inv_m, system->atoms[i].f );
+        Inc_on_T3_Gen( system->atoms[i].x, dx, &(system->box) );
+
+        // Compute v(t + dt/2)
+        rvec_ScaledAdd( system->atoms[i].v, 
+                -0.5 * dt * data->therm.xi, system->atoms[i].v );
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+    }
+
+    // Compute zeta(t + dt/2), E_Kinetic(t + dt/2)
+    // IMPORTANT: What will be the initial value of zeta? and what is g?
+    data->therm.xi += 0.5 * dt * control->Tau_T  * 
+        ( 2.0 * data->E_Kin - data->N_f * K_B * control->T );
+
+    Reset( system, control, data, workspace );
+    fprintf(out_control->log,"reset-"); fflush( out_control->log );
+
+    Generate_Neighbor_Lists( system, control, data, workspace, 
+            lists, out_control );
+    fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
+
+    /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
+       fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
+
+    Compute_Forces( system, control, data, workspace, lists, out_control );
+    fprintf(out_control->log,"forces\n"); fflush( out_control->log );
+
+    //Compute_Kinetic_Energy( system, data );
+
+    for( i = 0; i < system->N; i++ )
+    {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+
+        // compute v(t + dt)
+        rvec_ScaledAdd( system->atoms[i].v, 
+                -0.5 * dt * data->therm.xi, system->atoms[i].v );
+        rvec_ScaledAdd( system->atoms[i].v, 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+    }
+
+    // Compute zeta(t + dt)
+    data->therm.xi += 0.5*dt * control->Tau_T  * ( 2.0 * data->E_Kin - 
+            data->N_f * K_B * control->T );
+
+    fprintf( out_control->log,"Xi: %8.3f %8.3f %8.3f\n", 
+            data->therm.xi, data->E_Kin, data->N_f * K_B * control->T ); 
+    fflush( out_control->log );
 }
 
 
 
 void Velocity_Verlet_Isotropic_NPT( reax_system* system, 
-		control_params* control, 
-		simulation_data *data,
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params* control, 
+        simulation_data *data,
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i, itr;
-	real deps, v_eps_new=0, v_eps_old=0, G_xi_new;
-	real dxi, v_xi_new=0, v_xi_old=0, a_eps_new;
-	real inv_m, exp_deps, inv_3V;
-	real E_kin, P_int, P_int_const;
-	real coef_v, coef_v_eps;
-	real dt = control->dt;
-	real dt_sqr = SQR( dt );
-	thermostat *therm = &( data->therm );
-	isotropic_barostat *iso_bar = &( data->iso_bar );
-	simulation_box *box = &( system->box );
-	rvec dx, dv;
-
-	// Here we just calculate how much to increment eps, xi, v_eps, v_xi.
-	// Commits are done after positions and velocities of atoms are updated
-	// because position, velocity updates uses v_eps, v_xi terms; 
-	// yet we need EXP( deps ) to be able to calculate 
-	// positions and velocities accurately.  
-	iso_bar->a_eps = control->Tau_P * 
-		( 3.0 * box->volume * (iso_bar->P - control->P) + 
-		  6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi;
-	deps = dt * iso_bar->v_eps + 0.5 * dt_sqr * iso_bar->a_eps;
-	exp_deps = EXP( deps );
-
-	therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin + 
-			SQR( iso_bar->v_eps ) / control->Tau_P - 
-			(data->N_f +1) * K_B * control->T );
-	dxi = therm->v_xi * dt + 0.5 * therm->G_xi * dt_sqr;
-
-	fprintf(out_control->log, "a: %12.6f   eps: %12.6f   deps: %12.6f\n", 
-			iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
-	fprintf(out_control->log, "G: %12.6f   xi : %12.6f   dxi : %12.6f\n", 
-			therm->G_xi, therm->v_xi, therm->xi );
-
-	// Update positions and velocities
-	// NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values 
-	// in the iteration not the old values at time t or before!
-	for (i=0; i < system->N; i++)
-	{
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-
-		// Compute x(t + dt)
-		rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f, 
-				-( (2.0 + 3.0/data->N_f) * iso_bar->v_eps + therm->v_xi ),
-				system->atoms[i].v );
-		rvec_ScaledSum( dx, dt, system->atoms[i].v, 
-				0.5 * dt_sqr, workspace->a[i] );
-		Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
-		rvec_Scale( system->atoms[i].x, exp_deps, system->atoms[i].x );
-	}
-
-	// Commit updates
-	therm->xi += dxi;
-	iso_bar->eps += deps;
-	//Update_Box_Isotropic( EXP( 3.0 * iso_bar->eps ), &(system->box) );
-	Update_Box_Isotropic( &(system->box), EXP( 3.0 * iso_bar->eps ) );
-
-
-	// Calculate new forces, f(t + dt)
-	Reset( system, control, data, workspace );
-	fprintf(out_control->log,"reset-"); fflush( out_control->log );
-
-	Generate_Neighbor_Lists( system, control, data, workspace, 
-			lists, out_control );
-	fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
-
-	/* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
-	   fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
-
-	Compute_Forces( system, control, data, workspace, lists, out_control );
-	fprintf(out_control->log,"forces\n"); fflush( out_control->log );
-
-
-	// Compute iteration constants for each atom's velocity and for P_internal
-	// Compute kinetic energy for initial velocities of the iteration
-	P_int_const = E_kin = 0;
-	for( i = 0; i < system->N; ++i )
-	{
-		inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
-
-		rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i], 
-				0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
-		rvec_Add( dv, system->atoms[i].v );
-		rvec_Scale( workspace->v_const[i], exp_deps, dv );
-
-		P_int_const += ( -F_CONV * 
-				rvec_Dot( system->atoms[i].f, system->atoms[i].x ) );
-
-		E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
-				rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
-	}
-
-
-	// Compute initial p_int
-	inv_3V = 1.0 / (3.0 * system->box.volume);
-	P_int = inv_3V * ( 2.0 * E_kin + P_int_const );
-
-	v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
-	v_eps_new = iso_bar->v_eps_old + 2.0 * dt * iso_bar->a_eps;
-
-	itr = 0;
-	do
-	{
-		itr++;
-		// new values become old in this iteration
-		v_xi_old = v_xi_new;
-		v_eps_old = v_eps_new;
-
-
-		for( i = 0; i < system->N; ++i )
-		{
-			coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps * 
-					( (2.0 + 3.0/data->N_f) * v_eps_old + v_xi_old ) );
-			rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
-		}
-
-
-		coef_v_eps = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
-		a_eps_new = 3.0 * control->Tau_P * 
-			( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f );
-		v_eps_new = coef_v_eps * ( iso_bar->v_eps + 
-				0.5 * dt * ( iso_bar->a_eps + a_eps_new ) );
+    int i, itr;
+    real deps, v_eps_new=0, v_eps_old=0, G_xi_new;
+    real dxi, v_xi_new=0, v_xi_old=0, a_eps_new;
+    real inv_m, exp_deps, inv_3V;
+    real E_kin, P_int, P_int_const;
+    real coef_v, coef_v_eps;
+    real dt = control->dt;
+    real dt_sqr = SQR( dt );
+    thermostat *therm = &( data->therm );
+    isotropic_barostat *iso_bar = &( data->iso_bar );
+    simulation_box *box = &( system->box );
+    rvec dx, dv;
+
+    // Here we just calculate how much to increment eps, xi, v_eps, v_xi.
+    // Commits are done after positions and velocities of atoms are updated
+    // because the position and velocity updates use the v_eps, v_xi terms;
+    // yet we need EXP( deps ) to be able to calculate 
+    // positions and velocities accurately.  
+    iso_bar->a_eps = control->Tau_P * 
+        ( 3.0 * box->volume * (iso_bar->P - control->P) + 
+          6.0 * data->E_Kin / data->N_f ) - iso_bar->v_eps * therm->v_xi;
+    deps = dt * iso_bar->v_eps + 0.5 * dt_sqr * iso_bar->a_eps;
+    exp_deps = EXP( deps );
+
+    therm->G_xi = control->Tau_T * ( 2.0 * data->E_Kin + 
+            SQR( iso_bar->v_eps ) / control->Tau_P - 
+            (data->N_f +1) * K_B * control->T );
+    dxi = therm->v_xi * dt + 0.5 * therm->G_xi * dt_sqr;
+
+    fprintf(out_control->log, "a: %12.6f   eps: %12.6f   deps: %12.6f\n", 
+            iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
+    fprintf(out_control->log, "G: %12.6f   xi : %12.6f   dxi : %12.6f\n", 
+            therm->G_xi, therm->v_xi, therm->xi );
+
+    // Update positions and velocities
+    // NOTE: v_old, v_xi_old, v_eps_old are meant to be the old values 
+    // in the iteration not the old values at time t or before!
+    for (i=0; i < system->N; i++)
+    {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+
+        // Compute x(t + dt)
+        rvec_ScaledSum( workspace->a[i], -F_CONV * inv_m, system->atoms[i].f, 
+                -( (2.0 + 3.0/data->N_f) * iso_bar->v_eps + therm->v_xi ),
+                system->atoms[i].v );
+        rvec_ScaledSum( dx, dt, system->atoms[i].v, 
+                0.5 * dt_sqr, workspace->a[i] );
+        Inc_on_T3( system->atoms[i].x, dx, &(system->box) );
+        rvec_Scale( system->atoms[i].x, exp_deps, system->atoms[i].x );
+    }
+
+    // Commit updates
+    therm->xi += dxi;
+    iso_bar->eps += deps;
+    //Update_Box_Isotropic( EXP( 3.0 * iso_bar->eps ), &(system->box) );
+    Update_Box_Isotropic( &(system->box), EXP( 3.0 * iso_bar->eps ) );
+
+
+    // Calculate new forces, f(t + dt)
+    Reset( system, control, data, workspace );
+    fprintf(out_control->log,"reset-"); fflush( out_control->log );
+
+    Generate_Neighbor_Lists( system, control, data, workspace, 
+            lists, out_control );
+    fprintf(out_control->log,"nbrs-"); fflush( out_control->log );
+
+    /* QEq( system, control, workspace, lists[FAR_NBRS], out_control );
+       fprintf(out_control->log,"qeq-"); fflush( out_control->log ); */
+
+    Compute_Forces( system, control, data, workspace, lists, out_control );
+    fprintf(out_control->log,"forces\n"); fflush( out_control->log );
+
+
+    // Compute iteration constants for each atom's velocity and for P_internal
+    // Compute kinetic energy for initial velocities of the iteration
+    P_int_const = E_kin = 0;
+    for( i = 0; i < system->N; ++i )
+    {
+        inv_m = 1.0 / system->reaxprm.sbp[system->atoms[i].type].mass;
+
+        rvec_ScaledSum( dv, 0.5 * dt, workspace->a[i], 
+                0.5 * dt * -F_CONV * inv_m, system->atoms[i].f );
+        rvec_Add( dv, system->atoms[i].v );
+        rvec_Scale( workspace->v_const[i], exp_deps, dv );
+
+        P_int_const += ( -F_CONV * 
+                rvec_Dot( system->atoms[i].f, system->atoms[i].x ) );
+
+        E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
+                rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+    }
+
+
+    // Compute initial p_int
+    inv_3V = 1.0 / (3.0 * system->box.volume);
+    P_int = inv_3V * ( 2.0 * E_kin + P_int_const );
+
+    v_xi_new = therm->v_xi_old + 2.0 * dt * therm->G_xi;
+    v_eps_new = iso_bar->v_eps_old + 2.0 * dt * iso_bar->a_eps;
+
+    itr = 0;
+    do
+    {
+        itr++;
+        // new values become old in this iteration
+        v_xi_old = v_xi_new;
+        v_eps_old = v_eps_new;
+
+
+        for( i = 0; i < system->N; ++i )
+        {
+            coef_v = 1.0 / (1.0 + 0.5 * dt * exp_deps * 
+                    ( (2.0 + 3.0/data->N_f) * v_eps_old + v_xi_old ) );
+            rvec_Scale( system->atoms[i].v, coef_v, workspace->v_const[i] );
+        }
+
+
+        coef_v_eps = 1.0 / (1.0 + 0.5 * dt * v_xi_old);
+        a_eps_new = 3.0 * control->Tau_P * 
+            ( system->box.volume * (P_int - control->P) + 2.0 * E_kin / data->N_f );
+        v_eps_new = coef_v_eps * ( iso_bar->v_eps + 
+                0.5 * dt * ( iso_bar->a_eps + a_eps_new ) );
 
 
-		G_xi_new = control->Tau_T * ( 2.0 * E_kin + 
-				SQR( v_eps_old ) / control->Tau_P - 
-				(data->N_f + 1) * K_B * control->T );
-		v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
+        G_xi_new = control->Tau_T * ( 2.0 * E_kin + 
+                SQR( v_eps_old ) / control->Tau_P - 
+                (data->N_f + 1) * K_B * control->T );
+        v_xi_new = therm->v_xi + 0.5 * dt * ( therm->G_xi + G_xi_new );
 
 
-		E_kin = 0;
-		for( i = 0; i < system->N; ++i )
-			E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
-					rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
+        E_kin = 0;
+        for( i = 0; i < system->N; ++i )
+            E_kin += (0.5 * system->reaxprm.sbp[system->atoms[i].type].mass * 
+                    rvec_Dot( system->atoms[i].v, system->atoms[i].v ) );
 
-		P_int = inv_3V * ( 2.0*E_kin + P_int_const );
+        P_int = inv_3V * ( 2.0*E_kin + P_int_const );
 
 
-		fprintf( out_control->log, 
-				"itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n", 
-				itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
-	}
-	while( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
+        fprintf( out_control->log, 
+                "itr %d E_kin: %8.3f veps_n:%8.3f veps_o:%8.3f vxi_n:%8.3f vxi_o: %8.3f\n", 
+                itr, E_kin, v_eps_new, v_eps_old, v_xi_new, v_xi_old );
+    }
+    while( fabs(v_eps_new - v_eps_old) + fabs(v_xi_new - v_xi_old) > 2e-3 );
 
 
-	therm->v_xi_old = therm->v_xi;
-	therm->v_xi = v_xi_new;
-	therm->G_xi = G_xi_new;
+    therm->v_xi_old = therm->v_xi;
+    therm->v_xi = v_xi_new;
+    therm->G_xi = G_xi_new;
 
-	iso_bar->v_eps_old = iso_bar->v_eps;
-	iso_bar->v_eps = v_eps_new;
-	iso_bar->a_eps = a_eps_new;
+    iso_bar->v_eps_old = iso_bar->v_eps;
+    iso_bar->v_eps = v_eps_new;
+    iso_bar->a_eps = a_eps_new;
 
-	fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n", 
-			system->box.volume, 
-			system->box.box[0][0],system->box.box[1][1],system->box.box[2][2] );
-	fprintf(out_control->log,"eps:\ta- %8.3f  v- %8.3f  eps- %8.3f\n", 
-			iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
-	fprintf(out_control->log,"xi: \tG- %8.3f  v- %8.3f  xi - %8.3f\n", 
-			therm->G_xi, therm->v_xi, therm->xi);
+    fprintf( out_control->log, "V: %8.3ff\tsides{%8.3f, %8.3f, %8.3f}\n", 
+            system->box.volume, 
+            system->box.box[0][0],system->box.box[1][1],system->box.box[2][2] );
+    fprintf(out_control->log,"eps:\ta- %8.3f  v- %8.3f  eps- %8.3f\n", 
+            iso_bar->a_eps, iso_bar->v_eps, iso_bar->eps);
+    fprintf(out_control->log,"xi: \tG- %8.3f  v- %8.3f  xi - %8.3f\n", 
+            therm->G_xi, therm->v_xi, therm->xi);
 }
 
 #endif
@@ -989,256 +989,256 @@ void Velocity_Verlet_Isotropic_NPT( reax_system* system,
    All box dimensions are scaled by the same amount, 
    there is no change in the angles between axes. */
 void Velocity_Verlet_Berendsen_NVT( reax_system* system,
-		control_params* control,
-		simulation_data *data,
-		static_storage *workspace,
-		list **lists,
-		output_controls *out_control
-		)
+        control_params* control,
+        simulation_data *data,
+        static_storage *workspace,
+        list **lists,
+        output_controls *out_control
+        )
 {
-	int i, steps, renbr;
-	real inv_m, dt, lambda;
-	rvec dx;
-	reax_atom *atom;
+    int i, steps, renbr;
+    real inv_m, dt, lambda;
+    rvec dx;
+    reax_atom *atom;
 
-	fprintf (stderr, " Velocity_Verlet_Berendsen_NVT: step :%d \n", data->step);
+    fprintf (stderr, " Velocity_Verlet_Berendsen_NVT: step :%d \n", data->step);
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "step%d\n", data->step );
+    fprintf( stderr, "step%d\n", data->step );
 #endif
-	dt = control->dt;
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
-
-	/* velocity verlet, 1st part */
-	for( i = 0; i < system->N; i++ ) {
-		atom = &(system->atoms[i]);
-		inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
-		/* Compute x(t + dt) */
-		rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-		rvec_Add( atom->x, dx );
-		/* Compute v(t + dt/2) */
-		rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-	}
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
+
+    /* velocity verlet, 1st part */
+    for( i = 0; i < system->N; i++ ) {
+        atom = &(system->atoms[i]);
+        inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
+        /* Compute x(t + dt) */
+        rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+        rvec_Add( atom->x, dx );
+        /* Compute v(t + dt/2) */
+        rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+    }
 #if defined(DEBUG_FOCUS)
-	fprintf(stderr, "step%d: verlet1 done\n", data->step);
+    fprintf(stderr, "step%d: verlet1 done\n", data->step);
 #endif
 
-	Reallocate( system, workspace, lists, renbr );
-	Reset( system, control, data, workspace, lists );
+    Reallocate( system, workspace, lists, renbr );
+    Reset( system, control, data, workspace, lists );
 
-	if( renbr )
-		Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control );
+    if( renbr )
+        Generate_Neighbor_Lists( system, control, data, workspace, lists, out_control );
 
-	Compute_Forces( system, control, data, workspace,
-			lists, out_control );
+    Compute_Forces( system, control, data, workspace,
+            lists, out_control );
 
-	/* velocity verlet, 2nd part */
-	for( i = 0; i < system->N; i++ ) {
-		atom = &(system->atoms[i]);
-		inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
-		/* Compute v(t + dt) */
-		rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-	}
+    /* velocity verlet, 2nd part */
+    for( i = 0; i < system->N; i++ ) {
+        atom = &(system->atoms[i]);
+        inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
+        /* Compute v(t + dt) */
+        rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+    }
 #if defined(DEBUG_FOCUS)  
-	fprintf(stderr, "step%d: verlet2 done\n", data->step);
+    fprintf(stderr, "step%d: verlet2 done\n", data->step);
 #endif
 
-	/* temperature scaler */
-	Compute_Kinetic_Energy( system, data );
-	lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-	if( lambda < MIN_dT )
-		lambda = MIN_dT;
-	else if (lambda > MAX_dT )
-		lambda = MAX_dT;
-	lambda = SQRT( lambda );
-
-	/* Scale velocities and positions at t+dt */
-	for( i = 0; i < system->N; ++i ) {
-		atom = &(system->atoms[i]);
-		rvec_Scale( atom->v, lambda, atom->v );
-	}
-	Compute_Kinetic_Energy( system, data );
+    /* temperature scaler */
+    Compute_Kinetic_Energy( system, data );
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if( lambda < MIN_dT )
+        lambda = MIN_dT;
+    else if (lambda > MAX_dT )
+        lambda = MAX_dT;
+    lambda = SQRT( lambda );
+
+    /* Scale velocities at t+dt (NVT: positions are not rescaled here) */
+    for( i = 0; i < system->N; ++i ) {
+        atom = &(system->atoms[i]);
+        rvec_Scale( atom->v, lambda, atom->v );
+    }
+    Compute_Kinetic_Energy( system, data );
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "step%d: scaled velocities\n",
-			data->step );
+    fprintf( stderr, "step%d: scaled velocities\n",
+            data->step );
 #endif
 }
 
 GLOBAL void ker_update_velocity_1 (reax_atom *atoms,
-		single_body_parameters *sbp,
-		real dt,
-		simulation_box *box,
-		int N)
+        single_body_parameters *sbp,
+        real dt,
+        simulation_box *box,
+        int N)
 {
-	real inv_m;
-	rvec dx;
-	reax_atom *atom;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= N ) return;
-
-	/* velocity verlet, 1st part */
-	//for( i = 0; i < system->n; i++ ) { 
-	atom = &(atoms[i]);
-	inv_m = 1.0 / sbp[atom->type].mass;
-	/* Compute x(t + dt) */
-	rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-	rvec_Add( atom->x, dx );
-
-	/* Metin's suggestion to rebox the atoms */
-	/* bNVT fix */
-	Inc_on_T3( atoms[i].x, dx, box );
-	/* bNVT fix */
-
-	/* Compute v(t + dt/2) */
-	rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-	//}
+    real inv_m;
+    rvec dx;
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N ) return;
+
+    /* velocity verlet, 1st part */
+    //for( i = 0; i < system->n; i++ ) { 
+    atom = &(atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute x(t + dt) */
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    rvec_Add( atom->x, dx );
+
+    /* Metin's suggestion to rebox the atoms */
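+    /* bNVT fix -- NOTE(review): atom->x was already advanced by dx in rvec_Add above; confirm Inc_on_T3 does not apply dx a second time */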
+    Inc_on_T3( atoms[i].x, dx, box );
+    /* bNVT fix */
+
+    /* Compute v(t + dt/2) */
+    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+    //}
 }
 
 void bNVT_update_velocity_part1 (reax_system *system, simulation_box *box, real dt)
 {
-	ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>>
-		(system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    ker_update_velocity_1 <<< BLOCKS, BLOCK_SIZE>>>
+        (system->d_atoms, system->reaxprm.d_sbp, dt, box, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 GLOBAL void ker_update_velocity_2 (reax_atom *atoms,
-		single_body_parameters *sbp,
-		real dt,
-		int N)
+        single_body_parameters *sbp,
+        real dt,
+        int N)
 {
-	reax_atom *atom;
-	real inv_m;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= N ) return;
-
-	/* velocity verlet, 2nd part */
-	//for( i = 0; i < system->n; i++ ) { 
-	atom = &(atoms[i]);
-	inv_m = 1.0 / sbp[atom->type].mass;
-	/* Compute v(t + dt) */
-	rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-	//}
+    reax_atom *atom;
+    real inv_m;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N ) return;
+
+    /* velocity verlet, 2nd part */
+    //for( i = 0; i < system->n; i++ ) { 
+    atom = &(atoms[i]);
+    inv_m = 1.0 / sbp[atom->type].mass;
+    /* Compute v(t + dt) */
+    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+    //}
 }
 
 void bNVT_update_velocity_part2 (reax_system *system, real dt)
 {
-	ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>>
-		(system->d_atoms, system->reaxprm.d_sbp, dt, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    ker_update_velocity_2 <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, system->reaxprm.d_sbp, dt, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 GLOBAL void ker_scale_velocities (reax_atom *atoms, real lambda, int N)
 {
-	reax_atom *atom;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= N ) return;
-
-	/* Scale velocities and positions at t+dt */
-	//for( i = 0; i < system->n; ++i ) {
-	atom = &(atoms[i]);
-	rvec_Scale( atom->v, lambda, atom->v );
-	//}
+    reax_atom *atom;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N ) return;
+
+    /* Scale velocities and positions at t+dt */
+    //for( i = 0; i < system->n; ++i ) {
+    atom = &(atoms[i]);
+    rvec_Scale( atom->v, lambda, atom->v );
+    //}
 }
 
 void bNVT_scale_velocities (reax_system *system, real lambda)
 {
-	ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>>
-		(system->d_atoms, lambda, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    ker_scale_velocities <<< BLOCKS, BLOCK_SIZE >>>
+        (system->d_atoms, lambda, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Cuda_Velocity_Verlet_Berendsen_NVT( reax_system* system,
-		control_params* control,
-		simulation_data *data,
-		static_storage *workspace,
-		list **lists,
-		output_controls *out_control
-		)
+        control_params* control,
+        simulation_data *data,
+        static_storage *workspace,
+        list **lists,
+        output_controls *out_control
+        )
 {
-	int i, steps, renbr;
-	real inv_m, dt, lambda;
-	rvec dx;
-	reax_atom *atom;
+    int i, steps, renbr;
+    real inv_m, dt, lambda;
+    rvec dx;
+    reax_atom *atom;
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "step%d\n", data->step );
+    fprintf( stderr, "step%d\n", data->step );
 #endif
-	dt = control->dt;
-	steps = data->step - data->prev_steps;
-	renbr = (steps % control->reneighbor == 0);
-
-	/* velocity verlet, 1st part 
-	   for( i = 0; i < system->N; i++ ) { 
-	   atom = &(system->atoms[i]);
-	   inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
-	// Compute x(t + dt) 
-	rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
-	rvec_Add( atom->x, dx );
-	// Compute v(t + dt/2) 
-	rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
-	}
-	 */
-	bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt);
+    dt = control->dt;
+    steps = data->step - data->prev_steps;
+    renbr = (steps % control->reneighbor == 0);
+
+    /* velocity verlet, 1st part 
+       for( i = 0; i < system->N; i++ ) { 
+       atom = &(system->atoms[i]);
+       inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
+    // Compute x(t + dt) 
+    rvec_ScaledSum( dx, dt, atom->v, 0.5 * -F_CONV * inv_m * SQR(dt), atom->f );
+    rvec_Add( atom->x, dx );
+    // Compute v(t + dt/2) 
+    rvec_ScaledAdd( atom->v, 0.5 * -F_CONV * inv_m * dt, atom->f );
+    }
+     */
+    bNVT_update_velocity_part1 (system, (simulation_box *) system->d_box, dt);
 
 #if defined(DEBUG_FOCUS)
-	fprintf(stderr, "step%d: verlet1 done\n", data->step);
+    fprintf(stderr, "step%d: verlet1 done\n", data->step);
 #endif
 
-	Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
-	Cuda_Reset( system, control, data, workspace, lists );
-
-	if( renbr ) {
-		Cuda_Generate_Neighbor_Lists( system, workspace, control, true);
-	}
-
-	Cuda_Compute_Forces( system, control, data, workspace,
-			lists, out_control );
-
-	/* velocity verlet, 2nd part 
-	   for( i = 0; i < system->N; i++ ) {
-	   atom = &(system->atoms[i]);
-	   inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
-	// Compute v(t + dt) 
-	rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
-	}
-	 */
-	bNVT_update_velocity_part2 (system, dt);
+    Cuda_Reallocate( system, dev_workspace, dev_lists, renbr, data->step );
+    Cuda_Reset( system, control, data, workspace, lists );
+
+    if( renbr ) {
+        Cuda_Generate_Neighbor_Lists( system, workspace, control, true);
+    }
+
+    Cuda_Compute_Forces( system, control, data, workspace,
+            lists, out_control );
+
+    /* velocity verlet, 2nd part 
+       for( i = 0; i < system->N; i++ ) {
+       atom = &(system->atoms[i]);
+       inv_m = 1.0 / system->reaxprm.sbp[atom->type].mass;
+    // Compute v(t + dt) 
+    rvec_ScaledAdd( atom->v, 0.5 * dt * -F_CONV * inv_m, atom->f );
+    }
+     */
+    bNVT_update_velocity_part2 (system, dt);
 #if defined(DEBUG_FOCUS)  
-	fprintf(stderr, "step%d: verlet2 done\n", data->step);
+    fprintf(stderr, "step%d: verlet2 done\n", data->step);
 #endif
 
-	/* temperature scaler */
-	Cuda_Compute_Kinetic_Energy( system, data );
-	//get the latest temperature from the device to the host.
-	copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm,
-			sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
-
-	lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
-	if( lambda < MIN_dT )
-		lambda = MIN_dT;
-	else if (lambda > MAX_dT )
-		lambda = MAX_dT;
-	lambda = SQRT( lambda );
-
-	//fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda);
-
-	/* Scale velocities and positions at t+dt 
-	   for( i = 0; i < system->N; ++i ) {
-	   atom = &(system->atoms[i]);
-	   rvec_Scale( atom->v, lambda, atom->v );
-	   }
-	 */
-	bNVT_scale_velocities (system, lambda);
-	Cuda_Compute_Kinetic_Energy( system, data );
+    /* temperature scaler */
+    Cuda_Compute_Kinetic_Energy( system, data );
+    //get the latest temperature from the device to the host.
+    copy_host_device (&data->therm, &((simulation_data *)data->d_simulation_data)->therm,
+            sizeof (thermostat), cudaMemcpyDeviceToHost, RES_SIMULATION_DATA );
+
+    lambda = 1.0 + (dt / control->Tau_T) * (control->T / data->therm.T - 1.0);
+    if( lambda < MIN_dT )
+        lambda = MIN_dT;
+    else if (lambda > MAX_dT )
+        lambda = MAX_dT;
+    lambda = SQRT( lambda );
+
+    //fprintf (stderr, "step:%d lambda -> %f \n", data->step, lambda);
+
+    /* Scale velocities and positions at t+dt 
+       for( i = 0; i < system->N; ++i ) {
+       atom = &(system->atoms[i]);
+       rvec_Scale( atom->v, lambda, atom->v );
+       }
+     */
+    bNVT_scale_velocities (system, lambda);
+    Cuda_Compute_Kinetic_Energy( system, data );
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "step%d: scaled velocities\n",
-			data->step );
+    fprintf( stderr, "step%d: scaled velocities\n",
+            data->step );
 #endif
 
 }
diff --git a/PuReMD-GPU/src/list.cu b/PuReMD-GPU/src/list.cu
index 5412c951..095409aa 100644
--- a/PuReMD-GPU/src/list.cu
+++ b/PuReMD-GPU/src/list.cu
@@ -23,213 +23,213 @@
 
 HOST char Make_List(int n, int num_intrs, int type, list* l, int proc)
 {
-	char success=1;
-
-	if (proc == TYP_HOST) {
-
-		l->n = n;
-		l->num_intrs = num_intrs;
-
-		l->index = (int*) malloc( n * sizeof(int) );
-		l->end_index = (int*) malloc( n * sizeof(int) );
-
-		if (l->index == NULL) success = 0;
-		if (l->end_index == NULL) success = 0;
-
-		l->type = type;
-
-		switch(type)
-		{
-			case TYP_VOID:
-				l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-				if (l->select.v == NULL) success = 0;
-				break;
-
-			case TYP_THREE_BODY:
-				l->select.three_body_list = (three_body_interaction_data*) 
-					malloc(l->num_intrs*sizeof(three_body_interaction_data));
-				if (l->select.three_body_list == NULL) success = 0;
-				break;
-
-			case TYP_BOND:
-				l->select.bond_list = (bond_data*) 
-					malloc(l->num_intrs * sizeof(bond_data));
-				if (l->select.bond_list == NULL) success = 0;
-				break;
-
-			case TYP_DBO:
-				l->select.dbo_list = (dbond_data*) 
-					malloc(l->num_intrs * sizeof(dbond_data));
-				if (l->select.dbo_list == NULL) success = 0;
-				break;
-
-			case TYP_DDELTA:
-				l->select.dDelta_list = (dDelta_data*) 
-					malloc(l->num_intrs*sizeof(dDelta_data));
-				if (l->select.dDelta_list == NULL) success = 0;
-				break;
-
-			case TYP_FAR_NEIGHBOR:
-				l->select.far_nbr_list = (far_neighbor_data*) 
-					malloc(l->num_intrs*sizeof(far_neighbor_data));
-				if (l->select.far_nbr_list == NULL) success = 0;
-				break;
-
-			case TYP_NEAR_NEIGHBOR:
-				l->select.near_nbr_list = (near_neighbor_data*) 
-					malloc(l->num_intrs*sizeof(near_neighbor_data));
-				if (l->select.near_nbr_list == NULL) success = 0;
-				break;
-
-			case TYP_HBOND:
-				l->select.hbond_list = (hbond_data*)
-					malloc( l->num_intrs * sizeof(hbond_data) );
-				if (l->select.hbond_list == NULL) success = 0;
-				break;			
-
-			default:
-				l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
-				if (l->select.v == NULL) success = 0;
-				l->type = TYP_VOID;
-				break;      
-		}
-
-	}
-	else 
-	{
-		l->n = n;
-		l->num_intrs = num_intrs;
-
-		cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX );
-		cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX );
-
-		switch(type)
-		{
-			case TYP_FAR_NEIGHBOR:
-				cuda_malloc ((void **) &l->select.far_nbr_list, 
-						l->num_intrs*sizeof(far_neighbor_data), 
-						1, LIST_FAR_NEIGHBOR_DATA);
-				/*
-				   cudaHostAlloc ((void **) &l->select.far_nbr_list, 
-				   l->num_intrs*sizeof(far_neighbor_data),
-				   cudaHostAllocMapped);
-
-				   cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, 
-				   (void *)l->select.far_nbr_list, 0);
-				 */
-				break;
-
-			case TYP_HBOND:
-				cuda_malloc ((void **) &l->select.hbond_list,
-						l->num_intrs * sizeof(hbond_data),
-						1, LIST_HBOND_DATA );
-				break;			
-
-			case TYP_BOND:
-				cuda_malloc ((void **) &l->select.bond_list,
-						l->num_intrs * sizeof(bond_data),
-						1, LIST_BOND_DATA );
-				break;			
-
-			case TYP_THREE_BODY:
-				cuda_malloc ( (void **) &l->select.three_body_list, 
-						l->num_intrs * sizeof(three_body_interaction_data), 
-						1, LIST_THREE_BODY_DATA );
-				break;
-
-			default: 
-				fprintf (stderr, "Unknown list creation \n" );
-				exit (1);
-		}
-	}
-
-	return success;
+    char success=1;
+
+    if (proc == TYP_HOST) {
+
+        l->n = n;
+        l->num_intrs = num_intrs;
+
+        l->index = (int*) malloc( n * sizeof(int) );
+        l->end_index = (int*) malloc( n * sizeof(int) );
+
+        if (l->index == NULL) success = 0;
+        if (l->end_index == NULL) success = 0;
+
+        l->type = type;
+
+        switch(type)
+        {
+            case TYP_VOID:
+                l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
+                if (l->select.v == NULL) success = 0;
+                break;
+
+            case TYP_THREE_BODY:
+                l->select.three_body_list = (three_body_interaction_data*) 
+                    malloc(l->num_intrs*sizeof(three_body_interaction_data));
+                if (l->select.three_body_list == NULL) success = 0;
+                break;
+
+            case TYP_BOND:
+                l->select.bond_list = (bond_data*) 
+                    malloc(l->num_intrs * sizeof(bond_data));
+                if (l->select.bond_list == NULL) success = 0;
+                break;
+
+            case TYP_DBO:
+                l->select.dbo_list = (dbond_data*) 
+                    malloc(l->num_intrs * sizeof(dbond_data));
+                if (l->select.dbo_list == NULL) success = 0;
+                break;
+
+            case TYP_DDELTA:
+                l->select.dDelta_list = (dDelta_data*) 
+                    malloc(l->num_intrs*sizeof(dDelta_data));
+                if (l->select.dDelta_list == NULL) success = 0;
+                break;
+
+            case TYP_FAR_NEIGHBOR:
+                l->select.far_nbr_list = (far_neighbor_data*) 
+                    malloc(l->num_intrs*sizeof(far_neighbor_data));
+                if (l->select.far_nbr_list == NULL) success = 0;
+                break;
+
+            case TYP_NEAR_NEIGHBOR:
+                l->select.near_nbr_list = (near_neighbor_data*) 
+                    malloc(l->num_intrs*sizeof(near_neighbor_data));
+                if (l->select.near_nbr_list == NULL) success = 0;
+                break;
+
+            case TYP_HBOND:
+                l->select.hbond_list = (hbond_data*)
+                    malloc( l->num_intrs * sizeof(hbond_data) );
+                if (l->select.hbond_list == NULL) success = 0;
+                break;            
+
+            default:
+                l->select.v = (void *) malloc(l->num_intrs*sizeof(void));
+                if (l->select.v == NULL) success = 0;
+                l->type = TYP_VOID;
+                break;      
+        }
+
+    }
+    else 
+    {
+        l->n = n;
+        l->num_intrs = num_intrs;
+
+        cuda_malloc ((void **)&l->index, n * sizeof(int), 1, LIST_INDEX );
+        cuda_malloc ((void **)&l->end_index, n * sizeof(int), 1, LIST_END_INDEX );
+
+        switch(type)
+        {
+            case TYP_FAR_NEIGHBOR:
+                cuda_malloc ((void **) &l->select.far_nbr_list, 
+                        l->num_intrs*sizeof(far_neighbor_data), 
+                        1, LIST_FAR_NEIGHBOR_DATA);
+                /*
+                   cudaHostAlloc ((void **) &l->select.far_nbr_list, 
+                   l->num_intrs*sizeof(far_neighbor_data),
+                   cudaHostAllocMapped);
+
+                   cudaHostGetDevicePointer ( (void **) &l->select.far_nbr_list, 
+                   (void *)l->select.far_nbr_list, 0);
+                 */
+                break;
+
+            case TYP_HBOND:
+                cuda_malloc ((void **) &l->select.hbond_list,
+                        l->num_intrs * sizeof(hbond_data),
+                        1, LIST_HBOND_DATA );
+                break;            
+
+            case TYP_BOND:
+                cuda_malloc ((void **) &l->select.bond_list,
+                        l->num_intrs * sizeof(bond_data),
+                        1, LIST_BOND_DATA );
+                break;            
+
+            case TYP_THREE_BODY:
+                cuda_malloc ( (void **) &l->select.three_body_list, 
+                        l->num_intrs * sizeof(three_body_interaction_data), 
+                        1, LIST_THREE_BODY_DATA );
+                break;
+
+            default: 
+                fprintf (stderr, "Unknown list creation \n" );
+                exit (1);
+        }
+    }
+
+    return success;
 }
 
 
 HOST void Delete_List(list* l, int type)
 {
 
-	if (type == TYP_HOST )
-	{
-		if( l->index != NULL )
-			free(l->index);
-		if( l->end_index != NULL )
-			free(l->end_index);
-
-		switch(l->type)
-		{
-			case TYP_VOID:
-				if( l->select.v != NULL )
-					free(l->select.v);
-				break;
-			case TYP_THREE_BODY:
-				if( l->select.three_body_list != NULL )
-					free(l->select.three_body_list);
-				break;
-			case TYP_BOND:
-				if( l->select.bond_list != NULL )
-					free(l->select.bond_list);
-				break;
-			case TYP_DBO:
-				if( l->select.dbo_list != NULL )
-					free(l->select.dbo_list);
-				break;
-			case TYP_DDELTA:
-				if( l->select.dDelta_list != NULL )
-					free(l->select.dDelta_list);
-				break;
-			case TYP_FAR_NEIGHBOR:
-				if( l->select.far_nbr_list != NULL )
-					free(l->select.far_nbr_list);
-				break;
-			case TYP_NEAR_NEIGHBOR:
-				if( l->select.near_nbr_list != NULL )
-					free(l->select.near_nbr_list);
-				break;
-			case TYP_HBOND:
-				if( l->select.hbond_list != NULL )
-					free(l->select.hbond_list);
-				break;
-
-			default:
-				// Report fatal error
-				break;
-		}
-	}
-	else
-	{
-		if (l->index != NULL)
-			cuda_free (l->index, LIST_INDEX );	
-		if (l->end_index != NULL)
-			cuda_free (l->end_index, LIST_END_INDEX );
-
-		switch(type)
-		{
-			case TYP_FAR_NEIGHBOR:
-				if (l->select.far_nbr_list != NULL)
-					cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA);
-				break;
-
-			case TYP_HBOND:
-				if (l->select.hbond_list != NULL)
-					cuda_free (l->select.hbond_list, LIST_HBOND_DATA );
-				break;			
-
-			case TYP_BOND:
-				if (l->select.bond_list != NULL)
-					cuda_free (l->select.bond_list, LIST_BOND_DATA );
-				break;			
-
-			case TYP_THREE_BODY:
-				if (l->select.three_body_list != NULL) 
-					cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA );
-				break;
-
-			default: 
-				fprintf (stderr, "Unknown list deletion \n" );
-				exit (1);
-		}
-	}
+    if (type == TYP_HOST )
+    {
+        if( l->index != NULL )
+            free(l->index);
+        if( l->end_index != NULL )
+            free(l->end_index);
+
+        switch(l->type)
+        {
+            case TYP_VOID:
+                if( l->select.v != NULL )
+                    free(l->select.v);
+                break;
+            case TYP_THREE_BODY:
+                if( l->select.three_body_list != NULL )
+                    free(l->select.three_body_list);
+                break;
+            case TYP_BOND:
+                if( l->select.bond_list != NULL )
+                    free(l->select.bond_list);
+                break;
+            case TYP_DBO:
+                if( l->select.dbo_list != NULL )
+                    free(l->select.dbo_list);
+                break;
+            case TYP_DDELTA:
+                if( l->select.dDelta_list != NULL )
+                    free(l->select.dDelta_list);
+                break;
+            case TYP_FAR_NEIGHBOR:
+                if( l->select.far_nbr_list != NULL )
+                    free(l->select.far_nbr_list);
+                break;
+            case TYP_NEAR_NEIGHBOR:
+                if( l->select.near_nbr_list != NULL )
+                    free(l->select.near_nbr_list);
+                break;
+            case TYP_HBOND:
+                if( l->select.hbond_list != NULL )
+                    free(l->select.hbond_list);
+                break;
+
+            default:
+                // Report fatal error
+                break;
+        }
+    }
+    else
+    {
+        if (l->index != NULL)
+            cuda_free (l->index, LIST_INDEX );    
+        if (l->end_index != NULL)
+            cuda_free (l->end_index, LIST_END_INDEX );
+
+        switch(type)
+        {
+            case TYP_FAR_NEIGHBOR:
+                if (l->select.far_nbr_list != NULL)
+                    cuda_free (l->select.far_nbr_list, LIST_FAR_NEIGHBOR_DATA);
+                break;
+
+            case TYP_HBOND:
+                if (l->select.hbond_list != NULL)
+                    cuda_free (l->select.hbond_list, LIST_HBOND_DATA );
+                break;            
+
+            case TYP_BOND:
+                if (l->select.bond_list != NULL)
+                    cuda_free (l->select.bond_list, LIST_BOND_DATA );
+                break;            
+
+            case TYP_THREE_BODY:
+                if (l->select.three_body_list != NULL) 
+                    cuda_free ( l->select.three_body_list, LIST_THREE_BODY_DATA );
+                break;
+
+            default: 
+                fprintf (stderr, "Unknown list deletion \n" );
+                exit (1);
+        }
+    }
 }
 
diff --git a/PuReMD-GPU/src/lookup.cu b/PuReMD-GPU/src/lookup.cu
index 95fa5c46..c6cc23cf 100644
--- a/PuReMD-GPU/src/lookup.cu
+++ b/PuReMD-GPU/src/lookup.cu
@@ -25,53 +25,53 @@
 #include "index_utils.h"
 
 void Make_Lookup_Table(real xmin, real xmax, int n,
-		lookup_function f, lookup_table* t)
+        lookup_function f, lookup_table* t)
 {
-	int i;
-
-	t->xmin = xmin;
-	t->xmax = xmax;
-	t->n = n;
-	t->dx = (xmax - xmin)/(n-1);
-	t->inv_dx = 1.0 / t->dx;
-	t->a = (n-1)/(xmax-xmin);
-	t->y = (real*) malloc(n*sizeof(real));
-
-	for(i=0; i < n; i++)
-		t->y[i] = f(i*t->dx + t->xmin);
-
-	// //fprintf(stdout,"dx = %lf\n",t->dx);
-	// for(i=0; i < n; i++)
-	//   //fprintf( stdout,"%d %lf %lf %lf\n", 
-	//            i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) );
+    int i;
+
+    t->xmin = xmin;
+    t->xmax = xmax;
+    t->n = n;
+    t->dx = (xmax - xmin)/(n-1);
+    t->inv_dx = 1.0 / t->dx;
+    t->a = (n-1)/(xmax-xmin);
+    t->y = (real*) malloc(n*sizeof(real));
+
+    for(i=0; i < n; i++)
+        t->y[i] = f(i*t->dx + t->xmin);
+
+    // //fprintf(stdout,"dx = %lf\n",t->dx);
+    // for(i=0; i < n; i++)
+    //   //fprintf( stdout,"%d %lf %lf %lf\n", 
+    //            i, i/t->a+t->xmin, t->y[i], exp(i/t->a+t->xmin) );
 }
 
 
 /* Fills solution into x. Warning: will modify c and d! */
 HOST_DEVICE void Tridiagonal_Solve( const real *a, const real *b, 
-		real *c, real *d, real *x, unsigned int n){
-	int i;
-	real id;
-
-	/* Modify the coefficients. */
-	c[0] /= b[0];	/* Division by zero risk. */
-	d[0] /= b[0];	/* Division by zero would imply a singular matrix. */
-	for(i = 1; i < n; i++){
-		id = (b[i] - c[i-1] * a[i]);  /* Division by zero risk. */
-		c[i] /= id;	        /* Last value calculated is redundant. */
-		d[i] = (d[i] - d[i-1] * a[i])/id;
-	}
-
-	/* Now back substitute. */
-	x[n - 1] = d[n - 1];
-	for(i = n - 2; i >= 0; i--)
-		x[i] = d[i] - c[i] * x[i + 1];
+        real *c, real *d, real *x, unsigned int n){
+    int i;
+    real id;
+
+    /* Modify the coefficients. */
+    c[0] /= b[0];    /* Division by zero risk. */
+    d[0] /= b[0];    /* Division by zero would imply a singular matrix. */
+    for(i = 1; i < n; i++){
+        id = (b[i] - c[i-1] * a[i]);  /* Division by zero risk. */
+        c[i] /= id;            /* Last value calculated is redundant. */
+        d[i] = (d[i] - d[i-1] * a[i])/id;
+    }
+
+    /* Now back substitute. */
+    x[n - 1] = d[n - 1];
+    for(i = n - 2; i >= 0; i--)
+        x[i] = d[i] - c[i] * x[i + 1];
 }
 
 GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b, 
-		real *c, real *d, real *x, unsigned int n)
+        real *c, real *d, real *x, unsigned int n)
 {
-	Tridiagonal_Solve ( a, b, c, d, x, n );
+    Tridiagonal_Solve ( a, b, c, d, x, n );
 }
 
 
@@ -84,189 +84,189 @@ GLOBAL void Cuda_Tridiagonal_Solve (const real *a, const real *b,
 
 
 void Natural_Cubic_Spline( const real *h, const real *f, 
-		cubic_spline_coef *coef, unsigned int n )
+        cubic_spline_coef *coef, unsigned int n )
 {
-	int i;
-	real *a, *b, *c, *d, *v;
-
-	/* allocate space for the linear system */
-	a = (real*) malloc( n * sizeof(real) );
-	b = (real*) malloc( n * sizeof(real) );
-	c = (real*) malloc( n * sizeof(real) );
-	d = (real*) malloc( n * sizeof(real) );
-	v = (real*) malloc( n * sizeof(real) );
-
-	/* build the linear system */
-	a[0] = a[1] = a[n-1] = 0;
-	for( i = 2; i < n-1; ++i )
-		a[i] = h[i-1];
-
-	b[0] = b[n-1] = 0;
-	for( i = 1; i < n-1; ++i )
-		b[i] = 2 * (h[i-1] + h[i]); 
-
-	c[0] = c[n-2] = c[n-1] = 0;
-	for( i = 1; i < n-2; ++i )
-		c[i] = h[i];
-
-	d[0] = d[n-1] = 0;
-	for( i = 1; i < n-1; ++i )
-		d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
-
-	/*//fprintf( stderr, "i  a        b        c        d\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
-	v[0] = 0;
-	v[n-1] = 0;
-	Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-
-	for( i = 1; i < n; ++i ){
-		coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-		coef[i-1].c = v[i]/2;
-		coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-		coef[i-1].a = f[i];
-	}
-
-	/*//fprintf( stderr, "i  v  coef\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-	i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+    int i;
+    real *a, *b, *c, *d, *v;
+
+    /* allocate space for the linear system */
+    a = (real*) malloc( n * sizeof(real) );
+    b = (real*) malloc( n * sizeof(real) );
+    c = (real*) malloc( n * sizeof(real) );
+    d = (real*) malloc( n * sizeof(real) );
+    v = (real*) malloc( n * sizeof(real) );
+
+    /* build the linear system */
+    a[0] = a[1] = a[n-1] = 0;
+    for( i = 2; i < n-1; ++i )
+        a[i] = h[i-1];
+
+    b[0] = b[n-1] = 0;
+    for( i = 1; i < n-1; ++i )
+        b[i] = 2 * (h[i-1] + h[i]); 
+
+    c[0] = c[n-2] = c[n-1] = 0;
+    for( i = 1; i < n-2; ++i )
+        c[i] = h[i];
+
+    d[0] = d[n-1] = 0;
+    for( i = 1; i < n-1; ++i )
+        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+
+    /*//fprintf( stderr, "i  a        b        c        d\n" );
+      for( i = 0; i < n; ++i )
+    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+    v[0] = 0;
+    v[n-1] = 0;
+    Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
+
+    for( i = 1; i < n; ++i ){
+        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
+        coef[i-1].c = v[i]/2;
+        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
+        coef[i-1].a = f[i];
+    }
+
+    /*//fprintf( stderr, "i  v  coef\n" );
+      for( i = 0; i < n; ++i )
+    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
+    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 }
 
 
 GLOBAL void cubic_spline_init_a ( real *a, const real *h, int n )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
-
-	if (i == 0 || i == 1 || i == (n-1)) {
-		a[i] = 0;
-	} else {
-		a[i] = h[i-1];
-	}
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
+
+    if (i == 0 || i == 1 || i == (n-1)) {
+        a[i] = 0;
+    } else {
+        a[i] = h[i-1];
+    }
 }
 
 GLOBAL void cubic_spline_init_b (real *b, const real *h, int n )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
-
-	if (i == 0 || i == (n-1)) {
-		b[i] = 0;
-	} else {
-		b[i] = 2 * (h[i-1] + h[i]);
-	}
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
+
+    if (i == 0 || i == (n-1)) {
+        b[i] = 0;
+    } else {
+        b[i] = 2 * (h[i-1] + h[i]);
+    }
 }
 
 GLOBAL void cubic_spline_init_c (real *c, const real *h, int n )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
-
-	if (i == 0 || i == (n-1) || i == (n-2)) {
-		c[i] = 0;
-	} else {
-		c[i] = h[i];
-	}
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
+
+    if (i == 0 || i == (n-1) || i == (n-2)) {
+        c[i] = 0;
+    } else {
+        c[i] = h[i];
+    }
 }
 
 GLOBAL void cubic_spline_init_d (real *d, const real *f, const real *h, int n )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
-
-	if ( i == 0 || i == (n-1) ) {
-		d[i] = 0;
-	} else {
-		d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
-	}
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
+
+    if ( i == 0 || i == (n-1) ) {
+        d[i] = 0;
+    } else {
+        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+    }
 }
 
 GLOBAL void calculate_cubic_spline_coef ( const real *f, real *v, const real *h, LR_lookup_table *data, int offset, int n )
 {
-	cubic_spline_coef *coef;
-
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= n || i == 0) return;
-
-	if (offset == SPLINE_H_OFFSET)
-		coef = &data->H[1];
-	else if(offset == SPLINE_CEVD_OFFSET)
-		coef = &data->CEvd[1];
-	else if (offset == SPLINE_CECLMB_OFFSET)
-		coef = &data->CEclmb[1];
-	else if (offset == SPLINE_VDW_OFFSET)
-		coef = &data->vdW[1];
-	else if (offset == SPLINE_ELE_OFFSET)
-		coef = &data->ele[1];
-	else
-		coef = 0;
-
-	coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-	coef[i-1].c = v[i]/2;
-	coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-	coef[i-1].a = f[i];
+    cubic_spline_coef *coef;
+
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n || i == 0) return;
+
+    if (offset == SPLINE_H_OFFSET)
+        coef = &data->H[1];
+    else if(offset == SPLINE_CEVD_OFFSET)
+        coef = &data->CEvd[1];
+    else if (offset == SPLINE_CECLMB_OFFSET)
+        coef = &data->CEclmb[1];
+    else if (offset == SPLINE_VDW_OFFSET)
+        coef = &data->vdW[1];
+    else if (offset == SPLINE_ELE_OFFSET)
+        coef = &data->ele[1];
+    else
+        coef = 0;
+
+    coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
+    coef[i-1].c = v[i]/2;
+    coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
+    coef[i-1].a = f[i];
 }
 
 
 void Cuda_Natural_Cubic_Spline( const real *h, const real *f, 
-		LR_lookup_table *data, int offset, unsigned int n )
+        LR_lookup_table *data, int offset, unsigned int n )
 {
-	int i;
-	real *a, *b, *c, *d, *v;
-	int blocks, block_size;
-
-	////fprintf (stderr, "Entering Cuda_Natural_Cubic_Spline ... \n");
-
-	/* allocate space for the linear system */
-	cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ );
-
-	////fprintf (stderr, "Mem allocation done... \n");
-
-	/* build linear system */
-	compute_blocks ( &blocks, &block_size, n);
-	cubic_spline_init_a <<< blocks, block_size >>>
-		( a, h, n );
-	cudaThreadSynchronize ();
-	////fprintf (stderr, "cubic_spline_init_a done.... -> %d \n", cudaGetLastError ());
-
-	cubic_spline_init_b <<< blocks, block_size >>>
-		( b, h, n );
-	cudaThreadSynchronize ();
-	////fprintf (stderr, "cubic_spline_init_b done.... -> %d \n", cudaGetLastError ());
-
-	cubic_spline_init_c <<< blocks, block_size >>>
-		( c, h, n );
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "cubic_spline_init_c done.... -> %d \n", cudaGetLastError ());
-
-	cubic_spline_init_d <<< blocks, block_size >>>
-		( d, f, h, n );
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "cubic_spline_init_d done.... -> %d \n", cudaGetLastError ());
-
-	/*//fprintf( stderr, "i  a        b        c        d\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
-
-	Cuda_Tridiagonal_Solve <<<1, 1>>>
-		( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ());
-
-	calculate_cubic_spline_coef <<< blocks, block_size >>>
-		( f, v, h, data,offset, n );
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "calculate_cubic_spline_coef done.... -> %d \n", cudaGetLastError ());
-
-	/*//fprintf( stderr, "i  v  coef\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-	i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+    int i;
+    real *a, *b, *c, *d, *v;
+    int blocks, block_size;
+
+    ////fprintf (stderr, "Entering Cuda_Natural_Cubic_Spline ... \n");
+
+    /* allocate space for the linear system */
+    cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ );
+
+    ////fprintf (stderr, "Mem allocation done... \n");
+
+    /* build linear system */
+    compute_blocks ( &blocks, &block_size, n);
+    cubic_spline_init_a <<< blocks, block_size >>>
+        ( a, h, n );
+    cudaThreadSynchronize ();
+    ////fprintf (stderr, "cubic_spline_init_a done.... -> %d \n", cudaGetLastError ());
+
+    cubic_spline_init_b <<< blocks, block_size >>>
+        ( b, h, n );
+    cudaThreadSynchronize ();
+    ////fprintf (stderr, "cubic_spline_init_b done.... -> %d \n", cudaGetLastError ());
+
+    cubic_spline_init_c <<< blocks, block_size >>>
+        ( c, h, n );
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "cubic_spline_init_c done.... -> %d \n", cudaGetLastError ());
+
+    cubic_spline_init_d <<< blocks, block_size >>>
+        ( d, f, h, n );
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "cubic_spline_init_d done.... -> %d \n", cudaGetLastError ());
+
+    /*//fprintf( stderr, "i  a        b        c        d\n" );
+      for( i = 0; i < n; ++i )
+    //fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
+
+    Cuda_Tridiagonal_Solve <<<1, 1>>>
+        ( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ());
+
+    calculate_cubic_spline_coef <<< blocks, block_size >>>
+        ( f, v, h, data,offset, n );
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "calculate_cubic_spline_coef done.... -> %d \n", cudaGetLastError ());
+
+    /*//fprintf( stderr, "i  v  coef\n" );
+      for( i = 0; i < n; ++i )
+    //fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
+    i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
 }
 
 
@@ -280,194 +280,194 @@ void Cuda_Natural_Cubic_Spline( const real *h, const real *f,
 
 
 void Complete_Cubic_Spline( const real *h, const real *f, real v0, real vlast,
-		cubic_spline_coef *coef, unsigned int n )
+        cubic_spline_coef *coef, unsigned int n )
 {
-	int i;
-	real *a, *b, *c, *d, *v;
-
-	/* allocate space for the linear system */
-	a = (real*) malloc( n * sizeof(real) );
-	b = (real*) malloc( n * sizeof(real) );
-	c = (real*) malloc( n * sizeof(real) );
-	d = (real*) malloc( n * sizeof(real) );
-	v = (real*) malloc( n * sizeof(real) );
-
-	/* build the linear system */
-	a[0] = 0;
-	for( i = 1; i < n; ++i )
-		a[i] = h[i-1];
-
-	b[0] = 2*h[0];
-	for( i = 1; i < n; ++i )
-		b[i] = 2 * (h[i-1] + h[i]); 
-
-	c[n-1] = 0;
-	for( i = 0; i < n-1; ++i )
-		c[i] = h[i];
-
-	d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;   
-	d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]);
-	for( i = 1; i < n-1; ++i )
-		d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
-
-	/*//fprintf( stderr, "i  a        b        c        d\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
-	Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
-	// Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-
-	for( i = 1; i < n; ++i ){
-		coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-		coef[i-1].c = v[i]/2;
-		coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-		coef[i-1].a = f[i];
-	}
-
-	/*//fprintf( stderr, "i  v  coef\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-	i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+    /* Builds and solves the tridiagonal system for the spline through
+       f[0..n-1] (spacings h), fills coef[0..n-2], then frees its scratch. */
+    int i;
+    real *a, *b, *c, *d, *v;
+
+    /* allocate space for the linear system (all five go to Tridiagonal_Solve) */
+    a = (real*) malloc( n * sizeof(real) );
+    b = (real*) malloc( n * sizeof(real) );
+    c = (real*) malloc( n * sizeof(real) );
+    d = (real*) malloc( n * sizeof(real) );
+    v = (real*) malloc( n * sizeof(real) );
+
+    /* build the linear system */
+    a[0] = 0;
+    for( i = 1; i < n; ++i )
+        a[i] = h[i-1];
+
+    b[0] = 2*h[0];
+    for( i = 1; i < n; ++i )
+        b[i] = 2 * (h[i-1] + h[i]);
+
+    c[n-1] = 0;
+    for( i = 0; i < n-1; ++i )
+        c[i] = h[i];
+
+    /* last row uses the slope (f[n-1]-f[n-2])/h[n-2]; the old code divided only f[n-2] */
+    d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;
+    d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2])/h[n-2];
+    for( i = 1; i < n-1; ++i )
+        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+
+    Tridiagonal_Solve( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
+
+    for( i = 1; i < n; ++i ){
+        coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
+        coef[i-1].c = v[i]/2;
+        coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
+        coef[i-1].a = f[i];
+    }
+
+    free( a );
+    free( b );
+    free( c );
+    free( d );
+    free( v );
 }
 
 
 GLOBAL void complete_cubic_spline_init_a (real *a, const real *h, int n)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
 
-	if (i == 0) a[0] = 0;
-	else {
-		a[i] = h[i];
-	}
+    /* One thread per entry. Entry i takes h[i-1], exactly like the host loop
+       in Complete_Cubic_Spline (a[i] = h[i-1]); the former h[i] was shifted
+       by one and also read h[n-1] for the last entry. Entry 0 is set to 0. */
+    a[i] = (i == 0) ? 0 : h[i-1];
 }
 
 GLOBAL void complete_cubic_spline_init_b (real *b, const real *h, int n)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
+    const int row = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( row >= n ) return;    /* thread index beyond the n entries */
 
-	if (i == 0) b[0] = 2 * h[0];
-	else {
-		b[i] = 2 * (h[i-1] + h[i]); 
-	}
+    /* Entry 0 depends on h[0] alone; every other entry sums the two
+       spacings h[row-1] and h[row] before doubling. */
+    b[row] = (row == 0) ? 2 * h[0]
+                        : 2 * (h[row-1] + h[row]);
 }
 
 
 GLOBAL void complete_cubic_spline_init_c (real *c, const real *h, int n )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
+    const int row = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( row >= n ) return;    /* thread index beyond the n entries */
 
-	if (i == (n-1)) c[n-1] = 0;
-	else {
-		c[i] = h[i];
-	}
+    /* The last entry is zero; all earlier entries copy the spacing
+       with the same index. */
+    const int last = n - 1;
+    c[row] = (row == last) ? 0 : h[row];
 }
 
 GLOBAL void complete_cubic_spline_init_d (real *d, const real *f, const real *h, int v0_r, int vlast_r, int n)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real v0, vlast;
-	if ( i >= n ) return;
-
-	v0 = 0;
-	vlast = 0;
-
-	if (i == 0) {
-		d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;   
-	}
-	else if (i == (n-1)) {
-		d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2]/h[n-2]);
-	}
-	else
-		d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= n ) return;
+
+    /* NOTE(review): v0_r / vlast_r are accepted but never read; both boundary
+       terms are fixed at 0 here, unlike the host Complete_Cubic_Spline,
+       which uses its v0 / vlast arguments -- confirm this is intended. */
+    const real v0 = 0;
+    const real vlast = 0;
+
+    if (i == 0)
+        d[0] = 6 * (f[1]-f[0])/h[0] - 6 * v0;
+    else if (i == (n-1))    /* slope of the last interval is (f[n-1]-f[n-2])/h[n-2]; the old code divided only f[n-2] */
+        d[n-1] = 6 * vlast - 6 * (f[n-1]-f[n-2])/h[n-2];
+    else
+        d[i] = 6 * ((f[i+1]-f[i])/h[i] - (f[i]-f[i-1])/h[i-1]);
 }
 
 GLOBAL void calculate_complete_cubic_spline_coef (LR_lookup_table *data, int offset, real *v, const real *h, const real *f, int n)
 {
 
-	cubic_spline_coef *coef;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= n ) return;
-
-	if (offset == SPLINE_H_OFFSET)
-		coef = &data->H[1];
-	else if(offset == SPLINE_CEVD_OFFSET)
-		coef = &data->CEvd[1];
-	else if (offset == SPLINE_CECLMB_OFFSET)
-		coef = &data->CEclmb[1];
-	else if (offset == SPLINE_VDW_OFFSET)
-		coef = &data->vdW[1];
-	else if (offset == SPLINE_ELE_OFFSET)
-		coef = &data->ele[1];
-	else
-		coef = 0;
-
-	coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
-	coef[i-1].c = v[i]/2;
-	coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
-	coef[i-1].a = f[i];
+    cubic_spline_coef *coef;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    /* Thread i fills coef[i-1] from entries i-1 and i, so thread 0 has no
+       work: letting it run indexed v[-1], h[-1] and f[-1]. The host loop
+       in Complete_Cubic_Spline likewise starts at i = 1. */
+    if ( i < 1 || i >= n ) return;
+
+    /* pick the destination table; element 0 of each table is skipped */
+    if      (offset == SPLINE_H_OFFSET)      coef = &data->H[1];
+    else if (offset == SPLINE_CEVD_OFFSET)   coef = &data->CEvd[1];
+    else if (offset == SPLINE_CECLMB_OFFSET) coef = &data->CEclmb[1];
+    else if (offset == SPLINE_VDW_OFFSET)    coef = &data->vdW[1];
+    else if (offset == SPLINE_ELE_OFFSET)    coef = &data->ele[1];
+    else return;    /* unknown offset: the old code wrote through a null pointer here */
+
+    /* cubic coefficients stored at coef[i-1] */
+    coef[i-1].d = (v[i] - v[i-1]) / (6*h[i-1]);
+    coef[i-1].c = v[i]/2;
+    coef[i-1].b = (f[i]-f[i-1])/h[i-1] + h[i-1]*(2*v[i] + v[i-1])/6;
+    coef[i-1].a = f[i];
 }
 
 void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vlast_r,
-		LR_lookup_table *data, int offset, unsigned int n )
+        LR_lookup_table *data, int offset, unsigned int n )
 {
-	int i;
-	real *a, *b, *c, *d, *v;
-
-	int blocks, block_size;
-
-	/* allocate space for the linear system */
-	cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ );
-	cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ );
-
-	/* build the linear system */
-	compute_blocks ( &blocks, &block_size, n );
-
-	complete_cubic_spline_init_a <<< blocks, block_size >>>
-		(a, h, n);
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "complete_cubic_spline_init_a done.... -> %d \n", cudaGetLastError ());
-
-	complete_cubic_spline_init_b <<< blocks, block_size >>>
-		(b, h, n);
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "complete_cubic_spline_init_b done.... -> %d \n", cudaGetLastError ());
-
-	complete_cubic_spline_init_c <<< blocks, block_size >>>
-		( c, h, n );
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "complete_cubic_spline_init_c done.... -> %d \n", cudaGetLastError ());
-
-	complete_cubic_spline_init_d <<< blocks, block_size >>>
-		(d, f, h, v0_r, vlast_r, n);
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "complete_cubic_spline_init_d done.... -> %d \n", cudaGetLastError ());
-
-	/*//fprintf( stderr, "i  a        b        c        d\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f\n", i, a[i], b[i], c[i], d[i] );*/
-
-
-	Cuda_Tridiagonal_Solve <<< 1, 1 >>>
-		( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
-	cudaThreadSynchronize ();
-	//fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ());
-	// Tridiagonal_Solve( &(a[1]), &(b[1]), &(c[1]), &(d[1]), &(v[1]), n-2 );
-
-
-	calculate_complete_cubic_spline_coef <<< blocks, block_size >>>
-		(data, offset, v, h, f, n);
-	cudaThreadSynchronize ();
-	//fprintf (stderr, " calculate_complete_cubic_spline_coef done.... -> %d \n", cudaGetLastError ());
-
-	/*//fprintf( stderr, "i  v  coef\n" );
-	  for( i = 0; i < n; ++i )
-	//fprintf( stderr, "%d  %f  %f  %f  %f  %f\n", 
-	i, v[i], coef[i].a, coef[i].b, coef[i].c, coef[i].d ); */
+    /* Device counterpart of Complete_Cubic_Spline. Every stage is one
+       kernel launch followed by cudaThreadSynchronize(): the four init
+       kernels fill a, b, c, d; the solve produces v; the last kernel
+       writes the coefficients into the table of `data` chosen by `offset`.
+       v0_r / vlast_r are passed on to complete_cubic_spline_init_d. */
+    real *a, *b, *c, *d, *v;
+    int blocks, block_size;
+
+    /* allocate space for the linear system */
+    cuda_malloc ((void **) &a, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &b, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &c, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &d, REAL_SIZE * n, 0, __LINE__ );
+    cuda_malloc ((void **) &v, REAL_SIZE * n, 1, __LINE__ );
+
+    /* build the linear system */
+    compute_blocks ( &blocks, &block_size, n );
+
+    complete_cubic_spline_init_a <<< blocks, block_size >>>
+        (a, h, n);
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "complete_cubic_spline_init_a done.... -> %d \n", cudaGetLastError ());
+
+    complete_cubic_spline_init_b <<< blocks, block_size >>>
+        (b, h, n);
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "complete_cubic_spline_init_b done.... -> %d \n", cudaGetLastError ());
+
+    complete_cubic_spline_init_c <<< blocks, block_size >>>
+        ( c, h, n );
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "complete_cubic_spline_init_c done.... -> %d \n", cudaGetLastError ());
+
+    complete_cubic_spline_init_d <<< blocks, block_size >>>
+        (d, f, h, v0_r, vlast_r, n);
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "complete_cubic_spline_init_d done.... -> %d \n", cudaGetLastError ());
+
+    /* the solve is launched with a single block of a single thread */
+    Cuda_Tridiagonal_Solve <<< 1, 1 >>>
+        ( &(a[0]), &(b[0]), &(c[0]), &(d[0]), &(v[0]), n );
+    cudaThreadSynchronize ();
+    //fprintf (stderr, "Tridiagonal_Solve done.... -> %d \n", cudaGetLastError ());
+
+    calculate_complete_cubic_spline_coef <<< blocks, block_size >>>
+        (data, offset, v, h, f, n);
+    cudaThreadSynchronize ();
+    //fprintf (stderr, " calculate_complete_cubic_spline_coef done.... -> %d \n", cudaGetLastError ());
+
+    /* all kernels have completed (synchronized above); release the scratch
+       buffers, which were previously leaked on every call */
+    cudaFree( a );
+    cudaFree( b );
+    cudaFree( c );
+    cudaFree( d );
+    cudaFree( v );
 }
 
 
@@ -475,168 +475,168 @@ void Cuda_Complete_Cubic_Spline( const real *h, const real *f, int v0_r, int vla
 
 void LR_Lookup( LR_lookup_table *t, real r, LR_data *y )
 {
-	int i;
-	real base, dif;
-
-	i = (int)(r * t->inv_dx);
-	if( i == 0 )  ++i;
-	base = (real)(i+1) * t->dx;
-	dif = r - base;
-	////fprintf( stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif );
-
-	y->e_vdW = ((t->vdW[i].d*dif + t->vdW[i].c)*dif + t->vdW[i].b)*dif + 
-		t->vdW[i].a;
-	y->CEvd = ((t->CEvd[i].d*dif + t->CEvd[i].c)*dif + 
-			t->CEvd[i].b)*dif + t->CEvd[i].a;
-	//y->CEvd = (3*t->vdW[i].d*dif + 2*t->vdW[i].c)*dif + t->vdW[i].b;
-
-	y->e_ele = ((t->ele[i].d*dif + t->ele[i].c)*dif + t->ele[i].b)*dif + 
-		t->ele[i].a;
-	y->CEclmb = ((t->CEclmb[i].d*dif + t->CEclmb[i].c)*dif + t->CEclmb[i].b)*dif +
-		t->CEclmb[i].a;
-
-	y->H = y->e_ele * EV_to_KCALpMOL / C_ele;
-	//y->H = ((t->H[i].d*dif + t->H[i].c)*dif + t->H[i].b)*dif + t->H[i].a;
+    /* Evaluates the tabulated splines of table t at distance r into y. */
+    int bin;
+    real base, dif;
+    const cubic_spline_coef *vdw, *cevd, *ele, *ceclmb;
+
+    bin = (int)(r * t->inv_dx);
+    if( bin == 0 )  bin = 1;
+    base = (real)(bin+1) * t->dx;
+    dif = r - base;
+
+    vdw = &t->vdW[bin];
+    cevd = &t->CEvd[bin];
+    ele = &t->ele[bin];
+    ceclmb = &t->CEclmb[bin];
+
+    /* each cubic is evaluated in nested form: ((d*x + c)*x + b)*x + a */
+    y->e_vdW = ((vdw->d*dif + vdw->c)*dif + vdw->b)*dif + vdw->a;
+    y->CEvd = ((cevd->d*dif + cevd->c)*dif + cevd->b)*dif + cevd->a;
+    y->e_ele = ((ele->d*dif + ele->c)*dif + ele->b)*dif + ele->a;
+    y->CEclmb = ((ceclmb->d*dif + ceclmb->c)*dif + ceclmb->b)*dif + ceclmb->a;
+
+    y->H = y->e_ele * EV_to_KCALpMOL / C_ele;    /* derived from e_ele; t->H is not consulted */
 }
 
 
 void Make_LR_Lookup_Table( reax_system *system, control_params *control )
 {
-	int i, j, r;
-	int num_atom_types;
-	int existing_types[MAX_ATOM_TYPES];
-	real dr;
-	real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
-	real v0_vdw, v0_ele, vlast_vdw, vlast_ele;
-	/* real rand_dist;
-	   real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr;
-	   real eele_abserr, eele_relerr, fele_abserr, fele_relerr;
-	   real evdw_maxerr, eele_maxerr;
-	   LR_data y, y_spline; */
-
-	/* initializations */
-	vlast_ele = 0;
-	vlast_vdw = 0;
-	v0_ele = 0;
-	v0_vdw = 0;
-
-	num_atom_types = system->reaxprm.num_atom_types;
-	dr = control->r_cut / control->tabulate;
-	h = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-	fh = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-	fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-	fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-	fele = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-	fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) );
-
-	/* allocate Long-Range LookUp Table space based on 
-	   number of atom types in the ffield file */
-	//LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
-	//for( i = 0; i < num_atom_types; ++i )
-	// LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
-
-	LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
-
-	/* most atom types in ffield file will not exist in the current
-	   simulation. to avoid unnecessary lookup table space, determine
-	   the atom types that exist in the current simulation */
-	for( i = 0; i < MAX_ATOM_TYPES; ++i )
-		existing_types[i] = 0;
-	for( i = 0; i < system->N; ++i )
-		existing_types[ system->atoms[i].type ] = 1;
-
-	/* fill in the lookup table entries for existing atom types.
-	   only lower half should be enough. */
-	for( i = 0; i < num_atom_types; ++i )
-		if( existing_types[i] )
-			for( j = i; j < num_atom_types; ++j )
-				if( existing_types[j] ) {
-					LR[ index_lr (i,j,num_atom_types) ].xmin = 0;
-					LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut;
-					LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1;
-					LR[ index_lr (i,j,num_atom_types) ].dx = dr;
-					LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
-					LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) 
-						malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data));
-					LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) 
-						malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-					LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) 
-						malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-					LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) 
-						malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-					LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) 
-						malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-					LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) 
-						malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
-
-					for( r = 1; r <= control->tabulate; ++r ) {
-						LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) );
-						h[r] = LR[ index_lr (i,j,num_atom_types) ].dx;
-						fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H;
-						fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW;
-						fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-						fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele;
-						fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-
-						if( r == 1 ){
-							v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-							v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-						}
-						else if( r == control->tabulate ){
-							vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
-							vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
-						}
-					}
-
-					/*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
-					  for( r = 1; r <= control->tabulate; ++r )
-					//fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
-					Natural_Cubic_Spline( &h[1], &fh[1], 
-							&(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 );
-
-					/*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
-					  for( r = 1; r <= control->tabulate; ++r )
-					//fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
-					//fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
-					 */
-					Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, 
-							&(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
-					Natural_Cubic_Spline( &h[1], &fCEvd[1], 
-							&(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
-
-					/*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
-					  for( r = 1; r <= control->tabulate; ++r )
-					//fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
-					//fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
-					 */
-					Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, 
-							&(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
-					Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
-							&(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
-				}
-
-	/***** //test LR-Lookup table
-	  evdw_maxerr = 0;
-	  eele_maxerr = 0;
-	  for( i = 0; i < num_atom_types; ++i )
-	  if( existing_types[i] )
-	  for( j = i; j < num_atom_types; ++j )
-	  if( existing_types[j] ) {
-	  for( r = 1; r <= 100; ++r ) {
-	  rand_dist = (real)rand()/RAND_MAX * control->r_cut;
-	  LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
-	  LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
-
-	  evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
-	  evdw_relerr = fabs(evdw_abserr / y.e_vdW);
-	  fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
-	  fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
-	  eele_abserr = fabs(y.e_ele - y_spline.e_ele);
-	  eele_relerr = fabs(eele_abserr / y.e_ele);
-	  fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
-	  fele_relerr = fabs(fele_abserr / y.CEclmb);
-
-	  if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
+    int i, j, r;
+    int num_atom_types;
+    int existing_types[MAX_ATOM_TYPES];
+    real dr;
+    real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
+    real v0_vdw, v0_ele, vlast_vdw, vlast_ele;
+    /* real rand_dist;
+       real evdw_abserr, evdw_relerr, fvdw_abserr, fvdw_relerr;
+       real eele_abserr, eele_relerr, fele_abserr, fele_relerr;
+       real evdw_maxerr, eele_maxerr;
+       LR_data y, y_spline; */
+
+    /* initializations */
+    vlast_ele = 0;
+    vlast_vdw = 0;
+    v0_ele = 0;
+    v0_vdw = 0;
+
+    num_atom_types = system->reaxprm.num_atom_types;
+    dr = control->r_cut / control->tabulate;
+    h = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fh = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fvdw = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fCEvd = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fele = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+    fCEclmb = (real*) malloc( (control->tabulate+1) * sizeof(real) );
+
+    /* allocate Long-Range LookUp Table space based on 
+       number of atom types in the ffield file */
+    //LR = (LR_lookup_table**) malloc( num_atom_types * sizeof(LR_lookup_table*) );
+    //for( i = 0; i < num_atom_types; ++i )
+    // LR[i] = (LR_lookup_table*) malloc(num_atom_types * sizeof(LR_lookup_table));
+
+    LR = (LR_lookup_table*) malloc(num_atom_types * num_atom_types * sizeof(LR_lookup_table));
+
+    /* most atom types in ffield file will not exist in the current
+       simulation. to avoid unnecessary lookup table space, determine
+       the atom types that exist in the current simulation */
+    for( i = 0; i < MAX_ATOM_TYPES; ++i )
+        existing_types[i] = 0;
+    for( i = 0; i < system->N; ++i )
+        existing_types[ system->atoms[i].type ] = 1;
+
+    /* fill in the lookup table entries for existing atom types.
+       only lower half should be enough. */
+    for( i = 0; i < num_atom_types; ++i )
+        if( existing_types[i] )
+            for( j = i; j < num_atom_types; ++j )
+                if( existing_types[j] ) {
+                    LR[ index_lr (i,j,num_atom_types) ].xmin = 0;
+                    LR[ index_lr (i,j,num_atom_types) ].xmax = control->r_cut;
+                    LR[ index_lr (i,j,num_atom_types) ].n = control->tabulate + 1;
+                    LR[ index_lr (i,j,num_atom_types) ].dx = dr;
+                    LR[ index_lr (i,j,num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
+                    LR[ index_lr (i,j,num_atom_types) ].y = (LR_data*) 
+                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(LR_data));
+                    LR[ index_lr (i,j,num_atom_types) ].H = (cubic_spline_coef*) 
+                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
+                    LR[ index_lr (i,j,num_atom_types) ].vdW = (cubic_spline_coef*) 
+                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
+                    LR[ index_lr (i,j,num_atom_types) ].CEvd = (cubic_spline_coef*) 
+                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
+                    LR[ index_lr (i,j,num_atom_types) ].ele = (cubic_spline_coef*) 
+                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
+                    LR[ index_lr (i,j,num_atom_types) ].CEclmb = (cubic_spline_coef*) 
+                        malloc(LR[ index_lr (i,j,num_atom_types) ].n * sizeof(cubic_spline_coef));
+
+                    for( r = 1; r <= control->tabulate; ++r ) {
+                        LR_vdW_Coulomb( system, control, i, j, r * dr, &(LR[ index_lr (i,j,num_atom_types) ].y[r]) );
+                        h[r] = LR[ index_lr (i,j,num_atom_types) ].dx;
+                        fh[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].H;
+                        fvdw[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_vdW;
+                        fCEvd[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
+                        fele[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].e_ele;
+                        fCEclmb[r] = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+
+                        if( r == 1 ){
+                            v0_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
+                            v0_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+                        }
+                        else if( r == control->tabulate ){
+                            vlast_vdw = LR[ index_lr (i,j,num_atom_types) ].y[r].CEvd;
+                            vlast_ele = LR[ index_lr (i,j,num_atom_types) ].y[r].CEclmb;
+                        }
+                    }
+
+                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
+                      for( r = 1; r <= control->tabulate; ++r )
+                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
+                    Natural_Cubic_Spline( &h[1], &fh[1], 
+                            &(LR[ index_lr (i,j,num_atom_types) ].H[1]), control->tabulate+1 );
+
+                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
+                      for( r = 1; r <= control->tabulate; ++r )
+                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
+                    //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
+                     */
+                    Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw, 
+                            &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
+                    Natural_Cubic_Spline( &h[1], &fCEvd[1], 
+                            &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
+
+                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
+                      for( r = 1; r <= control->tabulate; ++r )
+                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
+                    //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
+                     */
+                    Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele, 
+                            &(LR[ index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
+                    Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
+                            &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
+                }
+
+    /***** //test LR-Lookup table
+      evdw_maxerr = 0;
+      eele_maxerr = 0;
+      for( i = 0; i < num_atom_types; ++i )
+      if( existing_types[i] )
+      for( j = i; j < num_atom_types; ++j )
+      if( existing_types[j] ) {
+      for( r = 1; r <= 100; ++r ) {
+      rand_dist = (real)rand()/RAND_MAX * control->r_cut;
+      LR_vdW_Coulomb( system, control, i, j, rand_dist, &y );
+      LR_Lookup( &(LR[i][j]), rand_dist, &y_spline );
+
+      evdw_abserr = fabs(y.e_vdW - y_spline.e_vdW);
+      evdw_relerr = fabs(evdw_abserr / y.e_vdW);
+      fvdw_abserr = fabs(y.CEvd - y_spline.CEvd);
+      fvdw_relerr = fabs(fvdw_abserr / y.CEvd);
+      eele_abserr = fabs(y.e_ele - y_spline.e_ele);
+      eele_relerr = fabs(eele_abserr / y.e_ele);
+      fele_abserr = fabs(y.CEclmb - y_spline.CEclmb);
+      fele_relerr = fabs(fele_abserr / y.CEclmb);
+
+      if( evdw_relerr > 1e-10 || eele_relerr > 1e-10 ){
 //fprintf( stderr, "rand_dist = %24.15e\n", rand_dist );
 //fprintf( stderr, "%24.15e  %24.15e  %24.15e  %24.15e\n",
 y.H, y_spline.H, 
@@ -661,7 +661,7 @@ eele_maxerr = eele_relerr;
 }
 //fprintf( stderr, "evdw_maxerr: %24.15e\n", evdw_maxerr );
 //fprintf( stderr, "eele_maxerr: %24.15e\n", eele_maxerr );
-	 *******/
+     *******/
 
 free(h);
 free(fh);
@@ -673,58 +673,58 @@ free(fCEclmb);
 
 void copy_LR_table_to_device (reax_system *system, control_params *control)
 {
-	int i, j, r;
-	int num_atom_types;
-	int existing_types[MAX_ATOM_TYPES];
-	LR_data *d_y;
-	cubic_spline_coef *temp;
+    int i, j, r;    /* NOTE: r is not used in this function */
+    int num_atom_types;
+    int existing_types[MAX_ATOM_TYPES];
+    LR_data *d_y;    /* device buffer receiving one table's y array */
+    cubic_spline_coef *temp;    /* device buffer receiving one coefficient array; reused for each member */
 
-	num_atom_types = system->reaxprm.num_atom_types;
+    num_atom_types = system->reaxprm.num_atom_types;
 
-	//fprintf (stderr, "Copying the LR Lookyp Table to the device ... \n");
+    //fprintf (stderr, "Copying the LR Lookup Table to the device ... \n");
 
-	cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE );
+    cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE );    /* one table struct per (i,j) pair */
 
-	for( i = 0; i < MAX_ATOM_TYPES; ++i )
-		existing_types[i] = 0;
+    for( i = 0; i < MAX_ATOM_TYPES; ++i )
+        existing_types[i] = 0;
 
-	for( i = 0; i < system->N; ++i )
-		existing_types[ system->atoms[i].type ] = 1;
+    for( i = 0; i < system->N; ++i )
+        existing_types[ system->atoms[i].type ] = 1;    /* flag every type that occurs in system->atoms */
 
-	copy_host_device ( LR, d_LR, LR_LOOKUP_TABLE_SIZE * (num_atom_types * num_atom_types), cudaMemcpyHostToDevice, RES_LR_LOOKUP_TABLE );
+    copy_host_device ( LR, d_LR, LR_LOOKUP_TABLE_SIZE * (num_atom_types * num_atom_types), cudaMemcpyHostToDevice, RES_LR_LOOKUP_TABLE );    /* struct-level copy; pointer members are overwritten below for the flagged pairs */
 
-	for( i = 0; i < num_atom_types; ++i )
-		if( existing_types[i] )
-			for( j = i; j < num_atom_types; ++j )
+    for( i = 0; i < num_atom_types; ++i )
+        if( existing_types[i] )
+            for( j = i; j < num_atom_types; ++j )
 
-				if( existing_types[j] ) {
+                if( existing_types[j] ) {
 
-					cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y );
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, LR_DATA_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y );
-					copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y );
+                    cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y );    /* y: allocate a device buffer, ... */
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].y, d_y, LR_DATA_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y );    /* ... copy the host array into it, ... */
+                    copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y );    /* ... and store the buffer's address in the device-side struct */
 
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H );
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_H );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H );
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H );    /* same three steps for H */
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].H, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_H );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H );
 
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW );
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW );
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW );    /* same three steps for vdW */
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].vdW, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW );
 
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD );
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD );
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD );    /* same three steps for CEvd */
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEvd, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD );
 
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE );
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE );
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE );    /* same three steps for ele */
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].ele, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE );
 
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB );
-					copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB );
-				}
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB );    /* same three steps for CEclmb */
+                    copy_host_device ( LR [ index_lr (i, j, num_atom_types) ].CEclmb, temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB );
+                }
 
-	//fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n");
+    //fprintf (stderr, "Copy of the LR Lookup Table to the device complete ... \n");
 }
 
 
@@ -753,159 +753,159 @@ void copy_LR_table_to_device (reax_system *system, control_params *control)
 //////////////////////////////////////////////////////////////////////////
 
 GLOBAL void calculate_LR_Values ( LR_lookup_table *d_LR, real *h, real *fh, real *fvdw, real *fCEvd, real *fele, real *fCEclmb, 
-		global_parameters g_params, two_body_parameters *tbp, 
-		control_params *control, int i, 
-		int j, int num_atom_types, LR_data *data, real dr, int count )
+        global_parameters g_params, two_body_parameters *tbp, 
+        control_params *control, int i, 
+        int j, int num_atom_types, LR_data *data, real dr, int count )
 {
-	int r = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( r == 0 || r > count ) return;
+    int r = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( r == 0 || r > count ) return;
 
-	LR_vdW_Coulomb ( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
+    LR_vdW_Coulomb ( g_params, tbp, control, i, j, r * dr, &data[r], num_atom_types );
 
-	h[r] = d_LR[ index_lr (i, j, num_atom_types) ].dx;
-	fh[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].H;
-	fvdw[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_vdW;
-	fCEvd[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEvd;
-	fele[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_ele;
-	fCEclmb[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEclmb;
+    h[r] = d_LR[ index_lr (i, j, num_atom_types) ].dx;
+    fh[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].H;
+    fvdw[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_vdW;
+    fCEvd[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEvd;
+    fele[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].e_ele;
+    fCEclmb[r] = d_LR[ index_lr (i, j, num_atom_types) ].y[r].CEclmb;
 }
 
 GLOBAL void init_LR_values ( LR_lookup_table *d_LR, control_params *control, real dr, int i, int j, int num_atom_types )
 {
-	d_LR[ index_lr (i, j, num_atom_types) ].xmin = 0;
-	d_LR[ index_lr (i, j, num_atom_types) ].xmax = control->r_cut;
-	d_LR[ index_lr (i, j, num_atom_types) ].n = control->tabulate + 1;
-	d_LR[ index_lr (i, j, num_atom_types) ].dx = dr;
-	d_LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
+    d_LR[ index_lr (i, j, num_atom_types) ].xmin = 0;
+    d_LR[ index_lr (i, j, num_atom_types) ].xmax = control->r_cut;
+    d_LR[ index_lr (i, j, num_atom_types) ].n = control->tabulate + 1;
+    d_LR[ index_lr (i, j, num_atom_types) ].dx = dr;
+    d_LR[ index_lr (i, j, num_atom_types) ].inv_dx = control->tabulate / control->r_cut;
 }
 
 void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control )
 {
-	int i, j, r;
-	int num_atom_types;
-	int existing_types[MAX_ATOM_TYPES];
-	real dr;
-	real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
-
-	int v0_vdw_r, v0_ele_r, vlast_vdw_r, vlast_ele_r;
-
-	void *temp;
-	LR_data *d_y;
-	int blocks, block_size;
-
-	/* initializations */
-	vlast_ele_r = 0;
-	vlast_vdw_r = 0;
-	v0_ele_r = 0;
-	v0_vdw_r = 0;
-
-	num_atom_types = system->reaxprm.num_atom_types;
-	dr = control->r_cut / control->tabulate;
-
-	cuda_malloc ((void **) &h, 			REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y);
-	cuda_malloc ((void **) &fh, 		REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H);
-	cuda_malloc ((void **) &fvdw, 		REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW);
-	cuda_malloc ((void **) &fCEvd, 	REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD);
-	cuda_malloc ((void **) &fele, 		REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE);
-	cuda_malloc ((void **) &fCEclmb, 	REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB);
-
-	/* allocate Long-Range LookUp Table space based on 
-	   number of atom types in the ffield file */
-	cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE );
-
-	/* most atom types in ffield file will not exist in the current
-	   simulation. to avoid unnecessary lookup table space, determine
-	   the atom types that exist in the current simulation */
-	for( i = 0; i < MAX_ATOM_TYPES; ++i )
-		existing_types[i] = 0;
-
-	for( i = 0; i < system->N; ++i )
-		existing_types[ system->atoms[i].type ] = 1;
-
-	/* fill in the lookup table entries for existing atom types.
-	   only lower half should be enough. */
-	for( i = 0; i < num_atom_types; ++i )
-		if( existing_types[i] )
-			for( j = i; j < num_atom_types; ++j )
-				if( existing_types[j] ) {
-
-					init_LR_values <<< 1, 1 >>>
-						( d_LR, (control_params *)control->d_control, dr, i, j, num_atom_types );
-					cudaThreadSynchronize ();
-					//fprintf (stderr, "Done with init LR Values --> %d \n", cudaGetLastError ());
-
-					cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y );
-					copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y );
-
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H );
-
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW );
-
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD );
-
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE );
-
-					cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB );
-					copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB );
-
-					//TODO check the bounds
-					compute_blocks ( &blocks, &block_size, control->tabulate );
-					calculate_LR_Values <<<blocks, block_size>>>
-						( d_LR, h, fh, fvdw, fCEvd, fele, fCEclmb, 
-						  system->reaxprm.d_gp, system->reaxprm.d_tbp, 
-						  (control_params *)control->d_control, i, j, system->reaxprm.num_atom_types, 
-						  d_y, dr, control->tabulate );
-					cudaThreadSynchronize ();
-
-					//fprintf (stderr, "Done with LR Values Calculation --> %d \n", cudaGetLastError ());
-
-					/*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
-					  for( r = 1; r <= control->tabulate; ++r )
-					//fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
-					Cuda_Natural_Cubic_Spline( h+1, fh+1, 
-							d_LR + index_lr (i,j,num_atom_types), SPLINE_H_OFFSET, control->tabulate+1 );
-
-					/*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
-					  for( r = 1; r <= control->tabulate; ++r )
-					//fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
-					//fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
-					 */
-
-					//TODO -- Pass the right v0 and vlast for the cubic spline
-					//Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, 
-					//		 &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
-					//Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], 
-					//		&(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
-					Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, 
-							d_LR + index_lr (i,j,num_atom_types) , SPLINE_VDW_OFFSET, control->tabulate+1 );
-					Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], 
-							d_LR + index_lr (i,j,num_atom_types) , SPLINE_CEVD_OFFSET, control->tabulate+1 );
-
-					/*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
-					  for( r = 1; r <= control->tabulate; ++r )
-					//fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
-					//fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
-					 */
-					//Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, 
-					//		 &(LR[index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
-					//Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
-					//		&(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
-					Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, 
-							d_LR + index_lr (i,j,num_atom_types) , SPLINE_ELE_OFFSET, control->tabulate+1 );
-					Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
-							d_LR + index_lr (i,j,num_atom_types) , SPLINE_CECLMB_OFFSET, control->tabulate+1 );
-				}
-
-	cuda_free(h, RES_LR_LOOKUP_Y);
-	cuda_free(fh, RES_LR_LOOKUP_H);
-	cuda_free(fvdw, RES_LR_LOOKUP_VDW);
-	cuda_free(fCEvd, RES_LR_LOOKUP_CEVD);
-	cuda_free(fele, RES_LR_LOOKUP_ELE);
-	cuda_free(fCEclmb, RES_LR_LOOKUP_CECLMB);
+    int i, j, r;
+    int num_atom_types;
+    int existing_types[MAX_ATOM_TYPES];
+    real dr;
+    real *h, *fh, *fvdw, *fele, *fCEvd, *fCEclmb;
+
+    int v0_vdw_r, v0_ele_r, vlast_vdw_r, vlast_ele_r;
+
+    void *temp;
+    LR_data *d_y;
+    int blocks, block_size;
+
+    /* initializations */
+    vlast_ele_r = 0;
+    vlast_vdw_r = 0;
+    v0_ele_r = 0;
+    v0_vdw_r = 0;
+
+    num_atom_types = system->reaxprm.num_atom_types;
+    dr = control->r_cut / control->tabulate;
+
+    cuda_malloc ((void **) &h,             REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y);
+    cuda_malloc ((void **) &fh,         REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H);
+    cuda_malloc ((void **) &fvdw,         REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW);
+    cuda_malloc ((void **) &fCEvd,     REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD);
+    cuda_malloc ((void **) &fele,         REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE);
+    cuda_malloc ((void **) &fCEclmb,     REAL_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB);
+
+    /* allocate Long-Range LookUp Table space based on 
+       number of atom types in the ffield file */
+    cuda_malloc ((void **) &d_LR, LR_LOOKUP_TABLE_SIZE * ( num_atom_types * num_atom_types ), 0, RES_LR_LOOKUP_TABLE );
+
+    /* most atom types in ffield file will not exist in the current
+       simulation. to avoid unnecessary lookup table space, determine
+       the atom types that exist in the current simulation */
+    for( i = 0; i < MAX_ATOM_TYPES; ++i )
+        existing_types[i] = 0;
+
+    for( i = 0; i < system->N; ++i )
+        existing_types[ system->atoms[i].type ] = 1;
+
+    /* fill in the lookup table entries for existing atom types.
+       only lower half should be enough. */
+    for( i = 0; i < num_atom_types; ++i )
+        if( existing_types[i] )
+            for( j = i; j < num_atom_types; ++j )
+                if( existing_types[j] ) {
+
+                    init_LR_values <<< 1, 1 >>>
+                        ( d_LR, (control_params *)control->d_control, dr, i, j, num_atom_types );
+                    cudaThreadSynchronize ();
+                    //fprintf (stderr, "Done with init LR Values --> %d \n", cudaGetLastError ());
+
+                    cuda_malloc ((void **) &d_y, LR_DATA_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_Y );
+                    copy_host_device ( &d_y, &d_LR [ index_lr (i, j, num_atom_types) ].y, LR_DATA_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_Y );
+
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_H );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].H, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_H );
+
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_VDW );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].vdW,CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_VDW );
+
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CEVD );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEvd, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CEVD );
+
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_ELE );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].ele, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_ELE );
+
+                    cuda_malloc ((void **) &temp, CUBIC_SPLINE_COEF_SIZE * (control->tabulate + 1), 0, RES_LR_LOOKUP_CECLMB );
+                    copy_host_device ( &temp, &d_LR [ index_lr (i, j, num_atom_types) ].CEclmb, CUBIC_SPLINE_COEF_PTR_SIZE, cudaMemcpyHostToDevice, RES_LR_LOOKUP_CECLMB );
+
+                    //TODO check the bounds
+                    compute_blocks ( &blocks, &block_size, control->tabulate );
+                    calculate_LR_Values <<<blocks, block_size>>>
+                        ( d_LR, h, fh, fvdw, fCEvd, fele, fCEclmb, 
+                          system->reaxprm.d_gp, system->reaxprm.d_tbp, 
+                          (control_params *)control->d_control, i, j, system->reaxprm.num_atom_types, 
+                          d_y, dr, control->tabulate );
+                    cudaThreadSynchronize ();
+
+                    //fprintf (stderr, "Done with LR Values Calculation --> %d \n", cudaGetLastError ());
+
+                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fh" );
+                      for( r = 1; r <= control->tabulate; ++r )
+                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fh[r] ); */
+                    Cuda_Natural_Cubic_Spline( h+1, fh+1, 
+                            d_LR + index_lr (i,j,num_atom_types), SPLINE_H_OFFSET, control->tabulate+1 );
+
+                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fvdw" );
+                      for( r = 1; r <= control->tabulate; ++r )
+                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fvdw[r] );
+                    //fprintf( stderr, "v0_vdw: %f, vlast_vdw: %f\n", v0_vdw, vlast_vdw );
+                     */
+
+                    //TODO -- Pass the right v0 and vlast for the cubic spline
+                    //Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, 
+                    //         &(LR[ index_lr (i,j,num_atom_types) ].vdW[1]), control->tabulate+1 );
+                    //Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], 
+                    //        &(LR[ index_lr (i,j,num_atom_types) ].CEvd[1]), control->tabulate+1 );
+                    Cuda_Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw_r, vlast_vdw_r, 
+                            d_LR + index_lr (i,j,num_atom_types) , SPLINE_VDW_OFFSET, control->tabulate+1 );
+                    Cuda_Natural_Cubic_Spline( &h[1], &fCEvd[1], 
+                            d_LR + index_lr (i,j,num_atom_types) , SPLINE_CEVD_OFFSET, control->tabulate+1 );
+
+                    /*//fprintf( stderr, "%-6s  %-6s  %-6s\n", "r", "h", "fele" );
+                      for( r = 1; r <= control->tabulate; ++r )
+                    //fprintf( stderr, "%f  %f  %f\n", r * dr, h[r], fele[r] );
+                    //fprintf( stderr, "v0_ele: %f, vlast_ele: %f\n", v0_ele, vlast_ele );
+                     */
+                    //Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, 
+                    //         &(LR[index_lr (i,j,num_atom_types) ].ele[1]), control->tabulate+1 );
+                    //Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
+                    //        &(LR[ index_lr (i,j,num_atom_types) ].CEclmb[1]), control->tabulate+1 );
+                    Cuda_Complete_Cubic_Spline( &h[1], &fele[1], v0_ele_r, vlast_ele_r, 
+                            d_LR + index_lr (i,j,num_atom_types) , SPLINE_ELE_OFFSET, control->tabulate+1 );
+                    Cuda_Natural_Cubic_Spline( &h[1], &fCEclmb[1], 
+                            d_LR + index_lr (i,j,num_atom_types) , SPLINE_CECLMB_OFFSET, control->tabulate+1 );
+                }
+
+    cuda_free(h, RES_LR_LOOKUP_Y);
+    cuda_free(fh, RES_LR_LOOKUP_H);
+    cuda_free(fvdw, RES_LR_LOOKUP_VDW);
+    cuda_free(fCEvd, RES_LR_LOOKUP_CEVD);
+    cuda_free(fele, RES_LR_LOOKUP_ELE);
+    cuda_free(fCEclmb, RES_LR_LOOKUP_CECLMB);
 }
 
 
@@ -923,36 +923,36 @@ void Cuda_Make_LR_Lookup_Table( reax_system *system, control_params *control )
 
 int Lookup_Index_Of( real x, lookup_table* t )
 {
-	return (int)( t->a * ( x - t->xmin ) );
+    return (int)( t->a * ( x - t->xmin ) );
 }
 
 
 real Lookup( real x, lookup_table* t )
 {
-	real x1, x2;
-	real b;
-	int i;
-
-	/* if ( x < t->xmin) 
-	   {
-	//fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
-	exit(0);
-	}
-	if ( x > t->xmax) 
-	{
-	//fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
-	exit(0);
-	} */
-
-	i = Lookup_Index_Of( x, t );
-	x1 = i * t->dx + t->xmin;
-	x2 = (i+1) * t->dx + t->xmin;
-
-	b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx;
-	// //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
-	//          i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x));
-
-	return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b;
+    real x1, x2;
+    real b;
+    int i;
+
+    /* if ( x < t->xmin) 
+       {
+    //fprintf(stderr,"Domain check %lf > %lf\n",t->xmin,x);
+    exit(0);
+    }
+    if ( x > t->xmax) 
+    {
+    //fprintf(stderr,"Domain check %lf < %lf\n",t->xmax,x);
+    exit(0);
+    } */
+
+    i = Lookup_Index_Of( x, t );
+    x1 = i * t->dx + t->xmin;
+    x2 = (i+1) * t->dx + t->xmin;
+
+    b = ( x2 * t->y[i] - x1 * t->y[i+1] ) * t->inv_dx;
+    // //fprintf( stdout,"SLookup_Entry: %d, %lf, %lf, %lf, %lf: %lf, %lf\n",
+    //          i,x1,x2,x,b,t->one_over_dx*(t->y[i+1]-t->y[i])*x+b,exp(x));
+
+    return t->inv_dx * ( t->y[i+1] - t->y[i] ) * x + b;
 }
 
 
diff --git a/PuReMD-GPU/src/matvec.cu b/PuReMD-GPU/src/matvec.cu
index 2f0b7bb0..bf08cdf8 100644
--- a/PuReMD-GPU/src/matvec.cu
+++ b/PuReMD-GPU/src/matvec.cu
@@ -24,22 +24,22 @@
 //one thread per row
 GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows)
 {
-	real results_row = 0;
-	int col;
-	real val;
+    real results_row = 0;
+    int col;
+    real val;
 
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= rows) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= rows) return;
 
-	for (int c = H.start[i]; c < H.end[i]; c++)
-	{
-		col = H.entries [c].j;
-		val = H.entries[c].val;
+    for (int c = H.start[i]; c < H.end[i]; c++)
+    {
+        col = H.entries [c].j;
+        val = H.entries[c].val;
 
-		results_row += val * vec [col];
-	}
+        results_row += val * vec [col];
+    }
 
-	results [i] = results_row;
+    results [i] = results_row;
 }
 
 //32 thread warp per matrix row.
@@ -47,43 +47,43 @@ GLOBAL void Cuda_Matvec (sparse_matrix H, real *vec, real *results, int rows)
 // <<< system->N, 32 >>>
 GLOBAL void Cuda_Matvec_csr (sparse_matrix H, real *vec, real *results, int num_rows)
 {
-	extern __shared__ real vals [];
-	int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
-	int warp_id = thread_id / 32;
-	int lane = thread_id & (32 - 1);
-
-	int row_start;
-	int row_end;
-
-	// one warp per row
-	//int row = warp_id;
-	int row = warp_id;
-	//if (row < num_rows)
-	{
-		vals[threadIdx.x] = 0;
-
-		if (row < num_rows) {
-			row_start = H.start[row];
-			row_end = H.end[row];
-
-			// compute running sum per thread
-			for(int jj = row_start + lane; jj < row_end; jj += 32)
-				vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ];
-			//vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ];
-		}
-
-		__syncthreads ();
-
-		// parallel reduction in shared memory
-		//SIMD instructions with a WARP are synchronous -- so we do not need to synch here
-		if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads();
-		if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads ();
-		if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads ();
-		if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads ();
-		if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads ();
-
-		// first thread writes the result
-		if (lane == 0 && row < num_rows)
-			results[row] = vals[threadIdx.x];
-	}
+    extern __shared__ real vals [];
+    int thread_id = blockDim.x * blockIdx.x + threadIdx.x;
+    int warp_id = thread_id / 32;
+    int lane = thread_id & (32 - 1);
+
+    int row_start;
+    int row_end;
+
+    // one warp per row
+    //int row = warp_id;
+    int row = warp_id;
+    //if (row < num_rows)
+    {
+        vals[threadIdx.x] = 0;
+
+        if (row < num_rows) {
+            row_start = H.start[row];
+            row_end = H.end[row];
+
+            // compute running sum per thread
+            for(int jj = row_start + lane; jj < row_end; jj += 32)
+                vals[threadIdx.x] += H.entries[jj].val * vec [ H.entries[jj].j ];
+            //vals[threadIdx.x] += H.val[jj] * vec [ H.j[jj] ];
+        }
+
+        __syncthreads ();
+
+        // parallel reduction in shared memory
+        //SIMD instructions with a WARP are synchronous -- so we do not need to synch here
+        if (lane < 16) vals[threadIdx.x] += vals[threadIdx.x + 16]; __syncthreads();
+        if (lane < 8) vals[threadIdx.x] += vals[threadIdx.x + 8]; __syncthreads ();
+        if (lane < 4) vals[threadIdx.x] += vals[threadIdx.x + 4]; __syncthreads ();
+        if (lane < 2) vals[threadIdx.x] += vals[threadIdx.x + 2]; __syncthreads ();
+        if (lane < 1) vals[threadIdx.x] += vals[threadIdx.x + 1]; __syncthreads ();
+
+        // first thread writes the result
+        if (lane == 0 && row < num_rows)
+            results[row] = vals[threadIdx.x];
+    }
 }
diff --git a/PuReMD-GPU/src/neighbors.cu b/PuReMD-GPU/src/neighbors.cu
index 7bf8ed2a..90779538 100644
--- a/PuReMD-GPU/src/neighbors.cu
+++ b/PuReMD-GPU/src/neighbors.cu
@@ -30,1383 +30,1383 @@
 
 extern inline DEVICE int index_grid (int blocksize)
 {
-	return blockIdx.x * gridDim.y * gridDim.z * blocksize +  
-		blockIdx.y * gridDim.z * blocksize +  
-		blockIdx.z * blocksize ;
+    return blockIdx.x * gridDim.y * gridDim.z * blocksize +  
+        blockIdx.y * gridDim.z * blocksize +  
+        blockIdx.z * blocksize ;
 }
 
 extern inline HOST_DEVICE int index_grid_debug (int x, int y, int z, int blocksize)
 {
-	return x * 8 * 8 * blocksize +  
-		y * 8 * blocksize +  
-		z * blocksize ;
+    return x * 8 * 8 * blocksize +  
+        y * 8 * blocksize +  
+        z * blocksize ;
 }
 
 inline HOST_DEVICE real DistSqr_to_CP( rvec cp, rvec x )
 {
-	int  i;
-	real d_sqr = 0;
+    int  i;
+    real d_sqr = 0;
 
-	for( i = 0; i < 3; ++i )
-		if( cp[i] > NEG_INF )
-			d_sqr += SQR( cp[i] - x[i] );
+    for( i = 0; i < 3; ++i )
+        if( cp[i] > NEG_INF )
+            d_sqr += SQR( cp[i] - x[i] );
 
-	return d_sqr;
+    return d_sqr;
 }
 
 HOST_DEVICE int Are_Far_Neighbors( rvec x1, rvec x2, simulation_box *box, 
-		real cutoff, far_neighbor_data *data )
+        real cutoff, far_neighbor_data *data )
 {
-	real norm_sqr, d, tmp;
-	int i;
-
-	norm_sqr = 0;
-
-	for( i = 0; i < 3; i++ ) { 
-		d = x2[i] - x1[i];
-		tmp = SQR(d);
-
-		if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
-			if( x2[i] > x1[i] ) { 
-				d -= box->box_norms[i];
-				data->rel_box[i] = -1; 
-			}   
-			else {
-				d += box->box_norms[i];
-				data->rel_box[i] = +1; 
-			}   
-
-			data->dvec[i] = d;
-			norm_sqr += SQR(d);
-		}   
-		else {
-			data->dvec[i] = d;
-			norm_sqr += tmp;
-			data->rel_box[i] = 0;
-		}   
-	}
-
-	if( norm_sqr <= SQR(cutoff) ){
-		data->d = sqrt(norm_sqr);
-		return 1;
-	}
-
-	return 0;
+    real norm_sqr, d, tmp;
+    int i;
+
+    norm_sqr = 0;
+
+    for( i = 0; i < 3; i++ ) { 
+        d = x2[i] - x1[i];
+        tmp = SQR(d);
+
+        if( tmp >= SQR( box->box_norms[i] / 2.0 ) ) {    
+            if( x2[i] > x1[i] ) { 
+                d -= box->box_norms[i];
+                data->rel_box[i] = -1; 
+            }   
+            else {
+                d += box->box_norms[i];
+                data->rel_box[i] = +1; 
+            }   
+
+            data->dvec[i] = d;
+            norm_sqr += SQR(d);
+        }   
+        else {
+            data->dvec[i] = d;
+            norm_sqr += tmp;
+            data->rel_box[i] = 0;
+        }   
+    }
+
+    if( norm_sqr <= SQR(cutoff) ){
+        data->d = sqrt(norm_sqr);
+        return 1;
+    }
+
+    return 0;
 }
 
 void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace,
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-	int  i, j, k, l, m, itr;
-	int  x, y, z;
-	int  atom1, atom2, max;
-	int  num_far;
-	int  *nbr_atoms;
-	ivec *nbrs;
-	rvec *nbrs_cp;
-	grid *g;
-	list *far_nbrs;
-	far_neighbor_data *nbr_data;
-	real t_start, t_elapsed;
-
-	// fprintf( stderr, "\n\tentered nbrs - " );
-	g = &( system->g );
-	far_nbrs = (*lists) + FAR_NBRS;
-	Bin_Atoms( system, workspace );
-
-	t_start = Get_Time( );
-
-	// fprintf( stderr, "atoms sorted - " );
-	num_far = 0;
-
-	/* first pick up a cell in the grid */
-	for( i = 0; i < g->ncell[0]; i++ )
-		for( j = 0; j < g->ncell[1]; j++ )
-			for( k = 0; k < g->ncell[2]; k++ ) {
-				nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ];
-				nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ];
-				//fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-
-				/* pick up an atom from the current cell */
-				for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){
-					atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ];
-					Set_Start_Index( atom1, num_far, far_nbrs );
-					//fprintf( stderr, "\tatom %d\n", atom1 );
-
-					itr = 0;
-					while( nbrs[itr][0] >= 0 ){
-						x = nbrs[itr][0];
-						y = nbrs[itr][1];
-						z = nbrs[itr][2];
-						//fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
-
-						if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-								SQR(control->vlist_cut) ) { 	
-							nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ];
-							max = g->top[ index_grid_3d (x,y,z,g) ];
-							//fprintf( stderr, "\t\tmax: %d\n", max );
-
-							/* pick up another atom from the neighbor cell */
-							for( m = 0; m < max; ++m ) {
-								atom2 = nbr_atoms[m];
-								if( atom1 > atom2 ) {
-									nbr_data = &(far_nbrs->select.far_nbr_list[num_far]);
-									if(Are_Far_Neighbors(system->atoms[atom1].x,
-												system->atoms[atom2].x, 
-												&(system->box), control->vlist_cut, 
-												nbr_data)) {
-										nbr_data->nbr = atom2;
-
-										++num_far;
-									}
-								}
-							}
-						}
-
-						++itr;
-					}
-
-					Set_End_Index( atom1, num_far, far_nbrs );
-					//fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", 
-					//  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
-					//  itr); 
-				}
-			}
-
-	fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far);
-
-	if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) {
-		workspace->realloc.num_far = num_far;
-		if( num_far > far_nbrs->num_intrs ){
-			fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
-					data->step, num_far, far_nbrs->num_intrs );
-			exit( INSUFFICIENT_SPACE );
-		}
-	}
-
-	t_elapsed = Get_Timing_Info( t_start );
-	data->timing.nbrs += t_elapsed;
+    int  i, j, k, l, m, itr;
+    int  x, y, z;
+    int  atom1, atom2, max;
+    int  num_far;
+    int  *nbr_atoms;
+    ivec *nbrs;
+    rvec *nbrs_cp;
+    grid *g;
+    list *far_nbrs;
+    far_neighbor_data *nbr_data;
+    real t_start, t_elapsed;
+
+    // fprintf( stderr, "\n\tentered nbrs - " );
+    g = &( system->g );
+    far_nbrs = (*lists) + FAR_NBRS;
+    Bin_Atoms( system, workspace );
+
+    t_start = Get_Time( );
+
+    // fprintf( stderr, "atoms sorted - " );
+    num_far = 0;
+
+    /* first pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs = &g->nbrs[ index_grid_nbrs (i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[ index_grid_nbrs (i,j,k,0,g) ];
+                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+
+                /* pick up an atom from the current cell */
+                for(l = 0; l < g->top[ index_grid_3d (i,j,k,g) ]; ++l ){
+                    atom1 = g->atoms[ index_grid_atoms (i,j,k,l,g) ];
+                    Set_Start_Index( atom1, num_far, far_nbrs );
+                    //fprintf( stderr, "\tatom %d\n", atom1 );
+
+                    itr = 0;
+                    while( nbrs[itr][0] >= 0 ){
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+                        //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
+
+                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                                SQR(control->vlist_cut) ) {     
+                            nbr_atoms = &g->atoms[ index_grid_atoms (x,y,z,0,g) ];
+                            max = g->top[ index_grid_3d (x,y,z,g) ];
+                            //fprintf( stderr, "\t\tmax: %d\n", max );
+
+                            /* pick up another atom from the neighbor cell */
+                            for( m = 0; m < max; ++m ) {
+                                atom2 = nbr_atoms[m];
+                                if( atom1 > atom2 ) {
+                                    nbr_data = &(far_nbrs->select.far_nbr_list[num_far]);
+                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
+                                                system->atoms[atom2].x, 
+                                                &(system->box), control->vlist_cut, 
+                                                nbr_data)) {
+                                        nbr_data->nbr = atom2;
+
+                                        ++num_far;
+                                    }
+                                }
+                            }
+                        }
+
+                        ++itr;
+                    }
+
+                    Set_End_Index( atom1, num_far, far_nbrs );
+                    //fprintf(stderr, "i:%d, start: %d, end: %d - itr: %d\n", 
+                    //  atom1,Start_Index(atom1,far_nbrs),End_Index(atom1,far_nbrs),
+                    //  itr); 
+                }
+            }
+
+    fprintf (stderr, " TOTAL HOST NEIGHBORS : %d \n", num_far);
+
+    if( num_far > far_nbrs->num_intrs * DANGER_ZONE ) {
+        workspace->realloc.num_far = num_far;
+        if( num_far > far_nbrs->num_intrs ){
+            fprintf( stderr, "step%d-ran out of space on far_nbrs: top=%d, max=%d",
+                    data->step, num_far, far_nbrs->num_intrs );
+            exit( INSUFFICIENT_SPACE );
+        }
+    }
+
+    t_elapsed = Get_Timing_Info( t_start );
+    data->timing.nbrs += t_elapsed;
 
 #if defined(DEBUG)
-	for( i = 0; i < system->N; ++i ) {
-		qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-				Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-				compare_far_nbrs ); 
-	}
+    for( i = 0; i < system->N; ++i ) {
+        qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
+                Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
+                compare_far_nbrs ); 
+    }
 #endif
 #if defined(DEBUG_FOCUS)  
-	//fprintf( stderr, "nbrs - ");
-	//fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
+    //fprintf( stderr, "nbrs - ");
+    //fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
 #endif
 #if defined(TEST_ENERGY)
-	//Print_Far_Neighbors( system, control, workspace, lists );
+    //Print_Far_Neighbors( system, control, workspace, lists );
 #endif
 }
 
 
 int Estimate_NumNeighbors( reax_system *system, control_params *control, 
-		static_storage *workspace, list **lists )
+        static_storage *workspace, list **lists )
 {
-	int  i, j, k, l, m, itr;
-	int  x, y, z;
-	int  atom1, atom2, max;
-	int  num_far;
-	int  *nbr_atoms;
-	ivec *nbrs;
-	rvec *nbrs_cp;
-	grid *g;
-	far_neighbor_data nbr_data;
-
-	int 	start = 0, finish = 0;
-
-	// fprintf( stderr, "\n\tentered nbrs - " );
-	g = &( system->g );
-	Bin_Atoms( system, workspace );
-	// fprintf( stderr, "atoms sorted - " );
-	num_far = 0;
-	g->max_cuda_nbrs = 0;
-
-	/* first pick up a cell in the grid */
-	for( i = 0; i < g->ncell[0]; i++ )
-		for( j = 0; j < g->ncell[1]; j++ )
-			for( k = 0; k < g->ncell[2]; k++ ) {
-				nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ];
-				nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ];
-				//fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-
-				/* pick up an atom from the current cell */
-				for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){
-					atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ];
-					start = num_far;
-
-					itr = 0;
-					while( nbrs[itr][0] >= 0 ){
-						x = nbrs[itr][0];
-						y = nbrs[itr][1];
-						z = nbrs[itr][2];
-						//fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
-
-						if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-								SQR(control->vlist_cut) ) { 	
-							nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ];
-							max = g->top[index_grid_3d (x,y,z,g) ];
-							//fprintf( stderr, "\t\tmax: %d\n", max );
-
-							/* pick up another atom from the neighbor cell -
-							   we have to compare atom1 with its own periodic images as well, 
-							   that's why there is also equality in the if stmt below */
-							for( m = 0; m < max; ++m ) {
-								atom2 = nbr_atoms[m];
-								//if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) {
-								if( atom1 > atom2 ) {
-									if(Are_Far_Neighbors(system->atoms[atom1].x,
-												system->atoms[atom2].x, 
-												&(system->box), control->vlist_cut, 
-												&nbr_data))
-										++num_far;
-								}
-							}
-							}
-
-							++itr;
-						}
-
-						// finish note
-						finish = num_far;
-						if (g->max_cuda_nbrs <= (finish - start)){
-							g->max_cuda_nbrs	= finish - start;
-						}
-					}
-				}
+    int  i, j, k, l, m, itr;
+    int  x, y, z;
+    int  atom1, atom2, max;
+    int  num_far;
+    int  *nbr_atoms;
+    ivec *nbrs;
+    rvec *nbrs_cp;
+    grid *g;
+    far_neighbor_data nbr_data;
+
+    int     start = 0, finish = 0;
+
+    /* Counts far-neighbor pairs on the host; nothing is stored per pair. */
+    g = &( system->g );
+    Bin_Atoms( system, workspace );
+    /* num_far: running pair total; max_cuda_nbrs: largest per-atom count so far */
+    num_far = 0;
+    g->max_cuda_nbrs = 0;
+
+    /* first pick up a cell in the grid */
+    for( i = 0; i < g->ncell[0]; i++ )
+        for( j = 0; j < g->ncell[1]; j++ )
+            for( k = 0; k < g->ncell[2]; k++ ) {
+                nbrs = &g->nbrs[index_grid_nbrs (i,j,k,0,g) ];
+                nbrs_cp = &g->nbrs_cp[index_grid_nbrs (i,j,k,0,g) ];
+                //fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+
+                /* pick up an atom from the current cell */
+                for(l = 0; l < g->top[index_grid_3d (i,j,k,g) ]; ++l ){
+                    atom1 = g->atoms[index_grid_atoms (i,j,k,l,g) ];
+                    start = num_far; /* pair total before atom1 is processed */
+
+                    itr = 0;
+                    while( nbrs[itr][0] >= 0 ){ /* a negative first component ends the cell list */
+                        x = nbrs[itr][0];
+                        y = nbrs[itr][1];
+                        z = nbrs[itr][2];
+                        //fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
+
+                        if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                                SQR(control->vlist_cut) ) {     
+                            nbr_atoms = &g->atoms[index_grid_atoms (x,y,z,0,g) ];
+                            max = g->top[index_grid_3d (x,y,z,g) ];
+                            //fprintf( stderr, "\t\tmax: %d\n", max );
+
+                            /* pick up another atom from the neighbor cell -
+                               only pairs with atom1 > atom2 are counted; the strict
+                               comparison below also leaves out atom1 itself */
+                            for( m = 0; m < max; ++m ) {
+                                atom2 = nbr_atoms[m];
+                                //if( nbrs[itr+1][0] >= 0 || atom1 > atom2 ) {
+                                if( atom1 > atom2 ) {
+                                    if(Are_Far_Neighbors(system->atoms[atom1].x,
+                                                system->atoms[atom2].x, 
+                                                &(system->box), control->vlist_cut, 
+                                                &nbr_data))
+                                        ++num_far;
+                                } /* end if atom1 > atom2 */
+                            } /* end for m */
+                            } /* end if DistSqr_to_CP -- indentation is one level too deep from here on */
+
+                            ++itr;
+                        } /* end while */
+
+                        // remember the largest number of pairs counted for a single atom
+                        finish = num_far;
+                        if (g->max_cuda_nbrs <= (finish - start)){
+                            g->max_cuda_nbrs    = finish - start;
+                        }
+                    } /* end for l */
+                } /* end for k (and with it the i and j loops) */
 
 #if defined(DEBUG_FOCUS)  
-				fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far );
+                fprintf( stderr, "estimate nbrs done, num_far: %d\n", num_far );
 #endif
-				return num_far * SAFE_ZONE;
-			}
-
-	GLOBAL void Estimate_NumNeighbors ( reax_atom *sys_atoms,
-			grid g,
-			simulation_box *box,
-			control_params *control,
-			int *indices)
-	{
-		int *atoms = g.atoms;
-		int *top = g.top;
-		ivec *nbrs = g.nbrs; 
-		rvec *nbrs_cp = g.nbrs_cp;
-
-		int *nbr_atoms;
-		int atom1, atom2, l, iter, max, m, num_far;
-		far_neighbor_data nbr_data;
-		int x, y, z, i;
-
-		if (threadIdx.x >= *(top + index_grid(1))){
-			return;
-		} 
-
-		nbrs = nbrs + index_grid (g.max_nbrs);
-		nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
-		atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
-
-		num_far = 0;
-		iter = 0;
-
-		while (nbrs[iter][0] >= 0) {
-			x = nbrs[iter][0];
-			y = nbrs[iter][1];
-			z = nbrs[iter][2];
-
-			//condition check for cutoff here
-			if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
-					SQR (control->vlist_cut)) 
-			{
-				nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-				max = top [index_grid_3d(x, y, z, &g)];
-				for (m = 0; m < max; m++) {
-					atom2 = nbr_atoms[m];
-
-					//CHANGE ORIGINAL
-					/*
-					   if (atom1 > atom2) {
-					   if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-					   control->vlist_cut, &nbr_data)){
-					   ++num_far;
-					   }
-					   }
-					 */
-					if (atom1 > atom2) {
-						if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-									control->vlist_cut, &nbr_data)){
-							++num_far;
-						}
-					}
-					else if (atom1 < atom2) {
-						if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
-									control->vlist_cut, &nbr_data)){
-							++num_far;
-						}
-					}
-					//CHANGE ORIGINAL
-				}
-			}
-			++iter;
-		}
-
-		//indices[ atom1 ] = num_far;// * SAFE_ZONE;
-		indices[ atom1 ] = num_far * SAFE_ZONE;
-	}
-
-	/*One thread per atom Implementation */
-	GLOBAL void New_Estimate_NumNeighbors ( 	reax_atom *sys_atoms,
-			grid g,
-			simulation_box *box,
-			control_params* control, 
-			int N, int *indices)
-	{
-		int *atoms = g.atoms;
-		int *top = g.top;
-		ivec *nbrs = g.nbrs; 
-		rvec *nbrs_cp = g.nbrs_cp;
-
-		int 	*nbr_atoms;
-		int   atom1, atom2, iter, max, m, num_far;
-		int 	x, y, z, i;
-		int atom_x, atom_y, atom_z;
-		far_neighbor_data temp;
-		rvec atom1_x;
-
-		int index = blockIdx.x * blockDim.x + threadIdx.x;
-		if (index > N) return;
-
-		atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
-		atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
-		atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
+                return num_far * SAFE_ZONE;
+            }
+
+    /* Per-cell neighbor-count estimate: each thread takes one atom slot of the
+     * cell addressed by index_grid() (macro defined elsewhere; presumably derived
+     * from the block index -- confirm) and counts every other atom in the listed
+     * neighbor cells that Are_Far_Neighbors() accepts for control->vlist_cut.
+     *   sys_atoms - atom array, indexed by atom id
+     *   g         - grid, passed by value; atoms/top/nbrs/nbrs_cp are read
+     *   box       - simulation box forwarded to Are_Far_Neighbors()
+     *   control   - supplies vlist_cut
+     *   indices   - output: indices[atom] = count * SAFE_ZONE
+     */
+    GLOBAL void Estimate_NumNeighbors ( reax_atom *sys_atoms,
+            grid g, simulation_box *box, control_params *control,
+            int *indices)
+    {
+        int *atoms = g.atoms;
+        int *top = g.top;
+        ivec *nbrs = g.nbrs;
+        rvec *nbrs_cp = g.nbrs_cp;
+
+        int *nbr_atoms;
+        int atom1, atom2, hi, lo, iter, max, m, num_far;
+        int x, y, z;
+        far_neighbor_data nbr_data; /* filled by Are_Far_Neighbors(); not read here */
+
+        /* threads beyond the number of atoms stored in this cell have no work */
+        if (threadIdx.x >= *(top + index_grid(1))){
+            return;
+        }
+
+        /* move to this cell's slice of the neighbor-cell tables and fetch our atom */
+        nbrs = nbrs + index_grid (g.max_nbrs);
+        nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
+        atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
+
+        num_far = 0;
+        iter = 0;
+
+        /* the neighbor-cell list ends at the first entry with a negative x index */
+        while (nbrs[iter][0] >= 0) {
+            x = nbrs[iter][0];
+            y = nbrs[iter][1];
+            z = nbrs[iter][2];
+
+            /* only scan cells for which DistSqr_to_CP() is within the squared cutoff */
+            if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <=
+                    SQR (control->vlist_cut))
+            {
+                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+                max = top [index_grid_3d(x, y, z, &g)];
+
+                for (m = 0; m < max; m++) {
+                    atom2 = nbr_atoms[m];
+                    if (atom1 == atom2) {
+                        continue;
+                    }
+
+                    /* Both atom2 < atom1 and atom2 > atom1 are counted; the
+                     * higher-numbered atom is always passed first. */
+                    hi = (atom1 > atom2) ? atom1 : atom2;
+                    lo = (atom1 > atom2) ? atom2 : atom1;
+                    if (Are_Far_Neighbors (sys_atoms[hi].x, sys_atoms[lo].x, box,
+                                control->vlist_cut, &nbr_data)){
+                        ++num_far;
+                    }
+                }
+            }
+            ++iter;
+        }
+
+        indices[ atom1 ] = num_far * SAFE_ZONE;
+    }
+
+    /* One thread per atom: thread `index` handles atom `index`; N is presumably
+     * the atom count (it bounds the thread index below) -- confirm with caller. */
+    GLOBAL void New_Estimate_NumNeighbors (     reax_atom *sys_atoms,
+            grid g, simulation_box *box, control_params* control,
+            int N, int *indices)
+    {
+        int *atoms = g.atoms;
+        int *top = g.top;
+        ivec *nbrs = g.nbrs;
+        rvec *nbrs_cp = g.nbrs_cp;
+
+        int     *nbr_atoms;
+        int   atom1, atom2, iter, max, m, num_far;
+        int     x, y, z;
+        int atom_x, atom_y, atom_z;
+        far_neighbor_data temp;
+        rvec atom1_x;
+
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        /* ids run 0..N-1: thread N must exit too, or it touches sys_atoms[N] and indices[N] */
+        if (index >= N) return;
+
+        atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
+        atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
+        atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
 
 #ifdef __BNVT_FIX__
-		if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
-		if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
-		if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+        if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+        if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+        if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
 #endif
 
-		nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-		nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-		atom1 = index;
-
-		rvec_Copy (atom1_x, sys_atoms [atom1].x );
-
-		num_far = 0;
-		iter = 0;
-
-		while (nbrs[iter][0] >= 0) {
-			x = nbrs[iter][0];
-			y = nbrs[iter][1];
-			z = nbrs[iter][2];
-
-			if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
-					SQR (control->vlist_cut)) 
-			{
-				nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-				max = top [index_grid_3d(x, y, z, &g)];
-
-				for (m = 0; m < max; m++) 
-				{
-					atom2 = nbr_atoms[m];
-					if (atom1 > atom2) {
-						if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
-									control->vlist_cut, &temp)){
-							num_far++;
-						}
-					}
-					else if (atom1 < atom2) {
-						if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
-									control->vlist_cut, &temp)){
-							num_far ++;
-						}
-					}
-				}
-			}
-			++iter;
-		}
-		indices [atom1] = num_far * SAFE_ZONE;
-	}
-
-
-
-	/*One thread per entry in the gcell implementation */
-	GLOBAL void Generate_Neighbor_Lists ( 	reax_atom *sys_atoms,
-			grid g,
-			simulation_box *box,
-			control_params* control, 
-			list far_nbrs)
-	{
-		int *atoms = g.atoms;
-		int *top = g.top;
-		ivec *nbrs = g.nbrs; 
-		rvec *nbrs_cp = g.nbrs_cp;
-
-		int 	*nbr_atoms;
-		int   atom1, atom2, l, iter, max, m, num_far;
-		int 	x, y, z, i;
-		far_neighbor_data *nbr_data;
-		far_neighbor_data temp;
-
-		if (threadIdx.x >= *(top + index_grid(1))){
-			return;
-		} 
-
-		nbrs = nbrs + index_grid (g.max_nbrs);
-		nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
-		atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
-
-		num_far = Start_Index (atom1, &far_nbrs);
-		//Set_Start_Index (atom1, 0, &far_nbrs);
-		//num_far =  0;
-		iter = 0;
-
-		while (nbrs[iter][0] >= 0) {
-			x = nbrs[iter][0];
-			y = nbrs[iter][1];
-			z = nbrs[iter][2];
-
-			//condition check for cutoff here
-			if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
-					SQR (control->vlist_cut)) 
-			{
-				nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-				max = top [index_grid_3d(x, y, z, &g)];
-
-				for (m = 0; m < max; m++) {
-					atom2 = nbr_atoms[m];
-
-					//nbr_data = & ( far_nbrs.select.far_nbr_list[atom1 * g.max_cuda_nbrs + num_far] );
-
-					//CHANGE ORIGINAL
-					/*
-					   if (atom1 > atom2) {
-					   if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-					   control->vlist_cut, &temp)){
-
-					   nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-					   nbr_data->nbr = atom2;
-					   nbr_data->rel_box[0] = temp.rel_box[0];
-					   nbr_data->rel_box[1] = temp.rel_box[1];
-					   nbr_data->rel_box[2] = temp.rel_box[2];
-
-					   nbr_data->d = temp.d;
-					   nbr_data->dvec[0] = temp.dvec[0];
-					   nbr_data->dvec[1] = temp.dvec[1];
-					   nbr_data->dvec[2] = temp.dvec[2];
-					   ++num_far;
-					   }
-					   }
-					 */
-					if (atom1 > atom2) {
-						if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-									control->vlist_cut, &temp)){
-							nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-							nbr_data->nbr = atom2;
-							nbr_data->rel_box[0] = temp.rel_box[0];
-							nbr_data->rel_box[1] = temp.rel_box[1];
-							nbr_data->rel_box[2] = temp.rel_box[2];
-
-							nbr_data->d = temp.d;
-							nbr_data->dvec[0] = temp.dvec[0];
-							nbr_data->dvec[1] = temp.dvec[1];
-							nbr_data->dvec[2] = temp.dvec[2];
-							++num_far;
-						}
-					}
-					else if (atom1 < atom2) {
-						if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
-									control->vlist_cut, &temp)){
-							nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-							nbr_data->nbr = atom2;
-							nbr_data->rel_box[0] = temp.rel_box[0];
-							nbr_data->rel_box[1] = temp.rel_box[1];
-							nbr_data->rel_box[2] = temp.rel_box[2];
-
-							nbr_data->d = temp.d;
-							nbr_data->dvec[0] = temp.dvec[0];
-							nbr_data->dvec[1] = temp.dvec[1];
-							nbr_data->dvec[2] = temp.dvec[2];
-							++num_far;
-						}
-					}
-					//CHANGE ORIGINAL
-				}
-			}
-			++iter;
-		}
-
-		//end the far_neighbor list here
-		Set_End_Index (atom1, num_far, &far_nbrs);
-	}
-
-
-	/*One thread per atom Implementation */
-	GLOBAL void New_Generate_Neighbor_Lists ( 	reax_atom *sys_atoms,
-			grid g,
-			simulation_box *box,
-			control_params* control, 
-			list far_nbrs, int N)
-	{
-		int *atoms = g.atoms;
-		int *top = g.top;
-		ivec *nbrs = g.nbrs; 
-		rvec *nbrs_cp = g.nbrs_cp;
-
-		int 	*nbr_atoms;
-		int   atom1, atom2, l, iter, max, m, num_far;
-		int 	x, y, z, i;
-		far_neighbor_data *nbr_data, *my_start;
-		far_neighbor_data temp;
-		int atom_x, atom_y, atom_z;
-		rvec atom1_x;
-
-		int index = blockIdx.x * blockDim.x + threadIdx.x;
-		if (index > N) return;
-
-		atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
-		atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
-		atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
+        nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); // neighbor-cell list of the atom's cell
+        nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); // matching entries for DistSqr_to_CP()
+        atom1 = index;
+
+        rvec_Copy (atom1_x, sys_atoms [atom1].x ); // local copy of the position, used in every test below
+
+        num_far = 0;
+        iter = 0;
+
+        while (nbrs[iter][0] >= 0) { // a negative first component ends the cell list
+            x = nbrs[iter][0];
+            y = nbrs[iter][1];
+            z = nbrs[iter][2];
+
+            if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
+                    SQR (control->vlist_cut)) 
+            {
+                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); // atoms stored in cell (x,y,z)
+                max = top [index_grid_3d(x, y, z, &g)]; // number of atoms stored there
+
+                for (m = 0; m < max; m++) 
+                {
+                    atom2 = nbr_atoms[m];
+                    if (atom1 > atom2) { // the higher-numbered atom is always passed first
+                        if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
+                                    control->vlist_cut, &temp)){
+                            num_far++;
+                        }
+                    }
+                    else if (atom1 < atom2) { // atom1 == atom2 matches neither branch and is not counted
+                        if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
+                                    control->vlist_cut, &temp)){
+                            num_far ++;
+                        }
+                    }
+                }
+            }
+            ++iter;
+        }
+        indices [atom1] = num_far * SAFE_ZONE; // per-atom count, scaled by SAFE_ZONE
+    }
+
+
+
+    /* Fills the far-neighbor list, one thread per atom slot of a grid cell.
+     *
+     * The thread's atom (atom1) is compared with every atom stored in the
+     * neighbor cells listed for its cell; each pair accepted by
+     * Are_Far_Neighbors() for control->vlist_cut is appended to atom1's section
+     * of far_nbrs.  Entries are written for atom2 < atom1 as well as for
+     * atom2 > atom1.
+     *
+     *   sys_atoms - atom array, indexed by atom id
+     *   g         - grid, passed by value; atoms/top/nbrs/nbrs_cp are read
+     *   box       - simulation box forwarded to Are_Far_Neighbors()
+     *   control   - supplies vlist_cut
+     *   far_nbrs  - list, passed by value; Start_Index(atom1) is read as the first
+     *               write position and End_Index(atom1) is set on exit
+     *
+     * NOTE(review): nothing here checks that the entries written for atom1 stay
+     * inside the space reserved for it -- presumably guaranteed by the
+     * SAFE_ZONE-scaled estimate kernels; confirm against the caller.
+     * NOTE(review): index_grid() is defined elsewhere; it presumably derives the
+     * cell offset from the block index -- confirm.
+     */
+    GLOBAL void Generate_Neighbor_Lists (     reax_atom *sys_atoms,
+            grid g,
+            simulation_box *box,
+            control_params* control,
+            list far_nbrs)
+    {
+        int *atoms = g.atoms;
+        int *top = g.top;
+        ivec *nbrs = g.nbrs;
+        rvec *nbrs_cp = g.nbrs_cp;
+
+        int *nbr_atoms;
+        int atom1, atom2, hi, lo, iter, max, m, num_far;
+        int x, y, z;
+        far_neighbor_data *nbr_data;
+        far_neighbor_data temp;
+
+        /* threads beyond the number of atoms stored in this cell have no work */
+        if (threadIdx.x >= *(top + index_grid(1))){
+            return;
+        }
+
+        /* move to this cell's slice of the neighbor-cell tables and fetch our atom */
+        nbrs = nbrs + index_grid (g.max_nbrs);
+        nbrs_cp = nbrs_cp + index_grid (g.max_nbrs);
+        atom1 = atoms [ index_grid (g.max_atoms) + threadIdx.x];
+
+        /* num_far is an absolute index into far_nbr_list, starting at the
+         * position already stored as atom1's start index */
+        num_far = Start_Index (atom1, &far_nbrs);
+        iter = 0;
+
+        /* the neighbor-cell list ends at the first entry with a negative x index */
+        while (nbrs[iter][0] >= 0) {
+            x = nbrs[iter][0];
+            y = nbrs[iter][1];
+            z = nbrs[iter][2];
+
+            /* only scan cells for which DistSqr_to_CP() is within the squared cutoff */
+            if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <=
+                    SQR (control->vlist_cut))
+            {
+                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+                max = top [index_grid_3d(x, y, z, &g)];
+
+                for (m = 0; m < max; m++) {
+                    atom2 = nbr_atoms[m];
+                    /* atom1 itself is skipped */
+                    if (atom1 == atom2) {
+                        continue;
+                    }
+
+                    /* the higher-numbered atom is always passed first, whichever
+                     * of the two belongs to this thread */
+                    if (atom1 > atom2) {
+                        hi = atom1;
+                        lo = atom2;
+                    }
+                    else {
+                        hi = atom2;
+                        lo = atom1;
+                    }
+
+                    if (Are_Far_Neighbors (sys_atoms[hi].x, sys_atoms[lo].x, box,
+                                control->vlist_cut, &temp)){
+                        /* copy the result fields out of temp into the next free
+                         * slot; nbr comes from atom2, not from temp */
+                        nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+
+                        ++num_far;
+                    }
+                }
+            }
+            ++iter;
+        }
+
+        /* record where atom1's entries end */
+        Set_End_Index (atom1, num_far, &far_nbrs);
+    }
+
+
+    /* One thread per atom: thread `index` fills the far-neighbor entries of atom
+     * `index`; N is presumably the atom count -- confirm with caller. */
+    GLOBAL void New_Generate_Neighbor_Lists (     reax_atom *sys_atoms,
+            grid g, simulation_box *box, control_params* control,
+            list far_nbrs, int N)
+    {
+        int *atoms = g.atoms;
+        int *top = g.top;
+        ivec *nbrs = g.nbrs;
+        rvec *nbrs_cp = g.nbrs_cp;
+
+        int     *nbr_atoms;
+        int   atom1, atom2, iter, max, m, num_far;
+        int     x, y, z;
+        far_neighbor_data *nbr_data, *my_start;
+        far_neighbor_data temp;
+        int atom_x, atom_y, atom_z;
+        rvec atom1_x;
+
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        /* ids run 0..N-1: thread N must exit too, or it reads sys_atoms[N] and list slot N */
+        if (index >= N) return;
+
+        atom_x = (int)(sys_atoms[index].x[0] * g.inv_len[0]);
+        atom_y = (int)(sys_atoms[index].x[1] * g.inv_len[1]);
+        atom_z = (int)(sys_atoms[index].x[2] * g.inv_len[2]);
 
 #ifdef __BNVT_FIX__
-		if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
-		if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
-		if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+        if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+        if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+        if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
 #endif
 
-		nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-		nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-		atom1 = index;
-
-		rvec_Copy (atom1_x, sys_atoms [atom1].x );
-
-		num_far = Start_Index (atom1, &far_nbrs);
-		my_start = & (far_nbrs.select.far_nbr_list [num_far] );
-
-		//Set_Start_Index (atom1, 0, &far_nbrs);
-		//num_far =  0;
-		iter = 0;
-
-		while (nbrs[iter][0] >= 0) {
-			x = nbrs[iter][0];
-			y = nbrs[iter][1];
-			z = nbrs[iter][2];
-
-			//condition check for cutoff here
-			//if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms[atom1].x) <= 
-			if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
-					SQR (control->vlist_cut)) 
-			{
-				nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-				max = top [index_grid_3d(x, y, z, &g)];
-
-				for (m = 0; m < max; m++) 
-				{
-					atom2 = nbr_atoms[m];
-					if (atom1 > atom2) {
-						if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
-									control->vlist_cut, &temp)){
-							//nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-							nbr_data = my_start;
-							nbr_data->nbr = atom2;
-							nbr_data->rel_box[0] = temp.rel_box[0];
-							nbr_data->rel_box[1] = temp.rel_box[1];
-							nbr_data->rel_box[2] = temp.rel_box[2];
-
-							nbr_data->d = temp.d;
-							nbr_data->dvec[0] = temp.dvec[0];
-							nbr_data->dvec[1] = temp.dvec[1];
-							nbr_data->dvec[2] = temp.dvec[2];
-							num_far++;
-							my_start ++;
-						}
-					}
-					else if (atom1 < atom2) {
-						if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
-									control->vlist_cut, &temp)){
-							//nbr_data = & ( far_nbrs.select.far_nbr_list[num_far] );
-							nbr_data = my_start;
-							nbr_data->nbr = atom2;
-							nbr_data->rel_box[0] = temp.rel_box[0];
-							nbr_data->rel_box[1] = temp.rel_box[1];
-							nbr_data->rel_box[2] = temp.rel_box[2];
-
-							nbr_data->d = temp.d;
-							nbr_data->dvec[0] = temp.dvec[0];
-							nbr_data->dvec[1] = temp.dvec[1];
-							nbr_data->dvec[2] = temp.dvec[2];
-							num_far ++;
-							my_start ++;
-						}
-					}
-					//CHANGE ORIGINAL
-				}
-			}
-			++iter;
-		}
-
-		//end the far_neighbor list here
-		Set_End_Index (atom1, num_far, &far_nbrs);
-	}
-
-	/*Multiple threads per atom Implementation */
-	GLOBAL void Test_Generate_Neighbor_Lists ( 	reax_atom *sys_atoms,
-			grid g,
-			simulation_box *box,
-			control_params* control, 
-			list far_nbrs, int N )
-	{
-
-		extern __shared__ int __nbr[];
-		extern __shared__ int __sofar [];
-		bool	nbrgen;
-
-		int __THREADS_PER_ATOM__ = NBRS_THREADS_PER_ATOM;
-
-		int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-		int warp_id = thread_id / __THREADS_PER_ATOM__;
-		int lane_id = thread_id & (__THREADS_PER_ATOM__ -1);
-		int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-		if (warp_id >= N ) return;
-
-		int *tnbr = __nbr;
-		//int *nbrssofar = __nbr + __THREADS_PER_ATOM__;
-		int *nbrssofar = __nbr + blockDim.x;
-
-		int *atoms = g.atoms;
-		int *top = g.top;
-		ivec *nbrs = g.nbrs; 
-		rvec *nbrs_cp = g.nbrs_cp;
-
-		int 	*nbr_atoms;
-		int   atom1, atom2, l, iter, max, m, num_far;
-		int leader = -10;
-		int 	x, y, z, i;
-		far_neighbor_data *nbr_data, *my_start;
-		far_neighbor_data temp;
-		int atom_x, atom_y, atom_z;
-
-
-		atom1 = warp_id;
-		atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]);
-		atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]);
-		atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]);
+        nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); // neighbor-cell list of the atom's cell
+        nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); // matching entries for DistSqr_to_CP()
+        atom1 = index;
+
+        rvec_Copy (atom1_x, sys_atoms [atom1].x ); // local copy of the position, used in every test below
+
+        num_far = Start_Index (atom1, &far_nbrs);
+        my_start = & (far_nbrs.select.far_nbr_list [num_far] );
+
+        // num_far is an absolute index into far_nbr_list; my_start points at the
+        // same slot, and the two are advanced together below
+        iter = 0;
+
+        while (nbrs[iter][0] >= 0) { // a negative first component ends the cell list
+            x = nbrs[iter][0];
+            y = nbrs[iter][1];
+            z = nbrs[iter][2];
+
+            // only scan cells for which DistSqr_to_CP() is within the squared cutoff
+            // (atom1_x is the local copy of sys_atoms[atom1].x made above)
+            if (DistSqr_to_CP (nbrs_cp[iter], atom1_x) <= 
+                    SQR (control->vlist_cut)) 
+            {
+                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]); // atoms stored in cell (x,y,z)
+                max = top [index_grid_3d(x, y, z, &g)]; // number of atoms stored there
+
+                for (m = 0; m < max; m++) 
+                {
+                    atom2 = nbr_atoms[m];
+                    if (atom1 > atom2) { // the higher-numbered atom is always passed first
+                        if (Are_Far_Neighbors (atom1_x, sys_atoms[atom2].x, box, 
+                                    control->vlist_cut, &temp)){
+                            // my_start == &far_nbrs.select.far_nbr_list[num_far] at this point
+                            nbr_data = my_start;
+                            nbr_data->nbr = atom2;
+                            nbr_data->rel_box[0] = temp.rel_box[0];
+                            nbr_data->rel_box[1] = temp.rel_box[1];
+                            nbr_data->rel_box[2] = temp.rel_box[2];
+
+                            nbr_data->d = temp.d;
+                            nbr_data->dvec[0] = temp.dvec[0];
+                            nbr_data->dvec[1] = temp.dvec[1];
+                            nbr_data->dvec[2] = temp.dvec[2];
+                            num_far++;
+                            my_start ++;
+                        }
+                    }
+                    else if (atom1 < atom2) { // same store as above, arguments swapped
+                        if (Are_Far_Neighbors (sys_atoms[atom2].x, atom1_x, box, 
+                                    control->vlist_cut, &temp)){
+                            // my_start == &far_nbrs.select.far_nbr_list[num_far] at this point
+                            nbr_data = my_start;
+                            nbr_data->nbr = atom2;
+                            nbr_data->rel_box[0] = temp.rel_box[0];
+                            nbr_data->rel_box[1] = temp.rel_box[1];
+                            nbr_data->rel_box[2] = temp.rel_box[2];
+
+                            nbr_data->d = temp.d;
+                            nbr_data->dvec[0] = temp.dvec[0];
+                            nbr_data->dvec[1] = temp.dvec[1];
+                            nbr_data->dvec[2] = temp.dvec[2];
+                            num_far ++;
+                            my_start ++;
+                        }
+                    }
+                    // atom1 == atom2 matches neither branch, so an atom is never listed for itself
+                }
+            }
+            ++iter;
+        }
+
+        // record where atom1's entries end
+        Set_End_Index (atom1, num_far, &far_nbrs);
+    }
+
+    /* Multiple-threads-per-atom implementation: groups of NBRS_THREADS_PER_ATOM consecutive threads fill one atom's far-neighbor list. */
+    GLOBAL void Test_Generate_Neighbor_Lists (     reax_atom *sys_atoms,
+            grid g,
+            simulation_box *box,
+            control_params* control, 
+            list far_nbrs, int N ) /* N bounds the atom index: groups with index >= N return at once */
+    {
+        /* Dynamic shared memory layout used below: blockDim.x ints (tnbr), then one int per thread group (nbrssofar). */
+        extern __shared__ int __nbr[];
+        extern __shared__ int __sofar []; /* NOTE(review): never referenced in this kernel; extern __shared__ arrays presumably alias the same base as __nbr -- confirm and drop */
+        bool    nbrgen; /* set when this lane found a neighbor in the current round */
+
+        int __THREADS_PER_ATOM__ = NBRS_THREADS_PER_ATOM;
+
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int warp_id = thread_id / __THREADS_PER_ATOM__; /* thread-group index, used as the atom index (atom1) */
+        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); /* position inside the group; the mask assumes a power-of-two group size -- TODO confirm */
+        int my_bucket = threadIdx.x / __THREADS_PER_ATOM__; /* group index within this block; selects the nbrssofar slot */
+
+        if (warp_id >= N ) return; /* NOTE(review): returning threads never reach the __syncthreads() calls further down -- confirm this is safe when the last block is only partly used */
+
+        int *tnbr = __nbr; /* per-thread 0/1 hit flags, indexed by threadIdx.x */
+        //int *nbrssofar = __nbr + __THREADS_PER_ATOM__;
+        int *nbrssofar = __nbr + blockDim.x; /* per-group running neighbor counts, stored after the flags */
+
+        int *atoms = g.atoms;
+        int *top = g.top;
+        ivec *nbrs = g.nbrs; 
+        rvec *nbrs_cp = g.nbrs_cp;
+
+        int     *nbr_atoms;
+        int   atom1, atom2, l, iter, max, m, num_far;
+        int leader = -10; /* placeholder; set to -1 or a thread index during leader selection */
+        int     x, y, z, i;
+        far_neighbor_data *nbr_data, *my_start;
+        far_neighbor_data temp;
+        int atom_x, atom_y, atom_z;
+
+
+        atom1 = warp_id;
+        atom_x = (int)(sys_atoms[atom1].x[0] * g.inv_len[0]); /* grid cell of atom1 per axis: coordinate times g.inv_len, truncated */
+        atom_y = (int)(sys_atoms[atom1].x[1] * g.inv_len[1]);
+        atom_z = (int)(sys_atoms[atom1].x[2] * g.inv_len[2]);
 
 #ifdef __BNVT_FIX__
-		if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
-		if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
-		if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
+        if (atom_x >= g.ncell[0]) atom_x = g.ncell[0]-1;
+        if (atom_y >= g.ncell[1]) atom_y = g.ncell[1]-1;
+        if (atom_z >= g.ncell[2]) atom_z = g.ncell[2]-1;
 #endif
 
-		nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-		nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
-
-		num_far = Start_Index (atom1, &far_nbrs);
-		my_start = & (far_nbrs.select.far_nbr_list [num_far] );
-
-		iter = 0;
-		tnbr[threadIdx.x] = 0;
-
-		if (lane_id == 0) {
-			//nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0;
-			nbrssofar [my_bucket] = 0;
-		}
-
-		__syncthreads ();
-
-		while ((nbrs[iter][0] >= 0)) {
-			x = nbrs[iter][0];
-			y = nbrs[iter][1];
-			z = nbrs[iter][2];
-
-			tnbr[threadIdx.x] = 0;
-			nbrgen = false;
-
-			if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= 
-					SQR (control->vlist_cut)) 
-			{
-				nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
-				max = top [index_grid_3d(x, y, z, &g)];
-
-				tnbr[threadIdx.x] = 0;
-				nbrgen = false;
-				m = lane_id ; //0-31
-				int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1);
-				int iterations = 0;
-				//while (m < max)
-				while (iterations < loopcount)
-				{
-					tnbr [threadIdx.x] = 0;
-					nbrgen = false;
-
-					if (m < max) {
-						atom2 = nbr_atoms[m];
-						if (atom1 > atom2) {
-							if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
-										control->vlist_cut, &temp))
-							{
-								tnbr [threadIdx.x] = 1;
-								nbrgen = true;
-							}
-						}
-						else if (atom1 < atom2) {
-							if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
-										control->vlist_cut, &temp)){
-								tnbr [threadIdx.x] = 1;
-								nbrgen = true;
-							}
-						}
-					}
-
-					if (nbrgen)
-					{
-						//do leader selection here
-						leader = -1;
-						//for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
-						for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
-							if (tnbr[l]){
-								leader = l;
-								break;
-							}
-
-						//do the reduction;
-						if (threadIdx.x == leader) 
-							for (l = 1; l < __THREADS_PER_ATOM__; l++)
-								//tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)];	
-								tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)];	
-					}
-
-					//__syncthreads ();
-					//atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1);
-					//while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ;
-
-					if (nbrgen)
-					{
-						//got the indices
-						//nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1;
-						nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1;
-						nbr_data->nbr = atom2;
-						nbr_data->rel_box[0] = temp.rel_box[0];
-						nbr_data->rel_box[1] = temp.rel_box[1];
-						nbr_data->rel_box[2] = temp.rel_box[2];
-
-						nbr_data->d = temp.d;
-						nbr_data->dvec[0] = temp.dvec[0];
-						nbr_data->dvec[1] = temp.dvec[1];
-						nbr_data->dvec[2] = temp.dvec[2];
-
-						if (threadIdx.x == leader)
-							//nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
-							nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
-					}
-
-					m += __THREADS_PER_ATOM__;
-					iterations ++;
-
-					//cleanup
-					nbrgen = false;
-					tnbr [threadIdx.x] = 0;
-				}
-			}
-			++iter;
-		}
-
-		__syncthreads ();
-
-		//end the far_neighbor list here
-		if (lane_id == 0)
-			Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs);
-		//Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs);
-	}
-
-	void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, bool estimate)
-	{
-		real t_start, t_elapsed;
-		real t_1, t_2;
-
-		list *far_nbrs = dev_lists + FAR_NBRS;
-
-		int *d_indices = (int *) scratch;
-		int *nbrs_start, *nbrs_end;
-		int i, max_nbrs = 0;
-		int nbs;
-
-		t_start = Get_Time (); 
-
-		Cuda_Bin_Atoms (system, workspace);
-		Cuda_Bin_Atoms_Sync ( system );
-
-		if (dev_workspace->realloc.estimate_nbrs > -1) {
-
-			/*reset the re-neighbor condition */
-			dev_workspace->realloc.estimate_nbrs = -1;
-
-			//#ifdef __DEBUG_CUDA__
-			fprintf (stderr, "Recomputing the neighbors estimate.... \n");
-			//#endif
-			cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
-			/*
-			   dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
-			   dim3 threadsperblock (system->g.max_atoms);
-
-			   Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
-			   (system->d_atoms, system->d_g, system->d_box, 
-			   (control_params *)control->d_control, d_indices);
-			   cudaThreadSynchronize ();
-			   cudaCheckError ();
-			 */
-			nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-			New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> 
-				( 	system->d_atoms, system->d_g,
-					system->d_box, (control_params *)control->d_control,
-					system->N, d_indices);
-			cudaThreadSynchronize ();
-			cudaCheckError ();
-
-
-			int *nbrs_indices = NULL;
-			nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
-			if (nbrs_indices == NULL) 
-			{
-				fprintf (stderr, "Malloc failed for nbrs indices .... \n");
-				exit (1);
-			}
-			memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); 
-
-			copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
-			for (int i = 1; i <= system->N; i++) 
-				nbrs_indices [i] += nbrs_indices [i-1];
-
-			copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ );
-			copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ );
-
-			free (nbrs_indices);
-		}
-
-		/*
-		   One thread per atom Implementation
-		   Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
-		   (system->d_atoms, system->d_g, system->d_box, 
-		   (control_params *)control->d_control, *far_nbrs);
-		 */
-		nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + 
-			(((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1);
-
-		/* Multiple threads per atom Implementation */
-		Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, 
-					     INT_SIZE * (NBRS_BLOCK_SIZE+ NBRS_BLOCK_SIZE/NBRS_THREADS_PER_ATOM) >>> 
-						     (system->d_atoms, system->d_g, system->d_box, 
-						      (control_params *)control->d_control, *far_nbrs, system->N );
-		cudaThreadSynchronize (); 
-		cudaCheckError (); 
-
-		t_elapsed = Get_Timing_Info (t_start);
-		d_timing.nbrs += t_elapsed;
+        nbrs = nbrs + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g); /* move to the neighbor-cell entries of atom1's own cell */
+        nbrs_cp = nbrs_cp + index_grid_nbrs (atom_x, atom_y, atom_z, 0, &g);
+
+        num_far = Start_Index (atom1, &far_nbrs); /* first slot of atom1's range in the far-neighbor list */
+        my_start = & (far_nbrs.select.far_nbr_list [num_far] );
+
+        iter = 0;
+        tnbr[threadIdx.x] = 0;
+
+        if (lane_id == 0) {
+            //nbrssofar [threadIdx.x /__THREADS_PER_ATOM__] = 0;
+            nbrssofar [my_bucket] = 0; /* one lane per group resets the group's running count */
+        }
+
+        __syncthreads (); /* make the zeroed flags and counters visible before any lane reads them */
+
+        while ((nbrs[iter][0] >= 0)) { /* the neighbor-cell table ends at an entry whose first coordinate is negative */
+            x = nbrs[iter][0];
+            y = nbrs[iter][1];
+            z = nbrs[iter][2];
+
+            tnbr[threadIdx.x] = 0;
+            nbrgen = false;
+
+            if (DistSqr_to_CP (nbrs_cp[iter], sys_atoms [atom1].x) <= 
+                    SQR (control->vlist_cut)) 
+            { /* only cells passing the vlist_cut^2 test are scanned atom by atom */
+                nbr_atoms = &(atoms [index_grid_atoms (x, y, z, 0, &g) ]);
+                max = top [index_grid_3d(x, y, z, &g)]; /* upper bound for indexing nbr_atoms in this cell */
+
+                tnbr[threadIdx.x] = 0;
+                nbrgen = false;
+                m = lane_id ; //lane index within the group: each lane starts at its own atom of the cell
+                int loopcount = max / __THREADS_PER_ATOM__ + ((max % __THREADS_PER_ATOM__) == 0 ? 0 : 1); /* ceil(max / group size): all lanes run the same number of rounds */
+                int iterations = 0;
+                //while (m < max)
+                while (iterations < loopcount)
+                {
+                    tnbr [threadIdx.x] = 0;
+                    nbrgen = false;
+
+                    if (m < max) {
+                        atom2 = nbr_atoms[m];
+                        if (atom1 > atom2) { /* the larger index goes first into Are_Far_Neighbors; atom1 == atom2 is skipped */
+                            if (Are_Far_Neighbors (sys_atoms[atom1].x, sys_atoms[atom2].x, box, 
+                                        control->vlist_cut, &temp))
+                            {
+                                tnbr [threadIdx.x] = 1;
+                                nbrgen = true;
+                            }
+                        }
+                        else if (atom1 < atom2) {
+                            if (Are_Far_Neighbors (sys_atoms[atom2].x, sys_atoms[atom1].x, box, 
+                                        control->vlist_cut, &temp)){
+                                tnbr [threadIdx.x] = 1;
+                                nbrgen = true;
+                            }
+                        }
+                    }
+
+                    if (nbrgen)
+                    { /* NOTE(review): the tnbr reads below see other lanes' writes with no barrier in between; this presumes the lanes of a group run in lockstep -- confirm for the target architecture */
+                        //do leader selection here
+                        leader = -1;
+                        //for (l = threadIdx.x / __THREADS_PER_ATOM__; l < threadIdx.x / __THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++)
+                        for (l = my_bucket *__THREADS_PER_ATOM__; l < (my_bucket)*__THREADS_PER_ATOM__ + __THREADS_PER_ATOM__; l++) /* leader = first thread of the group whose flag is set */
+                            if (tnbr[l]){
+                                leader = l;
+                                break;
+                            }
+
+                        //do the reduction;
+                        if (threadIdx.x == leader) /* the leader turns the group's 0/1 flags into an inclusive running sum, in place */
+                            for (l = 1; l < __THREADS_PER_ATOM__; l++)
+                                //tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + l] += tnbr [(threadIdx.x / __THREADS_PER_ATOM__) * __THREADS_PER_ATOM__ + (l-1)];    
+                                tnbr [my_bucket * __THREADS_PER_ATOM__ + l] += tnbr [my_bucket * __THREADS_PER_ATOM__ + (l-1)];    
+                    }
+
+                    //__syncthreads ();
+                    //atomicAdd ( &warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ], 1);
+                    //while ( warp_sync [threadIdx.x / __THREADS_PER_ATOM__ ] < __THREADS_PER_ATOM__ ) ;
+
+                    if (nbrgen)
+                    {
+                        //got the indices
+                        //nbr_data = my_start + nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] + tnbr [threadIdx.x] - 1;
+                        nbr_data = my_start + nbrssofar[my_bucket] + tnbr [threadIdx.x] - 1; /* slot = range start + hits from earlier rounds + (this lane's running-sum value - 1) */
+                        nbr_data->nbr = atom2;
+                        nbr_data->rel_box[0] = temp.rel_box[0];
+                        nbr_data->rel_box[1] = temp.rel_box[1];
+                        nbr_data->rel_box[2] = temp.rel_box[2];
+
+                        nbr_data->d = temp.d;
+                        nbr_data->dvec[0] = temp.dvec[0];
+                        nbr_data->dvec[1] = temp.dvec[1];
+                        nbr_data->dvec[2] = temp.dvec[2];
+
+                        if (threadIdx.x == leader) /* last running-sum entry == number of hits in this round */
+                            //nbrssofar[threadIdx.x / __THREADS_PER_ATOM__] += tnbr[(threadIdx.x / __THREADS_PER_ATOM__)*__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
+                            nbrssofar[my_bucket] += tnbr[my_bucket *__THREADS_PER_ATOM__ + (__THREADS_PER_ATOM__ - 1)];
+                    }
+
+                    m += __THREADS_PER_ATOM__;
+                    iterations ++;
+
+                    //cleanup
+                    nbrgen = false;
+                    tnbr [threadIdx.x] = 0;
+                }
+            }
+            ++iter;
+        }
+
+        __syncthreads ();
+
+        //end the far_neighbor list here
+        if (lane_id == 0) /* one lane per group records where atom1's list ends: range start + total hits */
+            Set_End_Index (atom1, num_far + nbrssofar[my_bucket], &far_nbrs);
+        //Set_End_Index (atom1, num_far + tnbr[63], &far_nbrs);
+    }
+
+    void Cuda_Generate_Neighbor_Lists (reax_system *system, static_storage *workspace, control_params *control, bool estimate)
+    { /* Host driver: bins atoms, optionally rebuilds the per-atom list offsets, launches the neighbor kernel and then checks list capacity. NOTE(review): `estimate` is not read here. */
+        real t_start, t_elapsed;
+        real t_1, t_2; /* NOTE(review): not used in this function */
+
+        list *far_nbrs = dev_lists + FAR_NBRS;
+
+        int *d_indices = (int *) scratch; /* scratch buffer reused for the per-atom values written by New_Estimate_NumNeighbors */
+        int *nbrs_start, *nbrs_end;
+        int i, max_nbrs = 0;
+        int nbs;
+
+        t_start = Get_Time (); 
+
+        Cuda_Bin_Atoms (system, workspace);
+        Cuda_Bin_Atoms_Sync ( system );
+
+        if (dev_workspace->realloc.estimate_nbrs > -1) { /* a fresh estimate was requested (flag > -1) */
+
+            /*reset the re-neighbor condition */
+            dev_workspace->realloc.estimate_nbrs = -1;
+
+            //#ifdef __DEBUG_CUDA__
+            fprintf (stderr, "Recomputing the neighbors estimate.... \n");
+            //#endif
+            cuda_memset (d_indices, 0, INT_SIZE * system->N, RES_SCRATCH );
+            /*
+               dim3 blockspergrid (system->g.ncell[0], system->g.ncell[1], system->g.ncell[2]);
+               dim3 threadsperblock (system->g.max_atoms);
+
+               Estimate_NumNeighbors <<<blockspergrid, threadsperblock >>>
+               (system->d_atoms, system->d_g, system->d_box, 
+               (control_params *)control->d_control, d_indices);
+               cudaThreadSynchronize ();
+               cudaCheckError ();
+             */
+            nbs = (system->N / NBRS_BLOCK_SIZE) + (((system->N) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); /* ceil(N / NBRS_BLOCK_SIZE) blocks */
+            New_Estimate_NumNeighbors <<<nbs, NBRS_BLOCK_SIZE>>> 
+                (     system->d_atoms, system->d_g,
+                    system->d_box, (control_params *)control->d_control,
+                    system->N, d_indices);
+            cudaThreadSynchronize (); /* NOTE(review): deprecated runtime call; cudaDeviceSynchronize() is the documented replacement */
+            cudaCheckError ();
+
+            /* Build start offsets on the host: slot 0 stays 0, the device values land in [1..N], and the running sum makes entry i the offset of atom i. */
+            int *nbrs_indices = NULL;
+            nbrs_indices = (int *) malloc( INT_SIZE * (system->N+1) );
+            if (nbrs_indices == NULL) 
+            {
+                fprintf (stderr, "Malloc failed for nbrs indices .... \n");
+                exit (1);
+            }
+            memset (nbrs_indices , 0, INT_SIZE * (system->N+1) ); 
+
+            copy_host_device (nbrs_indices+1, d_indices, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__); 
+            for (int i = 1; i <= system->N; i++) 
+                nbrs_indices [i] += nbrs_indices [i-1];
+
+            copy_host_device (nbrs_indices, (far_nbrs->index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); /* index and end_index receive the same offsets, */
+            copy_host_device (nbrs_indices, (far_nbrs->end_index), INT_SIZE * (system->N), cudaMemcpyHostToDevice, __LINE__ ); /* so every list starts out empty */
+
+            free (nbrs_indices);
+        }
+
+        /*
+           One thread per atom Implementation
+           Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
+           (system->d_atoms, system->d_g, system->d_box, 
+           (control_params *)control->d_control, *far_nbrs);
+         */
+        nbs = (system->N * NBRS_THREADS_PER_ATOM/ NBRS_BLOCK_SIZE) + 
+            (((system->N *NBRS_THREADS_PER_ATOM) % NBRS_BLOCK_SIZE) == 0 ? 0 : 1); /* ceil(N * NBRS_THREADS_PER_ATOM / NBRS_BLOCK_SIZE) blocks */
+
+        /* Multiple threads per atom Implementation */
+        Test_Generate_Neighbor_Lists <<<nbs, NBRS_BLOCK_SIZE, 
+                         INT_SIZE * (NBRS_BLOCK_SIZE+ NBRS_BLOCK_SIZE/NBRS_THREADS_PER_ATOM) >>> /* dynamic shared memory: one int per thread plus one int per thread group */
+                             (system->d_atoms, system->d_g, system->d_box, 
+                              (control_params *)control->d_control, *far_nbrs, system->N );
+        cudaThreadSynchronize (); 
+        cudaCheckError (); 
+
+        t_elapsed = Get_Timing_Info (t_start);
+        d_timing.nbrs += t_elapsed;
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed);
+        fprintf (stderr, "Done with neighbor generation ---> %f \n", t_elapsed);
 #endif
 
-		/*validate neighbors list*/
-		nbrs_start = (int *) calloc (system->N, INT_SIZE);
-		nbrs_end = (int *) calloc (system->N, INT_SIZE);
+        /*validate neighbors list*/
+        nbrs_start = (int *) calloc (system->N, INT_SIZE);
+        nbrs_end = (int *) calloc (system->N, INT_SIZE);
 
-		copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
-		copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
+        copy_host_device (nbrs_start, far_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
+        copy_host_device (nbrs_end, far_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__ );
 
-		int device_nbrs = 0;
-		for(i = 0; i < system->N; i++)
-		{
-			if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs)
-				max_nbrs = nbrs_end[i] - nbrs_start[i];
+        int device_nbrs = 0;
+        for(i = 0; i < system->N; i++)
+        {
+            if ((nbrs_end[i] - nbrs_start[i]) > max_nbrs)
+                max_nbrs = nbrs_end[i] - nbrs_start[i];
 
-			device_nbrs += nbrs_end[i] - nbrs_start[i]; 
-		}
+            device_nbrs += nbrs_end[i] - nbrs_start[i]; 
+        }
 #ifdef __CUDA_TEST__
-		//fprintf (stderr, " New Device count is : %d \n", device_nbrs);
-		//dev_workspace->realloc.num_far = device_nbrs;
+        //fprintf (stderr, " New Device count is : %d \n", device_nbrs);
+        //dev_workspace->realloc.num_far = device_nbrs;
 #endif
 
 #ifdef __DEBUG_CUDA__
-		fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs );
-		fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs);
+        fprintf (stderr, "Max neighbors is ---> %d \n", max_nbrs );
+        fprintf (stderr, "DEVICE NEIGHBORS ---> %d \n", device_nbrs);
 #endif
 
-		//validate check here
-		//get the num_far from the list here
-		for (i = 0; i < system->N-1; i++)
-		{
-			if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE )
-			{
-				dev_workspace->realloc.num_far = device_nbrs;
-				//#ifdef __CUDA_MEM__
-				//fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
-				//fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", 
-				//							i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]);
-				//#endif
-			}
-
-			if (nbrs_end[i] > nbrs_start[i+1]) {
-				fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d",
-						nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]);
-				exit( INSUFFICIENT_SPACE );
-			}
-		}
-
-		if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) {
-			dev_workspace->realloc.num_far = device_nbrs;
-			//#ifdef __CUDA_MEM__
-			//fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
-			//fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n"
-			//					, i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs);
-			//#endif
-		}
-		if (nbrs_end[i] > far_nbrs->num_intrs) {
-			fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d",
-					nbrs_end[i], far_nbrs->num_intrs );
-			exit( INSUFFICIENT_SPACE );
-		}
-
-		free (nbrs_start);
-		free (nbrs_end);
-	}
-
-	//Code not used anymore
+        // Host-side capacity check of the generated lists, using the nbrs_start / nbrs_end arrays copied back from the device.
+        // Atoms 0..N-2 are bounded by the next atom's start offset; the last atom is handled after the loop.
+        for (i = 0; i < system->N-1; i++)
+        {
+            if ((nbrs_end[i] - nbrs_start[i]) > (nbrs_start[i+1] - nbrs_start[i]) * DANGER_ZONE ) /* used more than the DANGER_ZONE fraction of the slot */
+            {
+                dev_workspace->realloc.num_far = device_nbrs; /* request a reallocation, recording the current neighbor total */
+                //#ifdef __CUDA_MEM__
+                //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
+                //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d (%d %d %d) \n", 
+                //                            i, nbrs_start[i], nbrs_end[i], nbrs_start[i+1]);
+                //#endif
+            }
+
+            if (nbrs_end[i] > nbrs_start[i+1]) { /* the list ran into the next atom's slot: abort */
+                fprintf( stderr, "**ran out of space on far_nbrs: start[i] = %d, end[i]=%d, start[i+1]=%d, end[i+1] = %d",
+                        nbrs_start[i], nbrs_end[i], nbrs_start[i+1], nbrs_end[i+1]);
+                exit( INSUFFICIENT_SPACE );
+            }
+        }
+        /* Here i == N-1 (for N >= 1): the last atom is bounded by num_intrs. NOTE(review): with N == 0, i is 0 and the zero-length buffers get indexed -- confirm N > 0 is guaranteed. */
+        if ((nbrs_end[i] - nbrs_start[i]) > (far_nbrs->num_intrs - nbrs_start[i]) * DANGER_ZONE ) {
+            dev_workspace->realloc.num_far = device_nbrs;
+            //#ifdef __CUDA_MEM__
+            //fprintf (stderr, "Need to reallocate the neighbors ----> %d \n", dev_workspace->realloc.num_far);
+            //fprintf (stderr, "Reaching the limits of neighbors for index ----> %d start: %d, end: %d, count: %d\n"
+            //                    , i, nbrs_start[i], nbrs_end[i], far_nbrs->num_intrs);
+            //#endif
+        }
+        if (nbrs_end[i] > far_nbrs->num_intrs) { /* the last list ran past num_intrs: abort */
+            fprintf( stderr, "**ran out of space on far_nbrs: top=%d, max=%d",
+                    nbrs_end[i], far_nbrs->num_intrs );
+            exit( INSUFFICIENT_SPACE );
+        }
+
+        free (nbrs_start);
+        free (nbrs_end);
+    }
+
+    //Code not used anymore
 #if defined DONE
 
-	void Choose_Neighbor_Finder( reax_system *system, control_params *control, 
-			get_far_neighbors_function *Get_Far_Neighbors )
-	{
-		if( control->periodic_boundaries )
-		{
-			if( system->box.box_norms[0] > 2.0 * control->vlist_cut &&
-					system->box.box_norms[1] > 2.0 * control->vlist_cut &&
-					system->box.box_norms[2] > 2.0 * control->vlist_cut )
-				(*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Big_Box;
-			else  (*Get_Far_Neighbors) = Get_Periodic_Far_Neighbors_Small_Box;
-		}
-		else
-			(*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors;
-	}
-
-
-	int compare_near_nbrs(const void *v1, const void *v2)
-	{
-		return ((*(near_neighbor_data *)v1).nbr - (*(near_neighbor_data *)v2).nbr);
-	}
-
-
-	int compare_far_nbrs(const void *v1, const void *v2)
-	{
-		return ((*(far_neighbor_data *)v1).nbr - (*(far_neighbor_data *)v2).nbr);
-	}
-
-
-	inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C,
-			rvec dvec, ivec rel_box/*, rvec ext_factor*/ )
-	{
-		dest->nbr = nbr;
-		dest->d = d;
-		rvec_Scale( dest->dvec, C, dvec );
-		ivec_Copy( dest->rel_box, rel_box );
-		// rvec_Scale( dest->ext_factor, C, ext_factor );
-	}
-
-
-	inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C,
-			rvec dvec, ivec rel_box/*, rvec ext_factor*/)
-	{
-		dest->nbr = nbr;
-		dest->d = d;
-		rvec_Scale( dest->dvec, C, dvec );
-		ivec_Scale( dest->rel_box, C, rel_box );
-		// rvec_Scale( dest->ext_factor, C, ext_factor );
-	}
-
-
-	/* In case bond restrictions are applied, this method checks if
-	   atom1 and atom2 are allowed to bond with each other */
-	inline int can_Bond( static_storage *workspace, int atom1, int atom2 )
-	{
-		int i;
-
-		// fprintf( stderr, "can bond %6d %6d?\n", atom1, atom2 );
-
-		if( !workspace->restricted[ atom1 ] && !workspace->restricted[ atom2 ] )
-			return 1;
-
-		for( i = 0; i < workspace->restricted[ atom1 ]; ++i )
-			if( workspace->restricted_list[ atom1 ][i] == atom2 )
-				return 1;
-
-		for( i = 0; i < workspace->restricted[ atom2 ]; ++i )
-			if( workspace->restricted_list[ atom2 ][i] == atom1 )
-				return 1;
-
-		return 0;
-	}
-
-
-	/* check if atom2 is on atom1's near neighbor list */
-	inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 )
-	{
-		int i;
-
-		for( i=Start_Index(atom1,near_nbrs); i<End_Index(atom1,near_nbrs); ++i )
-			if( near_nbrs->select.near_nbr_list[i].nbr == atom2 )
-			{
-				// fprintf( stderr, "near neighbors %6d %6d\n", atom1, atom2 );
-				return 1;
-			}
-
-		return 0;
-	}
-
-	void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
-			simulation_data *data, static_storage *workspace,
-			list **lists, output_controls *out_control )
-	{
-		int  i, j, k;
-		int  x, y, z;
-		int  *nbr_atoms;
-		int  atom1, atom2, max;
-		int   num_far;
-		int   c, count;
-		int   grid_top;
-		grid *g = &( system->g );  
-		list *far_nbrs = (*lists) + FAR_NBRS;
-		//int   hb_type1, hb_type2;
-		//list *hbonds = (*lists) + HBOND;
-		//int   top_hbond1, top_hbond2;
-		get_far_neighbors_function Get_Far_Neighbors;
-		far_neighbor_data new_nbrs[125];
+    /* Selects the far-neighbor routine matching the boundary conditions and stores it in *Get_Far_Neighbors. */
+    void Choose_Neighbor_Finder( reax_system *system, control_params *control,
+            get_far_neighbors_function *Get_Far_Neighbors )
+    {
+        int d, big_box = 1;
+        if( !control->periodic_boundaries ) {
+            (*Get_Far_Neighbors) = Get_NonPeriodic_Far_Neighbors;
+            return;
+        }
+        for( d = 0; d < 3; ++d ) /* "big" box: every box norm exceeds twice vlist_cut */
+            big_box = big_box && ( system->box.box_norms[d] > 2.0 * control->vlist_cut );
+        (*Get_Far_Neighbors) = big_box ? Get_Periodic_Far_Neighbors_Big_Box
+                                       : Get_Periodic_Far_Neighbors_Small_Box;
+    }
+
+
+    int compare_near_nbrs(const void *v1, const void *v2) /* qsort-style comparator: ascending by nbr */
+    {
+        const int lhs = ((const near_neighbor_data *)v1)->nbr, rhs = ((const near_neighbor_data *)v2)->nbr;
+        return (lhs > rhs) - (lhs < rhs); /* sign only: unlike lhs - rhs this cannot overflow, and const is kept */
+    }
+
+    int compare_far_nbrs(const void *v1, const void *v2) /* qsort-style comparator: ascending by nbr */
+    {
+        const int lhs = ((const far_neighbor_data *)v1)->nbr, rhs = ((const far_neighbor_data *)v2)->nbr;
+        return (lhs > rhs) - (lhs < rhs); /* sign only: unlike lhs - rhs this cannot overflow, and const is kept */
+    }
+
+    /* Stores nbr and d in *dest, copies rel_box via ivec_Copy and fills dvec via rvec_Scale with factor C. */
+    inline void Set_Far_Neighbor( far_neighbor_data *dest, int nbr, real d, real C,
+            rvec dvec, ivec rel_box/*, rvec ext_factor*/ )
+    {
+        ivec_Copy( dest->rel_box, rel_box );
+        rvec_Scale( dest->dvec, C, dvec );
+        dest->d = d;
+        dest->nbr = nbr;
+    }
+
+
+    /* Stores nbr and d in *dest and fills dvec / rel_box through rvec_Scale / ivec_Scale with factor C. */
+    inline void Set_Near_Neighbor(near_neighbor_data *dest, int nbr, real d, real C,
+            rvec dvec, ivec rel_box/*, rvec ext_factor*/)
+    {
+        ivec_Scale( dest->rel_box, C, rel_box );
+        rvec_Scale( dest->dvec, C, dvec );
+        dest->d = d;
+        dest->nbr = nbr;
+    }
+
+
+    /* In case bond restrictions are applied, this method checks if
+       atom1 and atom2 are allowed to bond with each other */
+    inline int can_Bond( static_storage *workspace, int atom1, int atom2 )
+    {
+        int k;
+        int allowed = 0;
+
+        /* no restriction list on either atom: any pairing is permitted */
+        if( workspace->restricted[ atom1 ] == 0 && workspace->restricted[ atom2 ] == 0 )
+            return 1;
+
+        /* otherwise atom2 must appear on atom1's restricted list ... */
+        for( k = 0; !allowed && k < workspace->restricted[ atom1 ]; ++k )
+            allowed = ( workspace->restricted_list[ atom1 ][k] == atom2 );
+
+        /* ... or, failing that, atom1 must appear on atom2's list */
+        for( k = 0; !allowed && k < workspace->restricted[ atom2 ]; ++k )
+            allowed = ( workspace->restricted_list[ atom2 ][k] == atom1 );
+
+        return allowed;
+    }
+
+
+    /* check if atom2 is on atom1's near neighbor list */
+    inline int is_Near_Neighbor( list *near_nbrs, int atom1, int atom2 )
+    {
+        int pos = Start_Index( atom1, near_nbrs );
+
+        /* linear scan over atom1's entries, from Start_Index up to (not including) End_Index */
+        while( pos < End_Index( atom1, near_nbrs ) )
+        {
+            if( near_nbrs->select.near_nbr_list[pos].nbr == atom2 )
+                return 1;
+            ++pos;
+        }
+        return 0;
+    }
+
+    void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+            simulation_data *data, static_storage *workspace,
+            list **lists, output_controls *out_control )
+    {
+        int  i, j, k;
+        int  x, y, z;
+        int  *nbr_atoms;
+        int  atom1, atom2, max;
+        int   num_far;
+        int   c, count;
+        int   grid_top;
+        grid *g = &( system->g );  
+        list *far_nbrs = (*lists) + FAR_NBRS;
+        //int   hb_type1, hb_type2;
+        //list *hbonds = (*lists) + HBOND;
+        //int   top_hbond1, top_hbond2;
+        get_far_neighbors_function Get_Far_Neighbors;
+        far_neighbor_data new_nbrs[125];
 #ifndef REORDER_ATOMS
-		int   l, m;
+        int   l, m;
 #endif
 
-		// fprintf( stderr, "\n\tentered nbrs - " );
-		if( control->ensemble == iNPT || control->ensemble == sNPT || 
-				control->ensemble == NPT )
-			Update_Grid( system );
-		// fprintf( stderr, "grid updated - " );
+        // fprintf( stderr, "\n\tentered nbrs - " );
+        if( control->ensemble == iNPT || control->ensemble == sNPT || 
+                control->ensemble == NPT )
+            Update_Grid( system );
+        // fprintf( stderr, "grid updated - " );
 
-		Bin_Atoms( system, out_control );
-		// fprintf( stderr, "atoms sorted - " );
+        Bin_Atoms( system, out_control );
+        // fprintf( stderr, "atoms sorted - " );
 
 #ifdef REORDER_ATOMS
-		Cluster_Atoms( system, workspace );
-		// fprintf( stderr, "atoms clustered - " );
+        Cluster_Atoms( system, workspace );
+        // fprintf( stderr, "atoms clustered - " );
 #endif
 
-		Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-		// fprintf( stderr, "function chosen - " );  
-
-		Reset_Neighbor_Lists( system, workspace, lists );  
-		// fprintf( stderr, "lists cleared - " );
-
-		num_far = 0;
-		num_near = 0;
-		c = 0;
-
-		/* first pick up a cell in the grid */
-		for( i = 0; i < g->ncell[0]; i++ )
-			for( j = 0; j < g->ncell[1]; j++ )
-				for( k = 0; k < g->ncell[2]; k++ ) {
-					nbrs = g->nbrs[i][j][k];
-					nbrs_cp = g->nbrs_cp[i][j][k];
-
-					/* pick up an atom from the current cell */
-					//#ifdef REORDER_ATOMS
-					//  for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++)
-					//#else
-					for(l = 0; l < g->top[i][j][k]; ++l ){
-						atom1 = g->atoms[i][j][k][l];
-						Set_End_Index( atom1, num_far, far_nbrs );
-						// fprintf( stderr, "atom %d:\n", atom1 );
-
-						itr = 0;
-						while( nbrs[itr][0] > 0 ){
-							x = nbrs[itr][0];
-							y = nbrs[itr][1];
-							z = nbrs[itr][2];
-
-							// if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-							//     SQR(control->r_cut)) 	
-							nbr_atoms = g->atoms[x][y][z];
-							max_atoms = g->top[x][y][z];
-
-							/* pick up another atom from the neighbor cell -
-							   we have to compare atom1 with its own periodic images as well, 
-							   that's why there is also equality in the if stmt below */
-							//#ifdef REORDER_ATOMS
-							//for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++)
-							//#else
-							for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-								if( atom1 >= atom2 ) {
-									//fprintf( stderr, "\tatom2 %d", atom2 );
-									//top_near1 = End_Index( atom1, near_nbrs );
-									//Set_Start_Index( atom1, num_far, far_nbrs );
-									//hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond;
-									Get_Far_Neighbors( system->atoms[atom1].x,
-											system->atoms[atom2].x, 
-											&(system->box), control, new_nbrs, &count );
-									fprintf( stderr, "\t%d count:%d\n", atom2, count );
-
-									for( c = 0; c < count; ++c )
-										if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
-											Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-													atom2, new_nbrs[c].d, 1.0, 
-													new_nbrs[c].dvec, new_nbrs[c].rel_box );
-											++num_far;
-
-											/*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-											  atom1, atom2, new_nbrs[c].d, 
-											  new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
-											  new_nbrs[c].dvec[2] ); */
-
-
-											/* hydrogen bond lists */ 
-											/*if( control->hb_cut > 0.1 && 
-											  new_nbrs[c].d <= control->hb_cut ) {
-											// fprintf( stderr, "%d %d\n", atom1, atom2 );
-											hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
-											if( hb_type1 == 1 && hb_type2 == 2 ) {
-											top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
-											Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
-											atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
-											new_nbrs[c].rel_box );
-											Set_End_Index( workspace->hbond_index[atom1], 
-											top_hbond1 + 1, hbonds );
-											}
-											else if( hb_type1 == 2 && hb_type2 == 1 ) {
-											top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
-											Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
-											atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, 
-											new_nbrs[c].rel_box );
-											Set_End_Index( workspace->hbond_index[atom2], 
-											top_hbond2 + 1, hbonds );
-											}*/
-										}
-										}
-								}
-
-							Set_End_Index( atom1, top_far1, far_nbrs );
-						}
-					}
-
-
-					fprintf( stderr, "nbrs done-" );
-
-
-					/* apply restrictions on near neighbors only */
-					if( (data->step - data->prev_steps) < control->restrict_bonds ) {
-						for( atom1 = 0; atom1 < system->N; ++atom1 )
-							if( workspace->restricted[ atom1 ] ) {
-								// fprintf( stderr, "atom1: %d\n", atom1 );
-
-								top_near1 = End_Index( atom1, near_nbrs );
-
-								for( j = 0; j < workspace->restricted[ atom1 ]; ++j )
-									if(!is_Near_Neighbor(near_nbrs, atom1, 
-												atom2 = workspace->restricted_list[atom1][j])) {
-										fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n",
-												atom1, atom2 );
-
-										top_near2 = End_Index( atom2, near_nbrs );		  
-
-										/* we just would like to get the nearest image, so a call to 
-										   Get_Periodic_Far_Neighbors_Big_Box is good enough. */
-										Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, 
-												system->atoms[ atom2 ].x, 
-												&(system->box), control, 
-												new_nbrs, &count );
-
-										Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]),
-												atom2, new_nbrs[c].d, 1.0, 
-												new_nbrs[c].dvec, new_nbrs[c].rel_box );
-										++top_near1;
-
-										Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]),
-												atom1, new_nbrs[c].d, -1.0, 
-												new_nbrs[c].dvec, new_nbrs[c].rel_box );
-										Set_End_Index( atom2, top_near2+1, near_nbrs );
-									}
-
-								Set_End_Index( atom1, top_near1, near_nbrs );
-							}
-					}
-					// fprintf( stderr, "restrictions applied-" );
-
-
-					/* verify nbrlists, count num_intrs, sort nearnbrs */
-					near_nbrs->num_intrs = 0;
-					far_nbrs->num_intrs = 0;
-					for( i = 0; i < system->N-1; ++i ) {
-						if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) {
-							fprintf( stderr, 
-									"step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
-									data->step, i+1, i );
-							exit( 1 );
-						}
-
-						near_nbrs->num_intrs += Num_Entries(i, near_nbrs);
-
-						if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) {
-							fprintf( stderr, 
-									"step%3d: farnbr list of atom%d is overwritten by atom%d\n", 
-									data->step, i+1, i );
-							exit( 1 );
-						}
-
-						far_nbrs->num_intrs += Num_Entries(i, far_nbrs);
-					}
-
-					for( i = 0; i < system->N; ++i ) {
-						qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]),
-								Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), 
-								compare_near_nbrs );
-					}
-					// fprintf( stderr, "near nbrs sorted\n" );
+        Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
+        // fprintf( stderr, "function chosen - " );  
+
+        Reset_Neighbor_Lists( system, workspace, lists );  
+        // fprintf( stderr, "lists cleared - " );
+
+        num_far = 0;
+        num_near = 0;
+        c = 0;
+
+        /* first pick up a cell in the grid */
+        for( i = 0; i < g->ncell[0]; i++ )
+            for( j = 0; j < g->ncell[1]; j++ )
+                for( k = 0; k < g->ncell[2]; k++ ) {
+                    nbrs = g->nbrs[i][j][k];
+                    nbrs_cp = g->nbrs_cp[i][j][k];
+
+                    /* pick up an atom from the current cell */
+                    //#ifdef REORDER_ATOMS
+                    //  for(atom1 = g->start[i][j][k]; atom1 < g->end[i][j][k]; atom1++)
+                    //#else
+                    for(l = 0; l < g->top[i][j][k]; ++l ){
+                        atom1 = g->atoms[i][j][k][l];
+                        Set_End_Index( atom1, num_far, far_nbrs );
+                        // fprintf( stderr, "atom %d:\n", atom1 );
+
+                        itr = 0;
+                        while( nbrs[itr][0] > 0 ){
+                            x = nbrs[itr][0];
+                            y = nbrs[itr][1];
+                            z = nbrs[itr][2];
+
+                            // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                            //     SQR(control->r_cut))     
+                            nbr_atoms = g->atoms[x][y][z];
+                            max_atoms = g->top[x][y][z];
+
+                            /* pick up another atom from the neighbor cell -
+                               we have to compare atom1 with its own periodic images as well, 
+                               that's why there is also equality in the if stmt below */
+                            //#ifdef REORDER_ATOMS
+                            //for(atom2=g->start[x][y][z]; atom2<g->end[x][y][z]; atom2++)
+                            //#else
+                            for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
+                                if( atom1 >= atom2 ) {
+                                    //fprintf( stderr, "\tatom2 %d", atom2 );
+                                    //top_near1 = End_Index( atom1, near_nbrs );
+                                    //Set_Start_Index( atom1, num_far, far_nbrs );
+                                    //hb_type1=system->reaxprm.sbp[system->atoms[atom1].type].p_hbond;
+                                    Get_Far_Neighbors( system->atoms[atom1].x,
+                                            system->atoms[atom2].x, 
+                                            &(system->box), control, new_nbrs, &count );
+                                    fprintf( stderr, "\t%d count:%d\n", atom2, count );
+
+                                    for( c = 0; c < count; ++c )
+                                        if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
+                                            Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
+                                                    atom2, new_nbrs[c].d, 1.0, 
+                                                    new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                            ++num_far;
+
+                                            /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
+                                              atom1, atom2, new_nbrs[c].d, 
+                                              new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                              new_nbrs[c].dvec[2] ); */
+
+
+                                            /* hydrogen bond lists */ 
+                                            /*if( control->hb_cut > 0.1 && 
+                                              new_nbrs[c].d <= control->hb_cut ) {
+                                            // fprintf( stderr, "%d %d\n", atom1, atom2 );
+                                            hb_type2=system->reaxprm.sbp[system->atoms[atom2].type].p_hbond;
+                                            if( hb_type1 == 1 && hb_type2 == 2 ) {
+                                            top_hbond1=End_Index(workspace->hbond_index[atom1],hbonds);
+                                            Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond1]),
+                                            atom2, new_nbrs[c].d, 1.0, new_nbrs[c].dvec,
+                                            new_nbrs[c].rel_box );
+                                            Set_End_Index( workspace->hbond_index[atom1], 
+                                            top_hbond1 + 1, hbonds );
+                                            }
+                                            else if( hb_type1 == 2 && hb_type2 == 1 ) {
+                                            top_hbond2 = End_Index( workspace->hbond_index[atom2], hbonds );
+                                            Set_Near_Neighbor(&(hbonds->select.hbond_list[top_hbond2]),
+                                            atom1, new_nbrs[c].d, -1.0, new_nbrs[c].dvec, 
+                                            new_nbrs[c].rel_box );
+                                            Set_End_Index( workspace->hbond_index[atom2], 
+                                            top_hbond2 + 1, hbonds );
+                                            }*/
+                                        }
+                                        }
+                                }
+
+                            Set_End_Index( atom1, top_far1, far_nbrs );
+                        }
+                    }
+
+
+                    fprintf( stderr, "nbrs done-" );
+
+
+                    /* apply restrictions on near neighbors only */
+                    if( (data->step - data->prev_steps) < control->restrict_bonds ) {
+                        for( atom1 = 0; atom1 < system->N; ++atom1 )
+                            if( workspace->restricted[ atom1 ] ) {
+                                // fprintf( stderr, "atom1: %d\n", atom1 );
+
+                                top_near1 = End_Index( atom1, near_nbrs );
+
+                                for( j = 0; j < workspace->restricted[ atom1 ]; ++j )
+                                    if(!is_Near_Neighbor(near_nbrs, atom1, 
+                                                atom2 = workspace->restricted_list[atom1][j])) {
+                                        fprintf( stderr, "%3d-%3d: added bond by applying restrictions!\n",
+                                                atom1, atom2 );
+
+                                        top_near2 = End_Index( atom2, near_nbrs );          
+
+                                        /* we just would like to get the nearest image, so a call to 
+                                           Get_Periodic_Far_Neighbors_Big_Box is good enough. */
+                                        Get_Periodic_Far_Neighbors_Big_Box( system->atoms[ atom1 ].x, 
+                                                system->atoms[ atom2 ].x, 
+                                                &(system->box), control, 
+                                                new_nbrs, &count );
+
+                                        Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near1 ]),
+                                                atom2, new_nbrs[c].d, 1.0, 
+                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                        ++top_near1;
+
+                                        Set_Near_Neighbor( &(near_nbrs->select.near_nbr_list[ top_near2 ]),
+                                                atom1, new_nbrs[c].d, -1.0, 
+                                                new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                        Set_End_Index( atom2, top_near2+1, near_nbrs );
+                                    }
+
+                                Set_End_Index( atom1, top_near1, near_nbrs );
+                            }
+                    }
+                    // fprintf( stderr, "restrictions applied-" );
+
+
+                    /* verify nbrlists, count num_intrs, sort nearnbrs */
+                    near_nbrs->num_intrs = 0;
+                    far_nbrs->num_intrs = 0;
+                    for( i = 0; i < system->N-1; ++i ) {
+                        if( End_Index(i, near_nbrs) > Start_Index(i+1, near_nbrs) ) {
+                            fprintf( stderr, 
+                                    "step%3d: nearnbr list of atom%d is overwritten by atom%d\n",
+                                    data->step, i+1, i );
+                            exit( 1 );
+                        }
+
+                        near_nbrs->num_intrs += Num_Entries(i, near_nbrs);
+
+                        if( End_Index(i, far_nbrs) > Start_Index(i+1, far_nbrs) ) {
+                            fprintf( stderr, 
+                                    "step%3d: farnbr list of atom%d is overwritten by atom%d\n", 
+                                    data->step, i+1, i );
+                            exit( 1 );
+                        }
+
+                        far_nbrs->num_intrs += Num_Entries(i, far_nbrs);
+                    }
+
+                    for( i = 0; i < system->N; ++i ) {
+                        qsort( &(near_nbrs->select.near_nbr_list[ Start_Index(i, near_nbrs) ]),
+                                Num_Entries(i, near_nbrs), sizeof(near_neighbor_data), 
+                                compare_near_nbrs );
+                    }
+                    // fprintf( stderr, "near nbrs sorted\n" );
 
 
 #ifdef TEST_ENERGY
-					/* for( i = 0; i < system->N; ++i ) {
-					   qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-					   Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-					   compare_far_nbrs ); 
-					   } */
-
-					fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", 
-							num_near / system->N );
-					fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", 
-							num_far / system->N, control->max_far_nbrs );
+                    /* for( i = 0; i < system->N; ++i ) {
+                       qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
+                       Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
+                       compare_far_nbrs ); 
+                       } */
+
+                    fprintf( stderr, "Near neighbors/atom: %d (compare to 150)\n", 
+                            num_near / system->N );
+                    fprintf( stderr, "Far neighbors per atom: %d (compare to %d)\n", 
+                            num_far / system->N, control->max_far_nbrs );
 #endif
 
-					//fprintf( stderr, "step%d: num of nearnbrs = %6d   num of farnbrs: %6d\n",
-					//       data->step, num_near, num_far );
-
-					//fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n", 
-					//   system->N * near_nbrs->intrs_per_unit, 
-					//   system->N * far_nbrs->intrs_per_unit );
-				}
-
-
-
-		void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
-				simulation_data *data, static_storage *workspace,
-				list **lists, output_controls *out_control )
-		{
-			int  i, j, k, l, m, itr;
-			int  x, y, z;
-			int  atom1, atom2, max;
-			int  num_far, c, count;
-			int  *nbr_atoms;
-			ivec *nbrs;
-			rvec *nbrs_cp;
-			grid *g;
-			list *far_nbrs;
-			get_far_neighbors_function Get_Far_Neighbors;
-			far_neighbor_data new_nbrs[125];
-
-			g = &( system->g );
-			far_nbrs = (*lists) + FAR_NBRS;
-
-			// fprintf( stderr, "\n\tentered nbrs - " );
-			if( control->ensemble == iNPT || 
-					control->ensemble == sNPT || 
-					control->ensemble == NPT )
-				Update_Grid( system );
-			// fprintf( stderr, "grid updated - " );
-
-			Bin_Atoms( system, out_control );
-			// fprintf( stderr, "atoms sorted - " );
-			Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
-			// fprintf( stderr, "function chosen - " );  
-			Reset_Neighbor_Lists( system, workspace, lists );  
-			// fprintf( stderr, "lists cleared - " );
-
-			num_far = 0;
-			c = 0;
-
-			/* first pick up a cell in the grid */
-			for( i = 0; i < g->ncell[0]; i++ )
-				for( j = 0; j < g->ncell[1]; j++ )
-					for( k = 0; k < g->ncell[2]; k++ ) {
-						nbrs = g->nbrs[i][j][k];
-						nbrs_cp = g->nbrs_cp[i][j][k];
-						fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
-
-						/* pick up an atom from the current cell */
-						for(l = 0; l < g->top[i][j][k]; ++l ){
-							atom1 = g->atoms[i][j][k][l];
-							Set_Start_Index( atom1, num_far, far_nbrs );
-							fprintf( stderr, "\tatom %d\n", atom1 );
-
-							itr = 0;
-							while( nbrs[itr][0] > 0 ){
-								x = nbrs[itr][0];
-								y = nbrs[itr][1];
-								z = nbrs[itr][2];
-								fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
-
-								// if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
-								//     SQR(control->r_cut)) 	
-								nbr_atoms = g->atoms[x][y][z];
-								max = g->top[x][y][z];
-								fprintf( stderr, "\t\tmax: %d\n", max );
-
-
-								/* pick up another atom from the neighbor cell -
-								   we have to compare atom1 with its own periodic images as well, 
-								   that's why there is also equality in the if stmt below */
-								for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
-									if( atom1 >= atom2 ) {
-										Get_Far_Neighbors( system->atoms[atom1].x,
-												system->atoms[atom2].x, 
-												&(system->box), control, new_nbrs, &count );
-										fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count );
-
-										for( c = 0; c < count; ++c )
-											if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
-												Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
-														atom2, new_nbrs[c].d, 1.0, 
-														new_nbrs[c].dvec, new_nbrs[c].rel_box );
-												++num_far;
-
-												/*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
-												  atom1, atom2, new_nbrs[c].d, 
-												  new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
-												  new_nbrs[c].dvec[2] ); */
-											}
-									}
-
-								++itr;
-							}
-
-							Set_End_Index( atom1, num_far, far_nbrs );
-						}
-					}
-
-			far_nbrs->num_intrs = num_far;  
-			fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
+                    //fprintf( stderr, "step%d: num of nearnbrs = %6d   num of farnbrs: %6d\n",
+                    //       data->step, num_near, num_far );
+
+                    //fprintf( stderr, "\talloc nearnbrs = %6d   alloc farnbrs: %6d\n", 
+                    //   system->N * near_nbrs->intrs_per_unit, 
+                    //   system->N * far_nbrs->intrs_per_unit );
+                }
+
+
+
+        void Generate_Neighbor_Lists( reax_system *system, control_params *control, 
+                simulation_data *data, static_storage *workspace,
+                list **lists, output_controls *out_control )
+        {
+            int  i, j, k, l, m, itr;
+            int  x, y, z;
+            int  atom1, atom2, max;
+            int  num_far, c, count;
+            int  *nbr_atoms;
+            ivec *nbrs;
+            rvec *nbrs_cp;
+            grid *g;
+            list *far_nbrs;
+            get_far_neighbors_function Get_Far_Neighbors;
+            far_neighbor_data new_nbrs[125];
+
+            g = &( system->g );
+            far_nbrs = (*lists) + FAR_NBRS;
+
+            // fprintf( stderr, "\n\tentered nbrs - " );
+            if( control->ensemble == iNPT || 
+                    control->ensemble == sNPT || 
+                    control->ensemble == NPT )
+                Update_Grid( system );
+            // fprintf( stderr, "grid updated - " );
+
+            Bin_Atoms( system, out_control );
+            // fprintf( stderr, "atoms sorted - " );
+            Choose_Neighbor_Finder( system, control, &Get_Far_Neighbors );
+            // fprintf( stderr, "function chosen - " );  
+            Reset_Neighbor_Lists( system, workspace, lists );  
+            // fprintf( stderr, "lists cleared - " );
+
+            num_far = 0;
+            c = 0;
+
+            /* first pick up a cell in the grid */
+            for( i = 0; i < g->ncell[0]; i++ )
+                for( j = 0; j < g->ncell[1]; j++ )
+                    for( k = 0; k < g->ncell[2]; k++ ) {
+                        nbrs = g->nbrs[i][j][k];
+                        nbrs_cp = g->nbrs_cp[i][j][k];
+                        fprintf( stderr, "gridcell %d %d %d\n", i, j, k );
+
+                        /* pick up an atom from the current cell */
+                        for(l = 0; l < g->top[i][j][k]; ++l ){
+                            atom1 = g->atoms[i][j][k][l];
+                            Set_Start_Index( atom1, num_far, far_nbrs );
+                            fprintf( stderr, "\tatom %d\n", atom1 );
+
+                            itr = 0;
+                            while( nbrs[itr][0] > 0 ){
+                                x = nbrs[itr][0];
+                                y = nbrs[itr][1];
+                                z = nbrs[itr][2];
+                                fprintf( stderr, "\t\tgridcell %d %d %d\n", x, y, z );
+
+                                // if( DistSqr_to_CP(nbrs_cp[itr], system->atoms[atom1].x ) <= 
+                                //     SQR(control->r_cut))     
+                                nbr_atoms = g->atoms[x][y][z];
+                                max = g->top[x][y][z];
+                                fprintf( stderr, "\t\tmax: %d\n", max );
+
+
+                                /* pick up another atom from the neighbor cell -
+                                   we have to compare atom1 with its own periodic images as well, 
+                                   that's why there is also equality in the if stmt below */
+                                for( m = 0, atom2=nbr_atoms[m]; m < max; ++m, atom2=nbr_atoms[m] )
+                                    if( atom1 >= atom2 ) {
+                                        Get_Far_Neighbors( system->atoms[atom1].x,
+                                                system->atoms[atom2].x, 
+                                                &(system->box), control, new_nbrs, &count );
+                                        fprintf( stderr, "\t\t\t%d count:%d\n", atom2, count );
+
+                                        for( c = 0; c < count; ++c )
+                                            if(atom1 != atom2 || (atom1 == atom2 && new_nbrs[c].d>=0.1)){
+                                                Set_Far_Neighbor(&(far_nbrs->select.far_nbr_list[num_far]),
+                                                        atom2, new_nbrs[c].d, 1.0, 
+                                                        new_nbrs[c].dvec, new_nbrs[c].rel_box );
+                                                ++num_far;
+
+                                                /*fprintf(stderr,"FARNBR:%6d%6d%8.3f[%8.3f%8.3f%8.3f]\n",
+                                                  atom1, atom2, new_nbrs[c].d, 
+                                                  new_nbrs[c].dvec[0], new_nbrs[c].dvec[1], 
+                                                  new_nbrs[c].dvec[2] ); */
+                                            }
+                                    }
+
+                                ++itr;
+                            }
+
+                            Set_End_Index( atom1, num_far, far_nbrs );
+                        }
+                    }
+
+            far_nbrs->num_intrs = num_far;  
+            fprintf( stderr, "nbrs done, num_far: %d\n", num_far );
 
 #if defined(DEBUG)
-			for( i = 0; i < system->N; ++i ) {
-				qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
-						Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
-						compare_far_nbrs ); 
-			}
-
-			fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far );
-			fprintf( stderr, "\tallocated farnbrs: %6d\n", 
-					system->N * far_nbrs->intrs_per_unit );
+            for( i = 0; i < system->N; ++i ) {
+                qsort( &(far_nbrs->select.far_nbr_list[ Start_Index(i, far_nbrs) ]), 
+                        Num_Entries(i, far_nbrs), sizeof(far_neighbor_data), 
+                        compare_far_nbrs ); 
+            }
+
+            fprintf( stderr, "step%d: num of farnbrs=%6d\n", data->step, num_far );
+            fprintf( stderr, "\tallocated farnbrs: %6d\n", 
+                    system->N * far_nbrs->intrs_per_unit );
 #endif
-		}
+        }
 
 
 
diff --git a/PuReMD-GPU/src/reduction.cu b/PuReMD-GPU/src/reduction.cu
index 4e2ee5bd..48fb5efc 100644
--- a/PuReMD-GPU/src/reduction.cu
+++ b/PuReMD-GPU/src/reduction.cu
@@ -25,124 +25,124 @@
 
 GLOBAL void Cuda_reduction(const real *input, real *per_block_results, const size_t n)
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = input[i];
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sdata[0];
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = input[i];
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 }
 
 GLOBAL void Cuda_Norm (const real *input, real *per_block_results, const size_t n, int pass)
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		if (pass == INITIAL)
-			x = SQR (input[i]);
-		else 
-			x = input[i];
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		if (pass == INITIAL)
-			per_block_results[blockIdx.x] = sdata[0];
-		else
-			per_block_results[blockIdx.x] = SQRT (sdata[0]);
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        if (pass == INITIAL)
+            x = SQR (input[i]);
+        else 
+            x = input[i];
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        if (pass == INITIAL)
+            per_block_results[blockIdx.x] = sdata[0];
+        else
+            per_block_results[blockIdx.x] = SQRT (sdata[0]);
+    }
 }
 
 GLOBAL void Cuda_Dot (const real *a, const real *b, real *per_block_results, const size_t n )
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = a[i] * b[i];
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sdata[0];
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = a[i] * b[i];
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 }
 
 GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results, const size_t n)
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = input[i * n + i];
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0) 
-	{
-		per_block_results[blockIdx.x] = sdata[0];
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = input[i * n + i];
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0) 
+    {
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 }
 
 
@@ -152,65 +152,65 @@ GLOBAL void Cuda_matrix_col_reduction(const real *input, real *per_block_results
 
 GLOBAL void Cuda_reduction(const int *input, int *per_block_results, const size_t n)
 {
-	extern __shared__ int sh_input[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = input[i];
-	}
-	sh_input[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		per_block_results[blockIdx.x] = sh_input[0];
-	}
+    extern __shared__ int sh_input[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = input[i];
+    }
+    sh_input[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            sh_input[threadIdx.x] += sh_input[threadIdx.x + offset];
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        per_block_results[blockIdx.x] = sh_input[0];
+    }
 }
 
 
 GLOBAL void Cuda_reduction_rvec (rvec *input, rvec *results, size_t n)
 {
-	extern __shared__ rvec svec_data[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec x;
-
-	rvec_MakeZero (x);
-
-	if(i < n)
-	{
-		rvec_Copy (x, input[i]);
-	}
-
-	rvec_Copy (svec_data[threadIdx.x], x);
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{
-			rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]);
-		}
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		//rvec_Copy (results[blockIdx.x], svec_data[0]);
-		rvec_Add (results[blockIdx.x], svec_data[0]);
-	}
+    extern __shared__ rvec svec_data[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec x;
+
+    rvec_MakeZero (x);
+
+    if(i < n)
+    {
+        rvec_Copy (x, input[i]);
+    }
+
+    rvec_Copy (svec_data[threadIdx.x], x);
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {
+            rvec_Add (svec_data[threadIdx.x], svec_data[threadIdx.x + offset]);
+        }
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        //rvec_Copy (results[blockIdx.x], svec_data[0]);
+        rvec_Add (results[blockIdx.x], svec_data[0]);
+    }
 }
 
 //////////////////////////////////////////////////
@@ -219,24 +219,24 @@ GLOBAL void Cuda_reduction_rvec (rvec *input, rvec *results, size_t n)
 
 GLOBAL void Cuda_Vector_Sum( real* dest, real c, real* v, real d, real* y, int k ) 
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= k) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= k) return;
 
-	dest[i] = c * v[i] + d * y[i];
+    dest[i] = c * v[i] + d * y[i];
 }
 
 GLOBAL void Cuda_Vector_Scale( real* dest, real c, real* v, int k ) 
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= k) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= k) return;
 
-	dest[i] = c * v[i];
+    dest[i] = c * v[i];
 }
 
 GLOBAL void Cuda_Vector_Add( real* dest, real c, real* v, int k )
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= k) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= k) return;
 
-	dest[i] += c * v[i];
+    dest[i] += c * v[i];
 }
diff --git a/PuReMD-GPU/src/reset_utils.cu b/PuReMD-GPU/src/reset_utils.cu
index 9e5c5075..0c6f852b 100644
--- a/PuReMD-GPU/src/reset_utils.cu
+++ b/PuReMD-GPU/src/reset_utils.cu
@@ -27,68 +27,68 @@
 
 GLOBAL void Reset_Atoms (reax_atom *atoms, int N)
 {
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
 
-	atoms[i].f[0] = 0.0;
-	atoms[i].f[1] = 0.0;
-	atoms[i].f[2] = 0.0;
+    atoms[i].f[0] = 0.0;
+    atoms[i].f[1] = 0.0;
+    atoms[i].f[2] = 0.0;
 }
 
 void Cuda_Reset_Atoms (reax_system *system )
 {
-	Reset_Atoms <<<BLOCKS, BLOCK_SIZE>>>
-		(system->d_atoms, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Reset_Atoms <<<BLOCKS, BLOCK_SIZE>>>
+        (system->d_atoms, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Reset_Atoms( reax_system* system )
 {
-	int i;
+    int i;
 
-	for( i = 0; i < system->N; ++i )
-		memset( system->atoms[i].f, 0.0, RVEC_SIZE );
+    for( i = 0; i < system->N; ++i )
+        memset( system->atoms[i].f, 0.0, RVEC_SIZE );
 }
 
 
 void Reset_Pressures( simulation_data *data )
 {
-	rtensor_MakeZero( data->flex_bar.P );  
-	data->iso_bar.P = 0;
-	rvec_MakeZero( data->int_press );
-	rvec_MakeZero( data->ext_press );
-	/* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", 
-	   data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
+    rtensor_MakeZero( data->flex_bar.P );  
+    data->iso_bar.P = 0;
+    rvec_MakeZero( data->int_press );
+    rvec_MakeZero( data->ext_press );
+    /* fprintf( stderr, "reset: ext_press (%12.6f %12.6f %12.6f)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 }
 
 
 void Reset_Simulation_Data( simulation_data* data )
 {
-	data->E_BE = 0;
-	data->E_Ov = 0;
-	data->E_Un = 0;
-	data->E_Lp = 0;
-	data->E_Ang = 0;
-	data->E_Pen = 0;
-	data->E_Coa = 0;
-	data->E_HB = 0;
-	data->E_Tor = 0;
-	data->E_Con = 0;
-	data->E_vdW = 0;
-	data->E_Ele = 0;
-	data->E_Kin = 0;
+    data->E_BE = 0;
+    data->E_Ov = 0;
+    data->E_Un = 0;
+    data->E_Lp = 0;
+    data->E_Ang = 0;
+    data->E_Pen = 0;
+    data->E_Coa = 0;
+    data->E_HB = 0;
+    data->E_Tor = 0;
+    data->E_Con = 0;
+    data->E_vdW = 0;
+    data->E_Ele = 0;
+    data->E_Kin = 0;
 }
 
 void Cuda_Sync_Simulation_Data (simulation_data *data)
 {
-	//copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, 
-	//										REAL_SIZE * 12, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-	cuda_memset (&((simulation_data *)data->d_simulation_data)->E_BE, 0, REAL_SIZE * 12, RES_SIMULATION_DATA );
+    //copy_host_device (&data->E_BE, &((simulation_data *)data->d_simulation_data)->E_BE, 
+    //                                        REAL_SIZE * 12, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+    cuda_memset (&((simulation_data *)data->d_simulation_data)->E_BE, 0, REAL_SIZE * 12, RES_SIMULATION_DATA );
 
-	//copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
-	//										REAL_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-	cuda_memset (&((simulation_data *)data->d_simulation_data)->E_Kin, 0, REAL_SIZE, RES_SIMULATION_DATA );
+    //copy_host_device (&data->E_Kin, &((simulation_data *)data->d_simulation_data)->E_Kin, 
+    //                                        REAL_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+    cuda_memset (&((simulation_data *)data->d_simulation_data)->E_Kin, 0, REAL_SIZE, RES_SIMULATION_DATA );
 
 }
 
@@ -96,195 +96,195 @@ void Cuda_Sync_Simulation_Data (simulation_data *data)
 #ifdef TEST_FORCES
 void Reset_Test_Forces( reax_system *system, static_storage *workspace )
 {
-	memset( workspace->f_ele, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_vdw, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_bo, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_be, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_lp, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_ov, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_un, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_ang, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_coa, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_pen, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_hb, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_tor, 0, system->N * sizeof(rvec) );
-	memset( workspace->f_con, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_ele, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_vdw, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_bo, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_be, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_lp, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_ov, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_un, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_ang, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_coa, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_pen, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_hb, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_tor, 0, system->N * sizeof(rvec) );
+    memset( workspace->f_con, 0, system->N * sizeof(rvec) );
 }
 #endif
 
 
 void Reset_Workspace( reax_system *system, static_storage *workspace )
 {
-	memset( workspace->total_bond_order, 0, system->N * sizeof( real ) );
-	memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) );
+    memset( workspace->total_bond_order, 0, system->N * sizeof( real ) );
+    memset( workspace->dDeltap_self, 0, system->N * sizeof( rvec ) );
 
-	memset( workspace->CdDelta, 0, system->N * sizeof( real ) );
-	//memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) );
+    memset( workspace->CdDelta, 0, system->N * sizeof( real ) );
+    //memset( workspace->virial_forces, 0, system->N * sizeof( rvec ) );
 
 #ifdef TEST_FORCES
-	memset( workspace->dDelta, 0, sizeof(rvec) * system->N );
-	Reset_Test_Forces( system, workspace );
+    memset( workspace->dDelta, 0, sizeof(rvec) * system->N );
+    Reset_Test_Forces( system, workspace );
 #endif
 }
 
 void Cuda_Reset_Workspace( reax_system *system, static_storage *workspace )
 {
-	cuda_memset( workspace->total_bond_order, 0, system->N * REAL_SIZE, RES_STORAGE_TOTAL_BOND_ORDER );
-	cuda_memset( workspace->dDeltap_self, 0, system->N * RVEC_SIZE, RES_STORAGE_DDELTAP_SELF );
-	cuda_memset( workspace->CdDelta, 0, system->N * REAL_SIZE, RES_STORAGE_CDDELTA );
+    cuda_memset( workspace->total_bond_order, 0, system->N * REAL_SIZE, RES_STORAGE_TOTAL_BOND_ORDER );
+    cuda_memset( workspace->dDeltap_self, 0, system->N * RVEC_SIZE, RES_STORAGE_DDELTAP_SELF );
+    cuda_memset( workspace->CdDelta, 0, system->N * REAL_SIZE, RES_STORAGE_CDDELTA );
 }
 
 
 GLOBAL void Reset_Neighbor_Lists (single_body_parameters *sbp, reax_atom *atoms, 
-		list bonds, list hbonds, control_params *control, 
-		static_storage workspace, int N)
+        list bonds, list hbonds, control_params *control, 
+        static_storage workspace, int N)
 {
-	int tmp;
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int tmp;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (index >= N) return;
+    if (index >= N) return;
 
-	tmp = Start_Index (index, &bonds);
-	Set_End_Index (index, tmp, &bonds);
+    tmp = Start_Index (index, &bonds);
+    Set_End_Index (index, tmp, &bonds);
 
 
-	if (control->hb_cut > 0) {
-		if ((sbp[ atoms[index].type ].p_hbond == 1) || 
-				(sbp[ atoms[index].type ].p_hbond == 2)) {
-			tmp = Start_Index ( workspace.hbond_index[index], &hbonds );
-			Set_End_Index ( workspace.hbond_index[index], tmp, &hbonds );
-		}
-	}
+    if (control->hb_cut > 0) {
+        if ((sbp[ atoms[index].type ].p_hbond == 1) || 
+                (sbp[ atoms[index].type ].p_hbond == 2)) {
+            tmp = Start_Index ( workspace.hbond_index[index], &hbonds );
+            Set_End_Index ( workspace.hbond_index[index], tmp, &hbonds );
+        }
+    }
 }
 
 void Cuda_Reset_Neighbor_Lists (reax_system *system, control_params *control, 
-		static_storage *workspace, list **lists ) 
+        static_storage *workspace, list **lists ) 
 {
-	Reset_Neighbor_Lists <<<BLOCKS, BLOCK_SIZE>>>
-		( system->reaxprm.d_sbp, system->d_atoms, *(dev_lists + BONDS), *(dev_lists + HBONDS), 
-		  (control_params *)control->d_control, *dev_workspace, system->N );
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//reset here
-	list *bonds = (dev_lists + BONDS );
-	//TODO - check if this is needed.
-	cuda_memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs, LIST_BOND_DATA );
+    Reset_Neighbor_Lists <<<BLOCKS, BLOCK_SIZE>>>
+        ( system->reaxprm.d_sbp, system->d_atoms, *(dev_lists + BONDS), *(dev_lists + HBONDS), 
+          (control_params *)control->d_control, *dev_workspace, system->N );
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //reset here
+    list *bonds = (dev_lists + BONDS );
+    //TODO - check if this is needed.
+    cuda_memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs, LIST_BOND_DATA );
 }
 
 GLOBAL void Reset_Far_Neighbors_List (list far_nbrs, int N)
 {
-	int tmp;
-	int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int tmp;
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (index >= N) return;
+    if (index >= N) return;
 
-	tmp = Start_Index (index, &far_nbrs);
-	Set_End_Index (index, tmp, &far_nbrs);
+    tmp = Start_Index (index, &far_nbrs);
+    Set_End_Index (index, tmp, &far_nbrs);
 }
 
 void Cuda_Reset_Far_Neighbors_List ( reax_system *system )
 {
-	Reset_Far_Neighbors_List <<<BLOCKS, BLOCK_SIZE>>>
-		(*(dev_lists + FAR_NBRS), system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    Reset_Far_Neighbors_List <<<BLOCKS, BLOCK_SIZE>>>
+        (*(dev_lists + FAR_NBRS), system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 }
 
 void Reset_Neighbor_Lists( reax_system *system, control_params *control, 
-		static_storage *workspace, list **lists )
+        static_storage *workspace, list **lists )
 {
-	int i, tmp;
-	list *bonds = (*lists) + BONDS;
-	list *hbonds = (*lists) + HBONDS;
-
-	for( i = 0; i < system->N; ++i ) {
-		tmp = Start_Index( i, bonds );
-		Set_End_Index( i, tmp, bonds );
-	}
-
-	//TODO check if this is needed
-	memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
-
-	if( control->hb_cut > 0 )
-		for( i = 0; i < system->N; ++i )
-			if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) {
-				tmp = Start_Index( workspace->hbond_index[i], hbonds );
-				Set_End_Index( workspace->hbond_index[i], tmp, hbonds );
-				/* fprintf( stderr, "i:%d, hbond: %d-%d\n", 
-				   i, Start_Index( workspace->hbond_index[i], hbonds ), 
-				   End_Index( workspace->hbond_index[i], hbonds ) );*/
-			}
+    int i, tmp;
+    list *bonds = (*lists) + BONDS;
+    list *hbonds = (*lists) + HBONDS;
+
+    for( i = 0; i < system->N; ++i ) {
+        tmp = Start_Index( i, bonds );
+        Set_End_Index( i, tmp, bonds );
+    }
+
+    //TODO check if this is needed
+    memset (bonds->select.bond_list, 0, BOND_DATA_SIZE * bonds->num_intrs );
+
+    if( control->hb_cut > 0 )
+        for( i = 0; i < system->N; ++i )
+            if( system->reaxprm.sbp[system->atoms[i].type].p_hbond == 1) {
+                tmp = Start_Index( workspace->hbond_index[i], hbonds );
+                Set_End_Index( workspace->hbond_index[i], tmp, hbonds );
+                /* fprintf( stderr, "i:%d, hbond: %d-%d\n", 
+                   i, Start_Index( workspace->hbond_index[i], hbonds ), 
+                   End_Index( workspace->hbond_index[i], hbonds ) );*/
+            }
 }
 
 
 void Reset( reax_system *system, control_params *control,  
-		simulation_data *data, static_storage *workspace, list **lists  )
+        simulation_data *data, static_storage *workspace, list **lists  )
 {
-	Reset_Atoms( system );
+    Reset_Atoms( system );
 
-	Reset_Simulation_Data( data );
+    Reset_Simulation_Data( data );
 
-	if( control->ensemble == NPT || control->ensemble == sNPT || 
-			control->ensemble == iNPT )
-		Reset_Pressures( data );
+    if( control->ensemble == NPT || control->ensemble == sNPT || 
+            control->ensemble == iNPT )
+        Reset_Pressures( data );
 
-	Reset_Workspace( system, workspace );  
+    Reset_Workspace( system, workspace );  
 
-	Reset_Neighbor_Lists( system, control, workspace, lists );
+    Reset_Neighbor_Lists( system, control, workspace, lists );
 
 #if defined(DEBUG_FOCUS)  
-	fprintf( stderr, "reset - ");
+    fprintf( stderr, "reset - ");
 #endif
 }
 
 void Cuda_Reset_Sparse_Matrix (reax_system *system, static_storage *workspace)
 {
-	cuda_memset (workspace->H.j, 0, (system->N + 1) * INT_SIZE, RES_SPARSE_MATRIX_INDEX );
-	cuda_memset (workspace->H.val, 0, (system->N * system->max_sparse_matrix_entries) * INT_SIZE, RES_SPARSE_MATRIX_INDEX );
+    cuda_memset (workspace->H.j, 0, (system->N + 1) * INT_SIZE, RES_SPARSE_MATRIX_INDEX );
+    cuda_memset (workspace->H.val, 0, (system->N * system->max_sparse_matrix_entries) * INT_SIZE, RES_SPARSE_MATRIX_INDEX );
 }
 
 void Cuda_Reset( reax_system *system, control_params *control,  
-		simulation_data *data, static_storage *workspace, list **lists  )
+        simulation_data *data, static_storage *workspace, list **lists  )
 {
-	Cuda_Reset_Atoms( system );
+    Cuda_Reset_Atoms( system );
 
-	//Reset_Simulation_Data( data );
-	Cuda_Sync_Simulation_Data ( data );
-	//Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
+    //Reset_Simulation_Data( data );
+    Cuda_Sync_Simulation_Data ( data );
+    //Sync_Host_Device (data, (simulation_data *)data->d_simulation_data, cudaMemcpyHostToDevice);
 
-	if( control->ensemble == NPT || control->ensemble == sNPT || 
-			control->ensemble == iNPT )
-		Reset_Pressures( data );
+    if( control->ensemble == NPT || control->ensemble == sNPT || 
+            control->ensemble == iNPT )
+        Reset_Pressures( data );
 
-	Cuda_Reset_Workspace( system, dev_workspace );  
+    Cuda_Reset_Workspace( system, dev_workspace );  
 
-	Cuda_Reset_Neighbor_Lists( system, control, workspace, lists );
+    Cuda_Reset_Neighbor_Lists( system, control, workspace, lists );
 
-	Cuda_Reset_Far_Neighbors_List (system);
+    Cuda_Reset_Far_Neighbors_List (system);
 
-	Cuda_Reset_Sparse_Matrix (system, dev_workspace);
+    Cuda_Reset_Sparse_Matrix (system, dev_workspace);
 
 }
 
 
 void Reset_Grid( grid *g )
 {
-	memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]);
+    memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2]);
 }
 
 void Cuda_Reset_Grid (grid *g)
 {
-	cuda_memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2], RES_GRID_TOP);
+    cuda_memset (g->top, 0, INT_SIZE * g->ncell[0]*g->ncell[1]*g->ncell[2], RES_GRID_TOP);
 }
 
 
 void Reset_Marks( grid *g, ivec *grid_stack, int grid_top )
 {
-	int i;
+    int i;
 
-	for( i = 0; i < grid_top; ++i )
-		g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + 
-			grid_stack[i][1] * g->ncell[2] + 
-			grid_stack[i][2]] = 0;
+    for( i = 0; i < grid_top; ++i )
+        g->mark[grid_stack[i][0] * g->ncell[1]*g->ncell[2] + 
+            grid_stack[i][1] * g->ncell[2] + 
+            grid_stack[i][2]] = 0;
 }
diff --git a/PuReMD-GPU/src/single_body_interactions.cu b/PuReMD-GPU/src/single_body_interactions.cu
index 2c3fd44f..3c6c0882 100644
--- a/PuReMD-GPU/src/single_body_interactions.cu
+++ b/PuReMD-GPU/src/single_body_interactions.cu
@@ -29,289 +29,289 @@
 
 
 void LonePair_OverUnder_Coordination_Energy( reax_system *system, 
-		control_params *control, 
-		simulation_data *data,
-		static_storage *workspace, 
-		list **lists, 
-		output_controls *out_control )
+        control_params *control, 
+        simulation_data *data,
+        static_storage *workspace, 
+        list **lists, 
+        output_controls *out_control )
 {
-	int i, j, pj, type_i, type_j;
-	real Delta_lpcorr, dfvl;
-	real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-	real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-	real e_ov, CEover1, CEover2, CEover3, CEover4;
-	real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
-	real exp_ovun2n, exp_ovun6, exp_ovun8;
-	real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
-	real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-	real p_lp1, p_lp2, p_lp3;
-	real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_data *pbond;
-	bond_order_data *bo_ij; 
-	list *bonds = (*lists) + BONDS;
-
-	/* Initialize parameters */
-	p_lp1 = system->reaxprm.gp.l[15];
-	p_lp3 = system->reaxprm.gp.l[5];
-	p_ovun3 = system->reaxprm.gp.l[32];
-	p_ovun4 = system->reaxprm.gp.l[31];
-	p_ovun6 = system->reaxprm.gp.l[6];
-	p_ovun7 = system->reaxprm.gp.l[8];
-	p_ovun8 = system->reaxprm.gp.l[9];
-
-	for( i = 0; i < system->N; ++i ) {
-		/* set the parameter pointer */
-		type_i = system->atoms[i].type;
-		sbp_i = &(system->reaxprm.sbp[ type_i ]);
-
-		/* lone-pair Energy */
-		p_lp2 = sbp_i->p_lp2;      
-		expvd2 = EXP( -75 * workspace->Delta_lp[i] );
-		inv_expvd2 = 1. / (1. + expvd2 );
-
-		/* calculate the energy */
-		data->E_Lp += e_lp = 
-			p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-
-		dElp = p_lp2 * inv_expvd2 + 
-			75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
-		CElp = dElp * workspace->dDelta_lp[i];
-
-		workspace->CdDelta[i] += CElp;      // lp - 1st term
+    int i, j, pj, type_i, type_j;
+    real Delta_lpcorr, dfvl;
+    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+    real exp_ovun2n, exp_ovun6, exp_ovun8;
+    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+    real p_lp1, p_lp2, p_lp3;
+    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
+
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond;
+    bond_order_data *bo_ij; 
+    list *bonds = (*lists) + BONDS;
+
+    /* Initialize parameters */
+    p_lp1 = system->reaxprm.gp.l[15];
+    p_lp3 = system->reaxprm.gp.l[5];
+    p_ovun3 = system->reaxprm.gp.l[32];
+    p_ovun4 = system->reaxprm.gp.l[31];
+    p_ovun6 = system->reaxprm.gp.l[6];
+    p_ovun7 = system->reaxprm.gp.l[8];
+    p_ovun8 = system->reaxprm.gp.l[9];
+
+    for( i = 0; i < system->N; ++i ) {
+        /* set the parameter pointer */
+        type_i = system->atoms[i].type;
+        sbp_i = &(system->reaxprm.sbp[ type_i ]);
+
+        /* lone-pair Energy */
+        p_lp2 = sbp_i->p_lp2;      
+        expvd2 = EXP( -75 * workspace->Delta_lp[i] );
+        inv_expvd2 = 1. / (1. + expvd2 );
+
+        /* calculate the energy */
+        data->E_Lp += e_lp = 
+            p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+
+        dElp = p_lp2 * inv_expvd2 + 
+            75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+        CElp = dElp * workspace->dDelta_lp[i];
+
+        workspace->CdDelta[i] += CElp;      // lp - 1st term
 
 #ifdef TEST_ENERGY
-		fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
-				p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
-		fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
-				workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
+        fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
+                p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
+        fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
+                workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
 #endif
 #ifdef TEST_FORCES
-		Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
+        Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
 #endif
 
-		/* correction for C2 */
-		if( system->reaxprm.gp.l[5] > 0.001 && 
-				!strcmp( system->reaxprm.sbp[type_i].name, "C" ) )
-			for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-				if( i < bonds->select.bond_list[pj].nbr ) {
-					j = bonds->select.bond_list[pj].nbr;
-					type_j = system->atoms[j].type;
-
-					if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) {
-						twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
-						bo_ij = &( bonds->select.bond_list[pj].bo_data );
-						Di = workspace->Delta[i];
-						vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
-
-						if( vov3 > 3. ) {
-							data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0);
-							//estrain(i) += e_lph;
-
-							deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-							deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
-
-							bo_ij->Cdbo += deahu2dbo;
-							workspace->CdDelta[i] += deahu2dsbo;
+        /* correction for C2 */
+        if( system->reaxprm.gp.l[5] > 0.001 && 
+                !strcmp( system->reaxprm.sbp[type_i].name, "C" ) )
+            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+                if( i < bonds->select.bond_list[pj].nbr ) {
+                    j = bonds->select.bond_list[pj].nbr;
+                    type_j = system->atoms[j].type;
+
+                    if( !strcmp( system->reaxprm.sbp[type_j].name, "C" ) ) {
+                        twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
+                        bo_ij = &( bonds->select.bond_list[pj].bo_data );
+                        Di = workspace->Delta[i];
+                        vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+
+                        if( vov3 > 3. ) {
+                            data->E_Lp += e_lph = p_lp3 * SQR(vov3-3.0);
+                            //estrain(i) += e_lph;
+
+                            deahu2dbo = 2.*p_lp3*(vov3 - 3.);
+                            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+
+                            bo_ij->Cdbo += deahu2dbo;
+                            workspace->CdDelta[i] += deahu2dsbo;
 #ifdef TEST_ENERGY
-							fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
-									// workspace->orig_id[i], workspace->orig_id[j],
-									i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
+                            fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
+                                    // workspace->orig_id[i], workspace->orig_id[j],
+                                    i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
 #endif
 #ifdef TEST_FORCES
-							Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
-							Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
+                            Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
+                            Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
 #endif
-						}
-					}
+                        }
+                    }
 
-				}
-	}
+                }
+    }
 
 
-	for( i = 0; i < system->N; ++i ) {
-		type_i = system->atoms[i].type;
-		sbp_i = &(system->reaxprm.sbp[ type_i ]);
+    for( i = 0; i < system->N; ++i ) {
+        type_i = system->atoms[i].type;
+        sbp_i = &(system->reaxprm.sbp[ type_i ]);
 
-		/* over-coordination energy */
-		if( sbp_i->mass > 21.0 ) 
-			dfvl = 0.0;
-		else dfvl = 1.0; // only for 1st-row elements
+        /* over-coordination energy */
+        if( sbp_i->mass > 21.0 ) 
+            dfvl = 0.0;
+        else dfvl = 1.0; // only for 1st-row elements
 
-		p_ovun2 = sbp_i->p_ovun2;
-		sum_ovun1 = 0;
-		sum_ovun2 = 0;
+        p_ovun2 = sbp_i->p_ovun2;
+        sum_ovun1 = 0;
+        sum_ovun2 = 0;
 
-		for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
-			j = bonds->select.bond_list[pj].nbr;
-			type_j = system->atoms[j].type;	  
-			bo_ij = &(bonds->select.bond_list[pj].bo_data);
-			sbp_j = &(system->reaxprm.sbp[ type_j ]);
-			twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
+        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
+            j = bonds->select.bond_list[pj].nbr;
+            type_j = system->atoms[j].type;      
+            bo_ij = &(bonds->select.bond_list[pj].bo_data);
+            sbp_j = &(system->reaxprm.sbp[ type_j ]);
+            twbp = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
 
-			sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-			sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-				( bo_ij->BO_pi + bo_ij->BO_pi2 );
+            sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
+            sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
+                ( bo_ij->BO_pi + bo_ij->BO_pi2 );
 
-			/*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
-			  i+1, j+1, 
-			  dfvl * workspace->Delta_lp_temp[j],
-			  sbp_j->nlp_opt,
-			  workspace->nlp_temp[j] );*/
-		}
+            /*fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
+              i+1, j+1, 
+              dfvl * workspace->Delta_lp_temp[j],
+              sbp_j->nlp_opt,
+              workspace->nlp_temp[j] );*/
+        }
 
-		exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
-		inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-		Delta_lpcorr  = workspace->Delta[i] - 
-			(dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+        exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
+        inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
+        Delta_lpcorr  = workspace->Delta[i] - 
+            (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
 
-		exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
-		inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
+        exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
+        inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
 
-		DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
-		CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
+        DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
+        CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-		data->E_Ov += e_ov = sum_ovun1 * CEover1;
+        data->E_Ov += e_ov = sum_ovun1 * CEover1;
 
-		CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-			( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
+        CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
+            ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
 
-		CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
+        CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
 
-		CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
-			p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
+        CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
+            p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
 
-		/* under-coordination potential */
-		p_ovun2 = sbp_i->p_ovun2;
-		p_ovun5 = sbp_i->p_ovun5;
+        /* under-coordination potential */
+        p_ovun2 = sbp_i->p_ovun2;
+        p_ovun5 = sbp_i->p_ovun5;
 
-		exp_ovun2n = 1.0 / exp_ovun2;
-		exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
-		exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
-		inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
-		inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
+        exp_ovun2n = 1.0 / exp_ovun2;
+        exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
+        exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
+        inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
+        inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-		data->E_Un += e_un =
-			-p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+        data->E_Un += e_un =
+            -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
-		CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
-				p_ovun2 * e_un * exp_ovun2n);
-		CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-		CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-		CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
-			p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
+        CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
+                p_ovun2 * e_un * exp_ovun2n);
+        CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
+        CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
+        CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+            p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
-		//fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
-		//       i+1, sum_ovun2, e_ov, e_un );
+        //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
+        //       i+1, sum_ovun2, e_ov, e_un );
 
-		/* forces */
-		workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
-		workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
+        /* forces */
+        workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
+        workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
 
 #ifdef TEST_FORCES
-		Add_dDelta( system, lists, i, CEover3, workspace->f_ov );  // OvCoor - 2nd
-		Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
+        Add_dDelta( system, lists, i, CEover3, workspace->f_ov );  // OvCoor - 2nd
+        Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
 #endif
 
 
-		for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
-			pbond = &(bonds->select.bond_list[pj]);
-			j = pbond->nbr;
-			type_j = system->atoms[j].type;
-			bo_ij = &(pbond->bo_data);
-			twbp  = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
+        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+            pbond = &(bonds->select.bond_list[pj]);
+            j = pbond->nbr;
+            type_j = system->atoms[j].type;
+            bo_ij = &(pbond->bo_data);
+            twbp  = &(system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ]);
 
 
-			bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
-			workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-				(bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
-			bo_ij->Cdbopi += CEover4 * 
-				(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-			bo_ij->Cdbopi2 += CEover4 * 
-				(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+            bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
+            workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+                (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
+            bo_ij->Cdbopi += CEover4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+            bo_ij->Cdbopi2 += CEover4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
 
 
-			workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) *
-				(bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
-			bo_ij->Cdbopi += CEunder4 * 
-				(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-			bo_ij->Cdbopi2 += CEunder4 * 
-				(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+            workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) *
+                (bo_ij->BO_pi + bo_ij->BO_pi2);   // UnCoor - 2a
+            bo_ij->Cdbopi += CEunder4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+            bo_ij->Cdbopi2 += CEunder4 * 
+                (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
 
 
 #ifdef TEST_ENERGY
-			/* fprintf( out_control->eov, "%6d%23.15e%23.15e"
-			   workspace->orig_id[j]+1,
-			//twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
-			CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
-
-			/*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-			  workspace->orig_id[j]+1, 
-			  CEover4,
-			  CEover4*
-			  (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-			  CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
-			  (1.0 - dfvl*workspace->dDelta_lp[j]),
-			  CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-			  (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
-
-			/* fprintf( out_control->eun, "%6d%23.15e\n",
-			   workspace->orig_id[j]+1, CEunder3 ); */
-
-			/*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
-			  workspace->orig_id[j]+1,
-			  CEunder4,
-			  (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-			  CEunder4*
-			  (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-			  CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-			  (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+            /* fprintf( out_control->eov, "%6d%23.15e%23.15e"
+               workspace->orig_id[j]+1,
+            //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
+            CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); */
+
+            /*fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
+              workspace->orig_id[j]+1, 
+              CEover4,
+              CEover4*
+              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+              CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
+              (1.0 - dfvl*workspace->dDelta_lp[j]),
+              CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
+
+            /* fprintf( out_control->eun, "%6d%23.15e\n",
+               workspace->orig_id[j]+1, CEunder3 ); */
+
+            /*fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
+              workspace->orig_id[j]+1,
+              CEunder4,
+              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+              CEunder4*
+              (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+              CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+              (bo_ij->BO_pi + bo_ij->BO_pi2) );*/
 #endif
 
 #ifdef TEST_FORCES
-			Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
-					workspace->f_ov ); // OvCoor - 1st term
-
-			Add_dDelta( system, lists, j,
-					CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-					(bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
-
-			Add_dBOpinpi2( system, lists, i, pj, 
-					CEover4 * (workspace->Delta[j] - 
-						dfvl * workspace->Delta_lp_temp[j]),
-					CEover4 * (workspace->Delta[j] - 
-						dfvl * workspace->Delta_lp_temp[j]),
-					workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
-
-			Add_dDelta( system, lists, j,
-					CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-					(bo_ij->BO_pi + bo_ij->BO_pi2),
-					workspace->f_un ); // UnCoor - 2a
-
-			Add_dBOpinpi2( system, lists, i, pj, 
-					CEunder4 * (workspace->Delta[j] - 
-						dfvl * workspace->Delta_lp_temp[j]),
-					CEunder4 * (workspace->Delta[j] - 
-						dfvl * workspace->Delta_lp_temp[j]),
-					workspace->f_un, workspace->f_un ); // UnCoor - 2b
+            Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
+                    workspace->f_ov ); // OvCoor - 1st term
+
+            Add_dDelta( system, lists, j,
+                    CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+                    (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
+
+            Add_dBOpinpi2( system, lists, i, pj, 
+                    CEover4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    CEover4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
+
+            Add_dDelta( system, lists, j,
+                    CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+                    (bo_ij->BO_pi + bo_ij->BO_pi2),
+                    workspace->f_un ); // UnCoor - 2a
+
+            Add_dBOpinpi2( system, lists, i, pj, 
+                    CEunder4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    CEunder4 * (workspace->Delta[j] - 
+                        dfvl * workspace->Delta_lp_temp[j]),
+                    workspace->f_un, workspace->f_un ); // UnCoor - 2b
 #endif
-		}
+        }
 
 #ifdef TEST_ENERGY      
 
-		fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", 
-				i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); 
+        fprintf( out_control->eov, "%6d%15.8f%15.8f%15.8f\n", 
+                i+1, DlpVi, Delta_lpcorr, sbp_i->valency ); 
 
-		fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-				i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
+                i+1/*workspace->orig_id[i]+1*/, e_ov, data->E_Ov + data->E_Un );
 
-		fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
-				i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
+        fprintf( out_control->eov, "%6d%15.8f%15.8f\n", 
+                i+1/*workspace->orig_id[i]+1*/, e_un, data->E_Ov + data->E_Un );
 #endif
-	}
+    }
 }
 
 
@@ -324,324 +324,324 @@ void LonePair_OverUnder_Coordination_Energy( reax_system *system,
 
 //CUDA Functions
 GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		static_storage p_workspace, simulation_data *data,
-		list p_bonds, int N, int num_atom_types )
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        static_storage p_workspace, simulation_data *data,
+        list p_bonds, int N, int num_atom_types )
 {
-	int i, j, pj, type_i, type_j;
-	real Delta_lpcorr, dfvl;
-	real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-	real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-	real e_ov, CEover1, CEover2, CEover3, CEover4;
-	real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
-	real exp_ovun2n, exp_ovun6, exp_ovun8;
-	real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
-	real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-	real p_lp1, p_lp2, p_lp3;
-	real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_data *pbond;
-	bond_order_data *bo_ij; 
-	list *bonds = &p_bonds;
-	static_storage *workspace = &p_workspace;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	//if (i >= N) return;
-
-	/* Initialize parameters */
-	p_lp1 = g_params.l[15];
-	p_lp3 = g_params.l[5];
-	p_ovun3 = g_params.l[32];
-	p_ovun4 = g_params.l[31];
-	p_ovun6 = g_params.l[6];
-	p_ovun7 = g_params.l[8];
-	p_ovun8 = g_params.l[9];
-
-	//for( i = 0; i < system->N; ++i ) {
-	if (i < N) {
-		// set the parameter pointer 
-		type_i = atoms[i].type;
-		sbp_i = &(sbp[ type_i ]);
-
-		// lone-pair Energy 
-		p_lp2 = sbp_i->p_lp2;      
-		expvd2 = EXP( -75 * workspace->Delta_lp[i] );
-		inv_expvd2 = 1. / (1. + expvd2 );
-
-		// calculate the energy 
-		e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-
-		//PERFORMANCE IMPACT
-		atomicAdd (&data->E_Lp, e_lp);
-
-		dElp = p_lp2 * inv_expvd2 + 
-			75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
-		CElp = dElp * workspace->dDelta_lp[i];
-
-		//PERFORMANCE IMPACT
-		//workspace->CdDelta[i] += CElp;      // lp - 1st term
-		atomicAdd (&workspace->CdDelta[i], CElp);
+    int i, j, pj, type_i, type_j;
+    real Delta_lpcorr, dfvl;
+    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+    real exp_ovun2n, exp_ovun6, exp_ovun8;
+    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+    real p_lp1, p_lp2, p_lp3;
+    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
+
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond;
+    bond_order_data *bo_ij; 
+    list *bonds = &p_bonds;
+    static_storage *workspace = &p_workspace;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    //if (i >= N) return;
+
+    /* Initialize parameters */
+    p_lp1 = g_params.l[15];
+    p_lp3 = g_params.l[5];
+    p_ovun3 = g_params.l[32];
+    p_ovun4 = g_params.l[31];
+    p_ovun6 = g_params.l[6];
+    p_ovun7 = g_params.l[8];
+    p_ovun8 = g_params.l[9];
+
+    //for( i = 0; i < system->N; ++i ) {
+    if (i < N) {
+        // set the parameter pointer 
+        type_i = atoms[i].type;
+        sbp_i = &(sbp[ type_i ]);
+
+        // lone-pair Energy 
+        p_lp2 = sbp_i->p_lp2;      
+        expvd2 = EXP( -75 * workspace->Delta_lp[i] );
+        inv_expvd2 = 1. / (1. + expvd2 );
+
+        // calculate the energy 
+        e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+
+        //PERFORMANCE IMPACT
+        atomicAdd (&data->E_Lp, e_lp);
+
+        dElp = p_lp2 * inv_expvd2 + 
+            75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+        CElp = dElp * workspace->dDelta_lp[i];
+
+        //PERFORMANCE IMPACT
+        //workspace->CdDelta[i] += CElp;      // lp - 1st term
+        atomicAdd (&workspace->CdDelta[i], CElp);
 
 
 #ifdef TEST_ENERGY
-		//TODO
-		//fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
-		//   p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
-		//fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
-		//   workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
+        //TODO
+        //fprintf( out_control->elp, "%23.15e%23.15e%23.15e%23.15e\n", 
+        //   p_lp2, workspace->Delta_lp_temp[i], expvd2, dElp );
+        //fprintf( out_control->elp, "%6d%23.15e%23.15e%23.15e\n",
+        //   workspace->orig_id[i]+1, workspace->nlp[i], e_lp, data->E_Lp );
 #endif
 #ifdef TEST_FORCES
-		//TODO
-		//Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
-		//TODO
+        //TODO
+        //Add_dDelta( system, lists, i, CElp, workspace->f_lp );  // lp - 1st term
+        //TODO
 #endif
 
-		// correction for C2 
-		if( g_params.l[5] > 0.001 && 
-				!cuda_strcmp( sbp[type_i].name, "C" , 15) )
-			for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-				if( i < bonds->select.bond_list[pj].nbr ) {
-					j = bonds->select.bond_list[pj].nbr;
-					type_j = atoms[j].type;
+        // correction for C2 
+        if( g_params.l[5] > 0.001 && 
+                !cuda_strcmp( sbp[type_i].name, "C" , 15) )
+            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+                if( i < bonds->select.bond_list[pj].nbr ) {
+                    j = bonds->select.bond_list[pj].nbr;
+                    type_j = atoms[j].type;
 
-					if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
-						twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
-						bo_ij = &( bonds->select.bond_list[pj].bo_data );
-						Di = workspace->Delta[i];
-						vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+                    if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
+                        twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+                        bo_ij = &( bonds->select.bond_list[pj].bo_data );
+                        Di = workspace->Delta[i];
+                        vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
 
-						if( vov3 > 3. ) {
+                        if( vov3 > 3. ) {
 
-							//PERFORMANCE IMPACT
-							e_lph = p_lp3 * SQR(vov3-3.0);
-							atomicAdd (&data->E_Lp, e_lph );
-							//estrain(i) += e_lph;
+                            //PERFORMANCE IMPACT
+                            e_lph = p_lp3 * SQR(vov3-3.0);
+                            atomicAdd (&data->E_Lp, e_lph );
+                            //estrain(i) += e_lph;
 
-							deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-							deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+                            deahu2dbo = 2.*p_lp3*(vov3 - 3.);
+                            deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
 
-							bo_ij->Cdbo += deahu2dbo;
+                            bo_ij->Cdbo += deahu2dbo;
 
 
-							//PERFORMANCE IMPACT
-							atomicAdd (&workspace->CdDelta[i], deahu2dsbo);
+                            //PERFORMANCE IMPACT
+                            atomicAdd (&workspace->CdDelta[i], deahu2dsbo);
 #ifdef TEST_ENERGY
-							//TODO
-							//fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
-							// workspace->orig_id[i], workspace->orig_id[j],
-							//  i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
+                            //TODO
+                            //fprintf(out_control->elp,"C2cor%6d%6d%23.15e%23.15e%23.15e\n",
+                            // workspace->orig_id[i], workspace->orig_id[j],
+                            //  i+1, j+1, e_lph, deahu2dbo, deahu2dsbo );
 #endif
 #ifdef TEST_FORCES
-							//TODO
-							//Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
-							//Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
+                            //TODO
+                            //Add_dBO(system, lists, i, pj, deahu2dbo, workspace->f_lp);
+                            //Add_dDelta(system, lists, i, deahu2dsbo, workspace->f_lp);
 #endif
-						}
-					}
+                        }
+                    }
 
-				}
-	} // end of if statement for the all the threads
+                }
+    } // end of if statement for the all the threads
 
-	__syncthreads ();
+    __syncthreads ();
 
-	//TODO
-	if (i >= N) return;
-	//TODO
+    //TODO
+    if (i >= N) return;
+    //TODO
 
 
-	//for( i = 0; i < system->N; ++i ) {
-	type_i = atoms[i].type;
-	sbp_i = &(sbp[ type_i ]);
+    //for( i = 0; i < system->N; ++i ) {
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[ type_i ]);
 
-	// over-coordination energy 
-	if( sbp_i->mass > 21.0 ) 
-		dfvl = 0.0;
-	else dfvl = 1.0; // only for 1st-row elements
+    // over-coordination energy 
+    if( sbp_i->mass > 21.0 ) 
+        dfvl = 0.0;
+    else dfvl = 1.0; // only for 1st-row elements
 
-	p_ovun2 = sbp_i->p_ovun2;
-	sum_ovun1 = 0;
-	sum_ovun2 = 0;
+    p_ovun2 = sbp_i->p_ovun2;
+    sum_ovun1 = 0;
+    sum_ovun2 = 0;
 
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
-		j = bonds->select.bond_list[pj].nbr;
-		type_j = atoms[j].type;	  
-		bo_ij = &(bonds->select.bond_list[pj].bo_data);
-		sbp_j = &(sbp[ type_j ]);
-		twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
+        j = bonds->select.bond_list[pj].nbr;
+        type_j = atoms[j].type;      
+        bo_ij = &(bonds->select.bond_list[pj].bo_data);
+        sbp_j = &(sbp[ type_j ]);
+        twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
 
-		sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-		sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-			( bo_ij->BO_pi + bo_ij->BO_pi2 );
+        sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
+        sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
+            ( bo_ij->BO_pi + bo_ij->BO_pi2 );
 
-		//fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
-		//i+1, j+1, 
-		//dfvl * workspace->Delta_lp_temp[j],
-		//sbp_j->nlp_opt,
-		//workspace->nlp_temp[j] );
-	}
+        //fprintf( stdout, "%4d%4d%23.15e%23.15e%23.15e\n", 
+        //i+1, j+1, 
+        //dfvl * workspace->Delta_lp_temp[j],
+        //sbp_j->nlp_opt,
+        //workspace->nlp_temp[j] );
+    }
 
-	//__syncthreads ();
+    //__syncthreads ();
 
 
 
-	exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
-	inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-	Delta_lpcorr  = workspace->Delta[i] - 
-		(dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+    exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
+    inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
+    Delta_lpcorr  = workspace->Delta[i] - 
+        (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
 
-	exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
-	inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
+    exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
+    inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
 
-	DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
-	CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
+    DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
+    CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-	//PERFORMANCE IMPACT
-	//data->E_Ov += e_ov = sum_ovun1 * CEover1;
-	e_ov = sum_ovun1 * CEover1;
-	atomicAdd (&data->E_Ov, e_ov ); 
+    //PERFORMANCE IMPACT
+    //data->E_Ov += e_ov = sum_ovun1 * CEover1;
+    e_ov = sum_ovun1 * CEover1;
+    atomicAdd (&data->E_Ov, e_ov ); 
 
-	CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-		( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
+    CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
+        ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
 
-	CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
+    CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
 
-	CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
-		p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
+    CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
 
-	// under-coordination potential 
-	p_ovun2 = sbp_i->p_ovun2;
-	p_ovun5 = sbp_i->p_ovun5;
+    // under-coordination potential 
+    p_ovun2 = sbp_i->p_ovun2;
+    p_ovun5 = sbp_i->p_ovun5;
 
-	exp_ovun2n = 1.0 / exp_ovun2;
-	exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
-	exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
-	inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
-	inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
+    exp_ovun2n = 1.0 / exp_ovun2;
+    exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
+    exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
+    inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
+    inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-	//PERFORMANCE IMPACT
-	e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
-	atomicAdd (&data->E_Un, e_un );
+    //PERFORMANCE IMPACT
+    e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    atomicAdd (&data->E_Un, e_un );
 
-	CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
-			p_ovun2 * e_un * exp_ovun2n);
-	CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-	CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-	CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
-		p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
+    CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
+            p_ovun2 * e_un * exp_ovun2n);
+    CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
+    CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
+    CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
-	//fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
-	//       i+1, sum_ovun2, e_ov, e_un );
+    //fprintf( stdout, "%6d%23.15e%23.15e%23.15e\n",
+    //       i+1, sum_ovun2, e_ov, e_un );
 
-	// forces 
-	//PERFORMANCE IMPACT
-	atomicAdd (&workspace->CdDelta[i] , CEover3);   // OvCoor - 2nd term
-	atomicAdd (&workspace->CdDelta[i], CEunder3);  // UnCoor - 1st term
+    // forces 
+    //PERFORMANCE IMPACT
+    atomicAdd (&workspace->CdDelta[i] , CEover3);   // OvCoor - 2nd term
+    atomicAdd (&workspace->CdDelta[i], CEunder3);  // UnCoor - 1st term
 
 #ifdef TEST_FORCES
-	//TODO
-	//Add_dDelta( system, lists, i, CEover3, workspace->f_ov );  // OvCoor - 2nd
-	//Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
+    //TODO
+    //Add_dDelta( system, lists, i, CEover3, workspace->f_ov );  // OvCoor - 2nd
+    //Add_dDelta( system, lists, i, CEunder3, workspace->f_un ); // UnCoor - 1st
 #endif
 
-	//__syncthreads ();
+    //__syncthreads ();
 
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
-		pbond = &(bonds->select.bond_list[pj]);
-		j = pbond->nbr;
-		type_j = atoms[j].type;
-		bo_ij = &(pbond->bo_data);
-		twbp  = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+        pbond = &(bonds->select.bond_list[pj]);
+        j = pbond->nbr;
+        type_j = atoms[j].type;
+        bo_ij = &(pbond->bo_data);
+        twbp  = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
 
 
-		bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
+        bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
 
-		//PERFORMANCE IMPACT
-		atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a
+        //PERFORMANCE IMPACT
+        atomicAdd (&workspace->CdDelta[j], CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2)); // OvCoor - 3a
 
-		bo_ij->Cdbopi += CEover4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-		bo_ij->Cdbopi2 += CEover4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+        bo_ij->Cdbopi += CEover4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+        bo_ij->Cdbopi2 += CEover4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
 
 
-		//PERFORMANCE IMPACT
-		atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) );   // UnCoor - 2a
+        //PERFORMANCE IMPACT
+        atomicAdd (&workspace->CdDelta[j], CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) );   // UnCoor - 2a
 
-		bo_ij->Cdbopi += CEunder4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-		bo_ij->Cdbopi2 += CEunder4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+        bo_ij->Cdbopi += CEunder4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+        bo_ij->Cdbopi2 += CEunder4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
 
 
 #ifdef TEST_ENERGY
-		//       fprintf( out_control->eov, "%6d%23.15e%23.15e"
-		//	 workspace->orig_id[j]+1,
-		//twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
-		//	 CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); 
-
-		//      fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
-		//	workspace->orig_id[j]+1, 
-		//	CEover4,
-		//	CEover4*
-		//	(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-		//	CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
-		//	(1.0 - dfvl*workspace->dDelta_lp[j]),
-		//	CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-		//	(bo_ij->BO_pi + bo_ij->BO_pi2) );
-
-		//      fprintf( out_control->eun, "%6d%23.15e\n",
-		//	 workspace->orig_id[j]+1, CEunder3 ); 
-
-		//     fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
-		//	workspace->orig_id[j]+1,
-		//	CEunder4,
-		//	(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-		//	CEunder4*
-		//	(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
-		//	CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
-		//	(bo_ij->BO_pi + bo_ij->BO_pi2) );
+        //       fprintf( out_control->eov, "%6d%23.15e%23.15e"
+        //     workspace->orig_id[j]+1,
+        //twbp->p_ovun1,twbp->De_s,Delta_lpcorr*DlpVi*inv_exp_ovun2,
+        //     CEover1*twbp->p_ovun1*twbp->De_s, CEover3 ); 
+
+        //      fprintf( out_control->eov, "%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n", 
+        //    workspace->orig_id[j]+1, 
+        //    CEover4,
+        //    CEover4*
+        //    (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+        //    CEover4 * (bo_ij->BO_pi + bo_ij->BO_pi2), 
+        //    (1.0 - dfvl*workspace->dDelta_lp[j]),
+        //    CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+        //    (bo_ij->BO_pi + bo_ij->BO_pi2) );
+
+        //      fprintf( out_control->eun, "%6d%23.15e\n",
+        //     workspace->orig_id[j]+1, CEunder3 ); 
+
+        //     fprintf( out_control->eun, "%6d%23.15e%23.15e%23.15e%23.15e\n",
+        //    workspace->orig_id[j]+1,
+        //    CEunder4,
+        //    (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+        //    CEunder4*
+        //    (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]),
+        //    CEunder4*(1.0 - dfvl*workspace->dDelta_lp[j])*
+        //    (bo_ij->BO_pi + bo_ij->BO_pi2) );
 #endif
 
 #ifdef TEST_FORCES
-		//TODO
-		//      Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
-		//	       workspace->f_ov ); // OvCoor - 1st term
-
-		//     Add_dDelta( system, lists, j,
-		//		  CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-		//		  (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
-
-		//     Add_dBOpinpi2( system, lists, i, pj, 
-		//		     CEover4 * (workspace->Delta[j] - 
-		//				dfvl * workspace->Delta_lp_temp[j]),
-		//		     CEover4 * (workspace->Delta[j] - 
-		//				dfvl * workspace->Delta_lp_temp[j]),
-		//		     workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
-
-		//     Add_dDelta( system, lists, j,
-		//		  CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
-		//		  (bo_ij->BO_pi + bo_ij->BO_pi2),
-		//		  workspace->f_un ); // UnCoor - 2a
-
-		//     Add_dBOpinpi2( system, lists, i, pj, 
-		//		     CEunder4 * (workspace->Delta[j] - 
-		//				 dfvl * workspace->Delta_lp_temp[j]),
-		//		     CEunder4 * (workspace->Delta[j] - 
-		//				 dfvl * workspace->Delta_lp_temp[j]),
-		//		     workspace->f_un, workspace->f_un ); // UnCoor - 2b
+        //TODO
+        //      Add_dBO( system, lists, i, pj, CEover1 * twbp->p_ovun1 * twbp->De_s, 
+        //           workspace->f_ov ); // OvCoor - 1st term
+
+        //     Add_dDelta( system, lists, j,
+        //          CEover4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+        //          (bo_ij->BO_pi+bo_ij->BO_pi2), workspace->f_ov );//OvCoor3a
+
+        //     Add_dBOpinpi2( system, lists, i, pj, 
+        //             CEover4 * (workspace->Delta[j] - 
+        //                dfvl * workspace->Delta_lp_temp[j]),
+        //             CEover4 * (workspace->Delta[j] - 
+        //                dfvl * workspace->Delta_lp_temp[j]),
+        //             workspace->f_ov, workspace->f_ov ); // OvCoor - 3b
+
+        //     Add_dDelta( system, lists, j,
+        //          CEunder4 * (1.0 - dfvl*workspace->dDelta_lp[j]) * 
+        //          (bo_ij->BO_pi + bo_ij->BO_pi2),
+        //          workspace->f_un ); // UnCoor - 2a
+
+        //     Add_dBOpinpi2( system, lists, i, pj, 
+        //             CEunder4 * (workspace->Delta[j] - 
+        //                 dfvl * workspace->Delta_lp_temp[j]),
+        //             CEunder4 * (workspace->Delta[j] - 
+        //                 dfvl * workspace->Delta_lp_temp[j]),
+        //             workspace->f_un, workspace->f_un ); // UnCoor - 2b
 #endif
-	}
+    }
 
 #ifdef TEST_ENERGY      
 
-	//TODO
-	//replace the code here... you deleted for compiling
-	//TODO
+    //TODO
+    //replace the code here... you deleted for compiling
+    //TODO
 #endif
-	//} .. end of for loop
+    //} .. end of for loop
 
 
 }
@@ -656,325 +656,325 @@ GLOBAL void Cuda_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, glob
 
 //CUDA Functions
 GLOBAL void test_LonePair_OverUnder_Coordination_Energy ( reax_atom *atoms, global_parameters g_params, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		static_storage p_workspace, simulation_data *data,
-		list p_bonds, int N, int num_atom_types, 
-		real *E_Lp, real *E_Ov, real *E_Un)
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        static_storage p_workspace, simulation_data *data,
+        list p_bonds, int N, int num_atom_types, 
+        real *E_Lp, real *E_Ov, real *E_Un)
 {
-	int i, j, pj, type_i, type_j;
-	real Delta_lpcorr, dfvl;
-	real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-	real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-	real e_ov, CEover1, CEover2, CEover3, CEover4;
-	real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
-	real exp_ovun2n, exp_ovun6, exp_ovun8;
-	real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
-	real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-	real p_lp1, p_lp2, p_lp3;
-	real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_data *pbond;
-	bond_order_data *bo_ij; 
-	list *bonds = &p_bonds;
-	static_storage *workspace = &p_workspace;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	/* Initialize parameters */
-	p_lp1 = g_params.l[15];
-	p_lp3 = g_params.l[5];
-	p_ovun3 = g_params.l[32];
-	p_ovun4 = g_params.l[31];
-	p_ovun6 = g_params.l[6];
-	p_ovun7 = g_params.l[8];
-	p_ovun8 = g_params.l[9];
-
-	/*
-	   if (i < N) {
-	// set the parameter pointer 
-	type_i = atoms[i].type;
-	sbp_i = &(sbp[ type_i ]);
-
-	// lone-pair Energy 
-	p_lp2 = sbp_i->p_lp2;      
-	expvd2 = EXP( -75 * workspace->Delta_lp[i] );
-	inv_expvd2 = 1. / (1. + expvd2 );
-
-	// calculate the energy 
-	e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-	//atomicAdd (&data->E_Lp, e_lp );
-	E_Lp [ i ] = e_lp;
-
-	dElp = p_lp2 * inv_expvd2 + 
-	75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
-	CElp = dElp * workspace->dDelta_lp[i];
-
-	workspace->CdDelta[i] += CElp;      // lp - 1st term
-
-	// correction for C2 
-	if( g_params.l[5] > 0.001 && 
-	!cuda_strcmp( sbp[type_i].name, "C" , 15) )
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-	if( i < bonds->select.bond_list[pj].nbr ) {
-	j = bonds->select.bond_list[pj].nbr;
-	type_j = atoms[j].type;
-
-	if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
-	twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
-	bo_ij = &( bonds->select.bond_list[pj].bo_data );
-	Di = workspace->Delta[i];
-	vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
-
-	if( vov3 > 3. ) {
-
-	e_lph = p_lp3 * SQR(vov3-3.0);
-	E_Lp [i] += e_lph;
-	//atomicAdd (&data->E_Lp, e_lph );
-	//estrain(i) += e_lph;
-
-	deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-	deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
-
-	bo_ij->Cdbo += deahu2dbo;
-
-	workspace->CdDelta[i] += deahu2dsbo;
-	}
-	}
-	}
-	} // end of if statement for the all the threads
-
-	__syncthreads ();
-
-	if (i >= N) return;
-
-	 */
-
-	type_i = atoms[i].type;
-	sbp_i = &(sbp[ type_i ]);
-
-	// over-coordination energy 
-	if( sbp_i->mass > 21.0 ) 
-		dfvl = 0.0;
-	else dfvl = 1.0; // only for 1st-row elements
-
-	p_ovun2 = sbp_i->p_ovun2;
-	sum_ovun1 = 0;
-	sum_ovun2 = 0;
-
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
-		j = bonds->select.bond_list[pj].nbr;
-		type_j = atoms[j].type;	  
-		bo_ij = &(bonds->select.bond_list[pj].bo_data);
-		sbp_j = &(sbp[ type_j ]);
-		twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
-
-		sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
-		sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
-			( bo_ij->BO_pi + bo_ij->BO_pi2 );
-	}
-
-
-	exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
-	inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
-	Delta_lpcorr  = workspace->Delta[i] - 
-		(dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
+    int i, j, pj, type_i, type_j;
+    real Delta_lpcorr, dfvl;
+    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+    real exp_ovun2n, exp_ovun6, exp_ovun8;
+    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+    real p_lp1, p_lp2, p_lp3;
+    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
+
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond;
+    bond_order_data *bo_ij; 
+    list *bonds = &p_bonds;
+    static_storage *workspace = &p_workspace;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    /* Initialize parameters */
+    p_lp1 = g_params.l[15];
+    p_lp3 = g_params.l[5];
+    p_ovun3 = g_params.l[32];
+    p_ovun4 = g_params.l[31];
+    p_ovun6 = g_params.l[6];
+    p_ovun7 = g_params.l[8];
+    p_ovun8 = g_params.l[9];
+
+    /*
+       if (i < N) {
+    // set the parameter pointer 
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[ type_i ]);
+
+    // lone-pair Energy 
+    p_lp2 = sbp_i->p_lp2;      
+    expvd2 = EXP( -75 * workspace->Delta_lp[i] );
+    inv_expvd2 = 1. / (1. + expvd2 );
+
+    // calculate the energy 
+    e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+    //atomicAdd (&data->E_Lp, e_lp );
+    E_Lp [ i ] = e_lp;
+
+    dElp = p_lp2 * inv_expvd2 + 
+    75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+    CElp = dElp * workspace->dDelta_lp[i];
+
+    workspace->CdDelta[i] += CElp;      // lp - 1st term
+
+    // correction for C2 
+    if( g_params.l[5] > 0.001 && 
+    !cuda_strcmp( sbp[type_i].name, "C" , 15) )
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+    if( i < bonds->select.bond_list[pj].nbr ) {
+    j = bonds->select.bond_list[pj].nbr;
+    type_j = atoms[j].type;
+
+    if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
+    twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+    bo_ij = &( bonds->select.bond_list[pj].bo_data );
+    Di = workspace->Delta[i];
+    vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+
+    if( vov3 > 3. ) {
+
+    e_lph = p_lp3 * SQR(vov3-3.0);
+    E_Lp [i] += e_lph;
+    //atomicAdd (&data->E_Lp, e_lph );
+    //estrain(i) += e_lph;
+
+    deahu2dbo = 2.*p_lp3*(vov3 - 3.);
+    deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+
+    bo_ij->Cdbo += deahu2dbo;
+
+    workspace->CdDelta[i] += deahu2dsbo;
+    }
+    }
+    }
+    } // end of if statement for all the threads
+
+    __syncthreads ();
+
+    if (i >= N) return;
+
+     */
+
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[ type_i ]);
+
+    // over-coordination energy 
+    if( sbp_i->mass > 21.0 ) 
+        dfvl = 0.0;
+    else dfvl = 1.0; // only for 1st-row elements
+
+    p_ovun2 = sbp_i->p_ovun2;
+    sum_ovun1 = 0;
+    sum_ovun2 = 0;
+
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
+        j = bonds->select.bond_list[pj].nbr;
+        type_j = atoms[j].type;      
+        bo_ij = &(bonds->select.bond_list[pj].bo_data);
+        sbp_j = &(sbp[ type_j ]);
+        twbp = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+
+        sum_ovun1 += twbp->p_ovun1 * twbp->De_s * bo_ij->BO;
+        sum_ovun2 += (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j])*
+            ( bo_ij->BO_pi + bo_ij->BO_pi2 );
+    }
+
+
+    exp_ovun1 = p_ovun3 * EXP( p_ovun4 * sum_ovun2 );
+    inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
+    Delta_lpcorr  = workspace->Delta[i] - 
+        (dfvl*workspace->Delta_lp_temp[i]) * inv_exp_ovun1;
 
-	exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
-	inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
-
-	DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
-	CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
+    exp_ovun2 = EXP( p_ovun2 * Delta_lpcorr );
+    inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
+
+    DlpVi = 1.0 / (Delta_lpcorr + sbp_i->valency + 1e-8 );
+    CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
 
-	e_ov = sum_ovun1 * CEover1;
-	E_Ov [ i ] = e_ov;
-	//atomicAdd ( &data->E_Ov, e_ov );
+    e_ov = sum_ovun1 * CEover1;
+    E_Ov [ i ] = e_ov;
+    //atomicAdd ( &data->E_Ov, e_ov );
 
-	CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
-		( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
+    CEover2 = sum_ovun1 * DlpVi * inv_exp_ovun2 *
+        ( 1.0 - Delta_lpcorr*( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ) );
 
-	CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
+    CEover3 = CEover2 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1 );
 
-	CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
-		p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
+    CEover4 = CEover2 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
 
-	// under-coordination potential 
-	p_ovun2 = sbp_i->p_ovun2;
-	p_ovun5 = sbp_i->p_ovun5;
+    // under-coordination potential 
+    p_ovun2 = sbp_i->p_ovun2;
+    p_ovun5 = sbp_i->p_ovun5;
 
-	exp_ovun2n = 1.0 / exp_ovun2;
-	exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
-	exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
-	inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
-	inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
+    exp_ovun2n = 1.0 / exp_ovun2;
+    exp_ovun6 = EXP( p_ovun6 * Delta_lpcorr );
+    exp_ovun8 = p_ovun7 * EXP(p_ovun8 * sum_ovun2);
+    inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
+    inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
-	e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
-	E_Un [i] = e_un;
-	//atomicAdd ( &data->E_Un, e_un );
+    e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
+    E_Un [i] = e_un;
+    //atomicAdd ( &data->E_Un, e_un );
 
-	CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
-			p_ovun2 * e_un * exp_ovun2n);
-	CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
-	CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
-	CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
-		p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
+    CEunder1 = inv_exp_ovun2n * ( p_ovun5*p_ovun6*exp_ovun6*inv_exp_ovun8 +
+            p_ovun2 * e_un * exp_ovun2n);
+    CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
+    CEunder3 = CEunder1 * (1.0 - dfvl*workspace->dDelta_lp[i]*inv_exp_ovun1);
+    CEunder4 = CEunder1 * (dfvl*workspace->Delta_lp_temp[i]) * 
+        p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1) + CEunder2;
 
-	// forces 
-	workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
-	workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
+    // forces 
+    workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
+    workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
 
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
-		pbond = &(bonds->select.bond_list[pj]);
-		j = pbond->nbr;
-		type_j = atoms[j].type;
-		bo_ij = &(pbond->bo_data);
-		twbp  = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+        pbond = &(bonds->select.bond_list[pj]);
+        j = pbond->nbr;
+        type_j = atoms[j].type;
+        bo_ij = &(pbond->bo_data);
+        twbp  = &(tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
 
-		bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
+        bo_ij->Cdbo += CEover1 * twbp->p_ovun1 * twbp->De_s; // OvCoor - 1st  
 
-		//workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
-		pbond->scratch += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
+        //workspace->CdDelta[j] += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
+        pbond->scratch += CEover4*(1.0 - dfvl*workspace->dDelta_lp[j])* (bo_ij->BO_pi + bo_ij->BO_pi2); // OvCoor - 3a
 
-		bo_ij->Cdbopi += CEover4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
-		bo_ij->Cdbopi2 += CEover4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+        bo_ij->Cdbopi += CEover4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
+        bo_ij->Cdbopi2 += CEover4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//OvCoor-3b
 
-		//workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ;   // UnCoor - 2a
-		pbond->scratch += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ;   // UnCoor - 2a
+        //workspace->CdDelta[j] += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ;   // UnCoor - 2a
+        pbond->scratch += CEunder4*(1.0-dfvl*workspace->dDelta_lp[j]) * (bo_ij->BO_pi + bo_ij->BO_pi2) ;   // UnCoor - 2a
 
-		bo_ij->Cdbopi += CEunder4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
-		bo_ij->Cdbopi2 += CEunder4 * 
-			(workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+        bo_ij->Cdbopi += CEunder4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
+        bo_ij->Cdbopi2 += CEunder4 * 
+            (workspace->Delta[j] - dfvl*workspace->Delta_lp_temp[j]);//UnCoor-2b
 
-	}
+    }
 }
 
 ///////////////////////////////////////////////////////////
 GLOBAL void test_LonePair_OverUnder_Coordination_Energy_LP ( reax_atom *atoms, global_parameters g_params, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		static_storage p_workspace, simulation_data *data,
-		list p_bonds, int N, int num_atom_types, 
-		real *E_Lp, real *E_Ov, real *E_Un)
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        static_storage p_workspace, simulation_data *data,
+        list p_bonds, int N, int num_atom_types, 
+        real *E_Lp, real *E_Ov, real *E_Un)
 {
-	int i, j, pj, type_i, type_j;
-	real Delta_lpcorr, dfvl;
-	real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
-	real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
-	real e_ov, CEover1, CEover2, CEover3, CEover4;
-	real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
-	real exp_ovun2n, exp_ovun6, exp_ovun8;
-	real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
-	real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
-	real p_lp1, p_lp2, p_lp3;
-	real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
-
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_data *pbond;
-	bond_order_data *bo_ij; 
-	list *bonds = &p_bonds;
-	static_storage *workspace = &p_workspace;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	/* Initialize parameters */
-	p_lp1 = g_params.l[15];
-	p_lp3 = g_params.l[5];
-	p_ovun3 = g_params.l[32];
-	p_ovun4 = g_params.l[31];
-	p_ovun6 = g_params.l[6];
-	p_ovun7 = g_params.l[8];
-	p_ovun8 = g_params.l[9];
-
-	// set the parameter pointer 
-	type_i = atoms[i].type;
-	sbp_i = &(sbp[ type_i ]);
-
-	// lone-pair Energy 
-	p_lp2 = sbp_i->p_lp2;      
-	expvd2 = EXP( -75 * workspace->Delta_lp[i] );
-	inv_expvd2 = 1. / (1. + expvd2 );
-
-	// calculate the energy 
-	e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
-	//atomicAdd (&data->E_Lp, e_lp );
-	E_Lp [ i ] = e_lp;
-
-	dElp = p_lp2 * inv_expvd2 + 
-		75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
-	CElp = dElp * workspace->dDelta_lp[i];
-
-	workspace->CdDelta[i] += CElp;      // lp - 1st term
-
-	// correction for C2 
-	if( g_params.l[5] > 0.001 && 
-			!cuda_strcmp( sbp[type_i].name, "C" , 15) )
-		for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
-			if( i < bonds->select.bond_list[pj].nbr ) {
-				j = bonds->select.bond_list[pj].nbr;
-				type_j = atoms[j].type;
-
-				if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
-					twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
-					bo_ij = &( bonds->select.bond_list[pj].bo_data );
-					Di = workspace->Delta[i];
-					vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
-
-					if( vov3 > 3. ) {
-
-						e_lph = p_lp3 * SQR(vov3-3.0);
-						E_Lp [i] += e_lph;
-						//atomicAdd (&data->E_Lp, e_lph );
-						//estrain(i) += e_lph;
-
-						deahu2dbo = 2.*p_lp3*(vov3 - 3.);
-						deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
-
-						bo_ij->Cdbo += deahu2dbo;
-
-						workspace->CdDelta[i] += deahu2dsbo;
-					}
-				}
-			}
+    int i, j, pj, type_i, type_j;
+    real Delta_lpcorr, dfvl;
+    real e_lp, expvd2, inv_expvd2, dElp, CElp, DlpVi;
+    real e_lph, Di, vov3, deahu2dbo, deahu2dsbo;
+    real e_ov, CEover1, CEover2, CEover3, CEover4;
+    real exp_ovun1, exp_ovun2, sum_ovun1, sum_ovun2;
+    real exp_ovun2n, exp_ovun6, exp_ovun8;
+    real inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8;
+    real e_un, CEunder1, CEunder2, CEunder3, CEunder4;
+    real p_lp1, p_lp2, p_lp3;
+    real p_ovun2, p_ovun3, p_ovun4, p_ovun5, p_ovun6, p_ovun7, p_ovun8;
+
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond;
+    bond_order_data *bo_ij; 
+    list *bonds = &p_bonds;
+    static_storage *workspace = &p_workspace;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    /* Initialize parameters */
+    p_lp1 = g_params.l[15];
+    p_lp3 = g_params.l[5];
+    p_ovun3 = g_params.l[32];
+    p_ovun4 = g_params.l[31];
+    p_ovun6 = g_params.l[6];
+    p_ovun7 = g_params.l[8];
+    p_ovun8 = g_params.l[9];
+
+    // set the parameter pointer 
+    type_i = atoms[i].type;
+    sbp_i = &(sbp[ type_i ]);
+
+    // lone-pair Energy 
+    p_lp2 = sbp_i->p_lp2;      
+    expvd2 = EXP( -75 * workspace->Delta_lp[i] );
+    inv_expvd2 = 1. / (1. + expvd2 );
+
+    // calculate the energy 
+    e_lp = p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
+    //atomicAdd (&data->E_Lp, e_lp );
+    E_Lp [ i ] = e_lp;
+
+    dElp = p_lp2 * inv_expvd2 + 
+        75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
+    CElp = dElp * workspace->dDelta_lp[i];
+
+    workspace->CdDelta[i] += CElp;      // lp - 1st term
+
+    // correction for C2 
+    if( g_params.l[5] > 0.001 && 
+            !cuda_strcmp( sbp[type_i].name, "C" , 15) )
+        for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
+            if( i < bonds->select.bond_list[pj].nbr ) {
+                j = bonds->select.bond_list[pj].nbr;
+                type_j = atoms[j].type;
+
+                if( !cuda_strcmp( sbp[type_j].name, "C", 15 ) ) {
+                    twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ]);
+                    bo_ij = &( bonds->select.bond_list[pj].bo_data );
+                    Di = workspace->Delta[i];
+                    vov3 = bo_ij->BO - Di - 0.040*POW(Di, 4.);
+
+                    if( vov3 > 3. ) {
+
+                        e_lph = p_lp3 * SQR(vov3-3.0);
+                        E_Lp [i] += e_lph;
+                        //atomicAdd (&data->E_Lp, e_lph );
+                        //estrain(i) += e_lph;
+
+                        deahu2dbo = 2.*p_lp3*(vov3 - 3.);
+                        deahu2dsbo = 2.*p_lp3*(vov3 - 3.)*(-1. - 0.16*POW(Di, 3.));
+
+                        bo_ij->Cdbo += deahu2dbo;
+
+                        workspace->CdDelta[i] += deahu2dsbo;
+                    }
+                }
+            }
 }
 ///////////////////////////////////////////////////////////
 
 GLOBAL void test_LonePair_Postprocess ( reax_atom *atoms, global_parameters g_params, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		static_storage p_workspace, simulation_data *data,
-		list p_bonds, int N, int num_atom_types )
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        static_storage p_workspace, simulation_data *data,
+        list p_bonds, int N, int num_atom_types )
 {
-	int i, j, pj, type_i, type_j;
+    int i, j, pj, type_i, type_j;
 
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_data *pbond, *sbond;
-	bond_data *dbond_index_bond, *sym_index_bond;
-	bond_order_data *bo_ij; 
-	list *bonds = &p_bonds;
-	static_storage *workspace = &p_workspace;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_data *pbond, *sbond;
+    bond_data *dbond_index_bond, *sym_index_bond;
+    bond_order_data *bo_ij; 
+    list *bonds = &p_bonds;
+    static_storage *workspace = &p_workspace;
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
+    i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if ( i >= N) return;
+    if ( i >= N) return;
 
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
 
-		/*
-		   pbond = &(bonds->select.bond_list[pj]);
-		   dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
-		   workspace->CdDelta [i] += dbond_index_bond->scratch;
-		 */
+        /*
+           pbond = &(bonds->select.bond_list[pj]);
+           dbond_index_bond = &( bonds->select.bond_list[ pbond->dbond_index ] );
+           workspace->CdDelta [i] += dbond_index_bond->scratch;
+         */
 
-		sbond = &(bonds->select.bond_list [pj]);
-		sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]);
-		workspace->CdDelta [i] += sym_index_bond->scratch;
-	}
+        sbond = &(bonds->select.bond_list [pj]);
+        sym_index_bond = &( bonds->select.bond_list[ sbond->sym_index ]);
+        workspace->CdDelta [i] += sym_index_bond->scratch;
+    }
 
 }
diff --git a/PuReMD-GPU/src/system_props.cu b/PuReMD-GPU/src/system_props.cu
index 9de96916..3ec39134 100644
--- a/PuReMD-GPU/src/system_props.cu
+++ b/PuReMD-GPU/src/system_props.cu
@@ -31,460 +31,460 @@
 
 real Get_Time( )
 {
-	struct timeval tim;
+    struct timeval tim;
 
-	gettimeofday(&tim, NULL );
-	return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
+    gettimeofday(&tim, NULL );
+    return( tim.tv_sec + (tim.tv_usec / 1000000.0) );
 }
 
 
 real Get_Timing_Info( real t_start )
 {
-	struct timeval tim;
-	real t_end;
+    struct timeval tim;
+    real t_end;
 
-	gettimeofday(&tim, NULL );
-	t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
-	return (t_end - t_start);
+    gettimeofday(&tim, NULL );
+    t_end = tim.tv_sec + (tim.tv_usec / 1000000.0);
+    return (t_end - t_start);
 }
 
 
 void Temperature_Control( control_params *control, simulation_data *data, 
-		output_controls *out_control )
+        output_controls *out_control )
 {
-	real tmp;
-
-	if( control->T_mode == 1 ) { // step-wise temperature control
-		if( (data->step - data->prev_steps) % 
-				((int)(control->T_freq / control->dt)) == 0 ) {
-			if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
-				control->T += control->T_rate;
-			else control->T = control->T_final;	 
-		}
-	}
-	else if( control->T_mode == 2 ) { // constant slope control
-		tmp = control->T_rate * control->dt / control->T_freq;
-
-		if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
-			control->T += tmp;       
-	}
+    real tmp;
+
+    if( control->T_mode == 1 ) { // step-wise temperature control
+        if( (data->step - data->prev_steps) % 
+                ((int)(control->T_freq / control->dt)) == 0 ) {
+            if( fabs( control->T - control->T_final ) >= fabs( control->T_rate ) )
+                control->T += control->T_rate;
+            else control->T = control->T_final;     
+        }
+    }
+    else if( control->T_mode == 2 ) { // constant slope control
+        tmp = control->T_rate * control->dt / control->T_freq;
+
+        if( fabs( control->T - control->T_final ) >= fabs( tmp ) )
+            control->T += tmp;       
+    }
 }
 
 void prep_dev_system (reax_system *system) 
 {
-	//copy the system atoms to the device
-	Sync_Host_Device ( system, cudaMemcpyHostToDevice );
+    //copy the system atoms to the device
+    Sync_Host_Device ( system, cudaMemcpyHostToDevice );
 }
 
 
 void Compute_Total_Mass( reax_system *system, simulation_data *data )
 {
-	int i;
-	int blocks;
-	int block_size;
-	real	*partial_sums = 0;
+    int i;
+    int blocks;
+    int block_size;
+    real    *partial_sums = 0;
 
-	data->M = 0;
+    data->M = 0;
 
-	for( i = 0; i < system->N; i++ ) 
-		data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
+    for( i = 0; i < system->N; i++ ) 
+        data->M += system->reaxprm.sbp[ system->atoms[i].type ].mass;  
 
-	data->inv_M = 1. / data->M;    
+    data->inv_M = 1. / data->M;    
 }
 
 void Cuda_Compute_Total_Mass( reax_system *system, simulation_data *data )
 {
-	real	*partial_sums = (real *) scratch;
-	//data->M = 0;
-
-	//cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0);
-	cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH );
-
-	Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
-		(system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
-		(partial_sums, partial_sums + BLOCKS_POW_2, BLOCKS_POW_2);
-	//(partial_sums, &((simulation_data *)data->d_simulation_data)->M, BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//#ifdef __BUILD_DEBUG__
-	//	validate_data ( system, data );
-	//#endif
-
-	//copy_host_device (&data->M, &((simulation_data *)data->d_simulation_data)->M, 
-	//#ifdef __BUILD_DEBUG__
-	//	t_data_M = data->M;
-	//#endif
-	copy_host_device (&data->M, partial_sums + BLOCKS_POW_2, 
-			REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-	//#ifdef __BUILD_DEBUG__
-	//	if (check_zero (t_data, data->M))
-	//	{
-	//		fprintf (stderr, "SimulationData:M does not match on host and device (%f %f) \n", t_data, data->M );
-	//		exit (0);
-	//	}
-	//#endif
-	data->inv_M = 1. / data->M;    
+    real    *partial_sums = (real *) scratch;
+    //data->M = 0;
+
+    //cuda_malloc ((void **)&partial_sums, sizeof (real) * (blocks + 1), 1, 0);
+    cuda_memset (partial_sums, 0, REAL_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH );
+
+    Compute_Total_Mass <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE >>> 
+        (system->reaxprm.d_sbp, system->d_atoms, partial_sums, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    Cuda_reduction <<<1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2>>> 
+        (partial_sums, partial_sums + BLOCKS_POW_2, BLOCKS_POW_2);
+    //(partial_sums, &((simulation_data *)data->d_simulation_data)->M, BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //#ifdef __BUILD_DEBUG__
+    //    validate_data ( system, data );
+    //#endif
+
+    //copy_host_device (&data->M, &((simulation_data *)data->d_simulation_data)->M, 
+    //#ifdef __BUILD_DEBUG__
+    //    t_data_M = data->M;
+    //#endif
+    copy_host_device (&data->M, partial_sums + BLOCKS_POW_2, 
+            REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+
+    //#ifdef __BUILD_DEBUG__
+    //    if (check_zero (t_data, data->M))
+    //    {
+    //        fprintf (stderr, "SimulationData:M does not match on host and device (%f %f) \n", t_data, data->M );
+    //        exit (0);
+    //    }
+    //#endif
+    data->inv_M = 1. / data->M;    
 }
 
 
 GLOBAL void Compute_Total_Mass (single_body_parameters *sbp, reax_atom *atoms, real *per_block_results, size_t n) 
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0; 
-
-	if(i < n) 
-		x = sbp [ atoms[ i ].type ].mass;
-
-	sdata[threadIdx.x] = x; 
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) 
-	{  
-		if(threadIdx.x < offset)
-		{  
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}  
-		__syncthreads();
-	}  
-
-	if(threadIdx.x == 0) 
-	{  
-		per_block_results[blockIdx.x] = sdata[0];
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0; 
+
+    if(i < n) 
+        x = sbp [ atoms[ i ].type ].mass;
+
+    sdata[threadIdx.x] = x; 
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) 
+    {  
+        if(threadIdx.x < offset)
+        {  
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }  
+        __syncthreads();
+    }  
+
+    if(threadIdx.x == 0) 
+    {  
+        per_block_results[blockIdx.x] = sdata[0];
+    }
 }
 
 
 void Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
-		FILE *fout )
+        FILE *fout )
 {
-	int i;
-	real m, xx, xy, xz, yy, yz, zz, det;
-	rvec tvec, diff;
-	rtensor mat, inv;
-
-	int blocks;
-	int block_size;
-	rvec *l_xcm, *l_vcm, *l_amcm;
-	real t_start, t_end;
-
-	rvec_MakeZero( data->xcm );  // position of CoM
-	rvec_MakeZero( data->vcm );  // velocity of CoM
-	rvec_MakeZero( data->amcm ); // angular momentum of CoM
-	rvec_MakeZero( data->avcm ); // angular velocity of CoM
-
-	/* Compute the position, velocity and angular momentum about the CoM */
-	for( i = 0; i < system->N; ++i ) {
-		m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
-
-		rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
-		rvec_ScaledAdd( data->vcm, m, system->atoms[i].v );
-
-		rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
-		rvec_ScaledAdd( data->amcm, m, tvec );
-
-		/*fprintf( fout,"%3d  %g %g %g\n",
-		  i+1, 
-		  system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
-		  fprintf( fout, "vcm:  %g %g %g\n", 
-		  data->vcm[0], data->vcm[1], data->vcm[2] );  
-		 */
-	}
-
-	rvec_Scale( data->xcm, data->inv_M, data->xcm );
-	rvec_Scale( data->vcm, data->inv_M, data->vcm );
-
-	rvec_Cross( tvec, data->xcm, data->vcm );
-	rvec_ScaledAdd( data->amcm, -data->M, tvec );
-
-	data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-
-	/* Calculate and then invert the inertial tensor */
-	xx = xy = xz = yy = yz = zz = 0;
-
-	for( i = 0; i < system->N; ++i ) {
-		m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
-
-		rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
-		xx += diff[0] * diff[0] * m;
-		xy += diff[0] * diff[1] * m;
-		xz += diff[0] * diff[2] * m;
-		yy += diff[1] * diff[1] * m;
-		yz += diff[1] * diff[2] * m;
-		zz += diff[2] * diff[2] * m;      
-	}
+    int i;
+    real m, xx, xy, xz, yy, yz, zz, det;
+    rvec tvec, diff;
+    rtensor mat, inv;
+
+    int blocks;
+    int block_size;
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    real t_start, t_end;
+
+    rvec_MakeZero( data->xcm );  // position of CoM
+    rvec_MakeZero( data->vcm );  // velocity of CoM
+    rvec_MakeZero( data->amcm ); // angular momentum of CoM
+    rvec_MakeZero( data->avcm ); // angular velocity of CoM
+
+    /* Compute the position, velocity and angular momentum about the CoM */
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+
+        rvec_ScaledAdd( data->xcm, m, system->atoms[i].x );
+        rvec_ScaledAdd( data->vcm, m, system->atoms[i].v );
+
+        rvec_Cross( tvec, system->atoms[i].x, system->atoms[i].v );
+        rvec_ScaledAdd( data->amcm, m, tvec );
+
+        /*fprintf( fout,"%3d  %g %g %g\n",
+          i+1, 
+          system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2]  );
+          fprintf( fout, "vcm:  %g %g %g\n", 
+          data->vcm[0], data->vcm[1], data->vcm[2] );  
+         */
+    }
+
+    rvec_Scale( data->xcm, data->inv_M, data->xcm );
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec );
+
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+
+    /* Calculate and then invert the inertial tensor */
+    xx = xy = xz = yy = yz = zz = 0;
+
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+
+        rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
+        xx += diff[0] * diff[0] * m;
+        xy += diff[0] * diff[1] * m;
+        xz += diff[0] * diff[2] * m;
+        yy += diff[1] * diff[1] * m;
+        yz += diff[1] * diff[2] * m;
+        zz += diff[2] * diff[2] * m;      
+    }
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " xx: %f \n", xx);
-	fprintf (stderr, " xy: %f \n", xy);
-	fprintf (stderr, " xz: %f \n", xz);
-	fprintf (stderr, " yy: %f \n", yy);
-	fprintf (stderr, " yz: %f \n", yz);
-	fprintf (stderr, " zz: %f \n", zz);
+    fprintf (stderr, " xx: %f \n", xx);
+    fprintf (stderr, " xy: %f \n", xy);
+    fprintf (stderr, " xz: %f \n", xz);
+    fprintf (stderr, " yy: %f \n", yy);
+    fprintf (stderr, " yz: %f \n", yz);
+    fprintf (stderr, " zz: %f \n", zz);
 #endif
 
-	mat[0][0] = yy + zz;     
-	mat[0][1] = mat[1][0] = -xy;
-	mat[0][2] = mat[2][0] = -xz;
-	mat[1][1] = xx + zz;
-	mat[2][1] = mat[1][2] = -yz;
-	mat[2][2] = xx + yy;
-
-	/* invert the inertial tensor */
-	det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
-			mat[0][1] * mat[1][2] * mat[2][0] + 
-			mat[0][2] * mat[1][0] * mat[2][1] ) -
-		( mat[0][0] * mat[1][2] * mat[2][1] + 
-		  mat[0][1] * mat[1][0] * mat[2][2] + 
-		  mat[0][2] * mat[1][1] * mat[2][0] );
-
-	inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-	inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-	inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-	inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-	inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-	inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-	inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-	inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-	inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-
-	if( fabs(det) > ALMOST_ZERO )
-		rtensor_Scale( inv, 1./det, inv );
-	else 
-		rtensor_MakeZero( inv );
-
-	/* Compute the angular velocity about the centre of mass */
-	rtensor_MatVec( data->avcm, inv, data->amcm );  
-	data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+    mat[0][0] = yy + zz;     
+    mat[0][1] = mat[1][0] = -xy;
+    mat[0][2] = mat[2][0] = -xz;
+    mat[1][1] = xx + zz;
+    mat[2][1] = mat[1][2] = -yz;
+    mat[2][2] = xx + yy;
+
+    /* invert the inertial tensor */
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
+            mat[0][1] * mat[1][2] * mat[2][0] + 
+            mat[0][2] * mat[1][0] * mat[2][1] ) -
+        ( mat[0][0] * mat[1][2] * mat[2][1] + 
+          mat[0][1] * mat[1][0] * mat[2][2] + 
+          mat[0][2] * mat[1][1] * mat[2][0] );
+
+    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+    if( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1./det, inv );
+    else 
+        rtensor_MakeZero( inv );
+
+    /* Compute the angular velocity about the centre of mass */
+    rtensor_MatVec( data->avcm, inv, data->amcm );  
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
 
 #if defined(DEBUG)
-	fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
-			data->xcm[0], data->xcm[1], data->xcm[2] );
-	fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
-			data->vcm[0], data->vcm[1], data->vcm[2] );
-	fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
-			data->amcm[0], data->amcm[1], data->amcm[2] );
-	/* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-	   mat[0][0], mat[0][1], mat[0][2], 
-	   mat[1][0], mat[1][1], mat[1][2], 
-	   mat[2][0], mat[2][1], mat[2][2] );
-	   fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-	   inv[0][0], inv[0][1], inv[0][2], 
-	   inv[1][0], inv[1][1], inv[1][2], 
-	   inv[2][0], inv[2][1], inv[2][2] );
-	   fflush( fout ); */
-	fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
-			data->avcm[0], data->avcm[1], data->avcm[2] );
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
+            data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
+            data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
+            data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2], 
+       mat[1][0], mat[1][1], mat[1][2], 
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2], 
+       inv[1][0], inv[1][1], inv[1][2], 
+       inv[2][0], inv[2][1], inv[2][2] );
+       fflush( fout ); */
+    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
+            data->avcm[0], data->avcm[1], data->avcm[2] );
 #endif
 }
 
 
 void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data, 
-		FILE *fout )
+        FILE *fout )
 {
-	int i;
-	real m, xx, xy, xz, yy, yz, zz, det;
-	rvec tvec, diff;
-	rtensor mat, inv;
-
-	int blocks;
-	int block_size;
-	rvec *l_xcm, *l_vcm, *l_amcm;
-	real t_start, t_end;
-
-	rvec t_xcm, t_vcm, t_amcm;
-
-	rvec *r_scratch = (rvec *)scratch;
-
-	//rvec_MakeZero( data->xcm );  // position of CoM
-	//rvec_MakeZero( data->vcm );  // velocity of CoM
-	//rvec_MakeZero( data->amcm ); // angular momentum of CoM
-	//rvec_MakeZero( data->avcm ); // angular velocity of CoM
-
-	//cuda_malloc ((void **)&l_xcm, RVEC_SIZE * (blocks + 1), 1, 0);
-	//cuda_malloc ((void **)&l_vcm, RVEC_SIZE * (blocks + 1), 1, 0);
-	//cuda_malloc ((void **)&l_amcm, RVEC_SIZE * (blocks + 1), 1, 0);
-
-	cuda_memset ( scratch, 0, 3 * RVEC_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH );
-	l_xcm = r_scratch;
-	l_vcm = r_scratch + (BLOCKS_POW_2 + 1);
-	l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1);
-
-	center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> 
-		(system->reaxprm.d_sbp, system->d_atoms, l_xcm, l_vcm, l_amcm, system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	center_of_mass <<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> 
-		(l_xcm, l_vcm, l_amcm, 
-		 l_xcm + BLOCKS_POW_2, 
-		 l_vcm + BLOCKS_POW_2, 
-		 l_amcm + BLOCKS_POW_2, 
-		 BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
-
-	//#ifdef __BUILD_DEBUG
-	//	validate_data ( system, data );
-	//#endif
-
-	//#ifdef __BUILD_DEBUG__
-	//	rvec_MakeZero (t_xcm);
-	//	rvec_MakeZero (t_vcm);
-	//	rvec_MakeZero (t_amcm);
-	//
-	//	rvec_Copy (t_xcm, data->xcm);
-	//	rvec_Copy (t_vcm, data->vcm);
-	//	rvec_Copy (t_amcm, data->amcm);
-	//#endif
-
-	copy_host_device (data->xcm, l_xcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (data->vcm, l_vcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (data->amcm, l_amcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-	rvec_Scale( data->xcm, data->inv_M, data->xcm );
-	rvec_Scale( data->vcm, data->inv_M, data->vcm );
-
-	rvec_Cross( tvec, data->xcm, data->vcm );
-	rvec_ScaledAdd( data->amcm, -data->M, tvec );
-
-	//#ifdef __BUILD_DEBUG__
-	//	if (check_zero (t_xcm, data->xcm) || 
-	//		check_zero (t_vcm, data->vcm) ||
-	//		check_zero (t_amcm, data->amcm)){
-	//			fprintf (stderr, "SimulationData (xcm, vcm, amcm) does not match between device and host \n");
-	//			exit (0);
-	//		}
-	//#endif
-
-	data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
-
-	/* Calculate and then invert the inertial tensor */
-	xx = xy = xz = yy = yz = zz = 0;
+    int i;
+    real m, xx, xy, xz, yy, yz, zz, det;
+    rvec tvec, diff;
+    rtensor mat, inv;
+
+    int blocks;
+    int block_size;
+    rvec *l_xcm, *l_vcm, *l_amcm;
+    real t_start, t_end;
+
+    rvec t_xcm, t_vcm, t_amcm;
+
+    rvec *r_scratch = (rvec *)scratch;
+
+    //rvec_MakeZero( data->xcm );  // position of CoM
+    //rvec_MakeZero( data->vcm );  // velocity of CoM
+    //rvec_MakeZero( data->amcm ); // angular momentum of CoM
+    //rvec_MakeZero( data->avcm ); // angular velocity of CoM
+
+    //cuda_malloc ((void **)&l_xcm, RVEC_SIZE * (blocks + 1), 1, 0);
+    //cuda_malloc ((void **)&l_vcm, RVEC_SIZE * (blocks + 1), 1, 0);
+    //cuda_malloc ((void **)&l_amcm, RVEC_SIZE * (blocks + 1), 1, 0);
+
+    cuda_memset ( scratch, 0, 3 * RVEC_SIZE * (BLOCKS_POW_2 + 1), RES_SCRATCH );
+    l_xcm = r_scratch;
+    l_vcm = r_scratch + (BLOCKS_POW_2 + 1);
+    l_amcm = r_scratch + 2 * (BLOCKS_POW_2 + 1);
+
+    center_of_mass_blocks <<<BLOCKS_POW_2, BLOCK_SIZE, 3 * (RVEC_SIZE * BLOCK_SIZE) >>> 
+        (system->reaxprm.d_sbp, system->d_atoms, l_xcm, l_vcm, l_amcm, system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    center_of_mass <<<1, BLOCKS_POW_2, 3 * (RVEC_SIZE * BLOCKS_POW_2) >>> 
+        (l_xcm, l_vcm, l_amcm, 
+         l_xcm + BLOCKS_POW_2, 
+         l_vcm + BLOCKS_POW_2, 
+         l_amcm + BLOCKS_POW_2, 
+         BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
+
+    //#ifdef __BUILD_DEBUG
+    //    validate_data ( system, data );
+    //#endif
+
+    //#ifdef __BUILD_DEBUG__
+    //    rvec_MakeZero (t_xcm);
+    //    rvec_MakeZero (t_vcm);
+    //    rvec_MakeZero (t_amcm);
+    //
+    //    rvec_Copy (t_xcm, data->xcm);
+    //    rvec_Copy (t_vcm, data->vcm);
+    //    rvec_Copy (t_amcm, data->amcm);
+    //#endif
+
+    copy_host_device (data->xcm, l_xcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (data->vcm, l_vcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (data->amcm, l_amcm + BLOCKS_POW_2, RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+
+    rvec_Scale( data->xcm, data->inv_M, data->xcm );
+    rvec_Scale( data->vcm, data->inv_M, data->vcm );
+
+    rvec_Cross( tvec, data->xcm, data->vcm );
+    rvec_ScaledAdd( data->amcm, -data->M, tvec );
+
+    //#ifdef __BUILD_DEBUG__
+    //    if (check_zero (t_xcm, data->xcm) || 
+    //        check_zero (t_vcm, data->vcm) ||
+    //        check_zero (t_amcm, data->amcm)){
+    //            fprintf (stderr, "SimulationData (xcm, vcm, amcm) does not match between device and host \n");
+    //            exit (0);
+    //        }
+    //#endif
+
+    data->etran_cm = 0.5 * data->M * rvec_Norm_Sqr( data->vcm );
+
+    /* Calculate and then invert the inertial tensor */
+    xx = xy = xz = yy = yz = zz = 0;
 
 #ifdef __BUILD_DEBUG__
 
-	for( i = 0; i < system->N; ++i ) {
-		m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
+    for( i = 0; i < system->N; ++i ) {
+        m = system->reaxprm.sbp[ system->atoms[i].type ].mass;
 
-		rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
-		xx += diff[0] * diff[0] * m;
-		xy += diff[0] * diff[1] * m;
-		xz += diff[0] * diff[2] * m;
-		yy += diff[1] * diff[1] * m;
-		yz += diff[1] * diff[2] * m;
-		zz += diff[2] * diff[2] * m;      
-	}
+        rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
+        xx += diff[0] * diff[0] * m;
+        xy += diff[0] * diff[1] * m;
+        xz += diff[0] * diff[2] * m;
+        yy += diff[1] * diff[1] * m;
+        yz += diff[1] * diff[2] * m;
+        zz += diff[2] * diff[2] * m;      
+    }
 
 #endif
 
-	real *partial_results = (real *) scratch;
-	real *local_results;
+    real *partial_results = (real *) scratch;
+    real *local_results;
 
-	//cuda_malloc ((void **)&partial_results, 6 * sizeof (real) * (blocks + 1), 1, 0);
-	cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH );
-	local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1));
+    //cuda_malloc ((void **)&partial_results, 6 * sizeof (real) * (blocks + 1), 1, 0);
+    cuda_memset (partial_results, 0, REAL_SIZE * 6 * (BLOCKS_POW_2 + 1), RES_SCRATCH );
+    local_results = (real *) malloc (REAL_SIZE * 6 *(BLOCKS_POW_2+ 1));
 
-	compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> 
-		(system->reaxprm.d_sbp, system->d_atoms, partial_results, 
-		 data->xcm[0], data->xcm[1], data->xcm[2], system->N);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    compute_center_mass <<<BLOCKS_POW_2, BLOCK_SIZE, 6 * (REAL_SIZE * BLOCK_SIZE) >>> 
+        (system->reaxprm.d_sbp, system->d_atoms, partial_results, 
+         data->xcm[0], data->xcm[1], data->xcm[2], system->N);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	compute_center_mass <<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> 
-		(partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
-	cudaThreadSynchronize ();
-	cudaCheckError ();
+    compute_center_mass <<<1, BLOCKS_POW_2, 6 * (REAL_SIZE * BLOCKS_POW_2) >>> 
+        (partial_results, partial_results + (BLOCKS_POW_2 * 6), BLOCKS_POW_2);
+    cudaThreadSynchronize ();
+    cudaCheckError ();
 
-	copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (local_results, partial_results + 6 * BLOCKS_POW_2, REAL_SIZE * 6, cudaMemcpyDeviceToHost, __LINE__);
 
 #ifdef __BUILD_DEBUG__
-	if (check_zero (local_results[0],xx) ||
-			check_zero (local_results[1],xy) ||
-			check_zero (local_results[2],xz) ||
-			check_zero (local_results[3],yy) ||
-			check_zero (local_results[4],yz) ||
-			check_zero (local_results[5],zz) )
-	{
-		fprintf (stderr, " xx (%4.15f %4.15f) \n", xx, local_results[0]);
-		fprintf (stderr, " xy (%4.15f %4.15f) \n", xy, local_results[1]);
-		fprintf (stderr, " xz (%4.15f %4.15f) \n", xz, local_results[2]);
-		fprintf (stderr, " yy (%4.15f %4.15f) \n", yy, local_results[3]);
-		fprintf (stderr, " yz (%4.15f %4.15f) \n", yz, local_results[4]);
-		fprintf (stderr, " zz (%4.15f %4.15f) \n", zz, local_results[5]);
-		fprintf (stderr, " Failed to compute the center of mass \n");
-		exit (1);
-	}
+    if (check_zero (local_results[0],xx) ||
+            check_zero (local_results[1],xy) ||
+            check_zero (local_results[2],xz) ||
+            check_zero (local_results[3],yy) ||
+            check_zero (local_results[4],yz) ||
+            check_zero (local_results[5],zz) )
+    {
+        fprintf (stderr, " xx (%4.15f %4.15f) \n", xx, local_results[0]);
+        fprintf (stderr, " xy (%4.15f %4.15f) \n", xy, local_results[1]);
+        fprintf (stderr, " xz (%4.15f %4.15f) \n", xz, local_results[2]);
+        fprintf (stderr, " yy (%4.15f %4.15f) \n", yy, local_results[3]);
+        fprintf (stderr, " yz (%4.15f %4.15f) \n", yz, local_results[4]);
+        fprintf (stderr, " zz (%4.15f %4.15f) \n", zz, local_results[5]);
+        fprintf (stderr, " Failed to compute the center of mass \n");
+        exit (1);
+    }
 #endif
 
-	xx = local_results[0];
-	xy = local_results[1];
-	xz = local_results[2];
-	yy = local_results[3];
-	yz = local_results[4];
-	zz = local_results[5];
-
-	mat[0][0] = yy + zz;     
-	mat[0][1] = mat[1][0] = -xy;
-	mat[0][2] = mat[2][0] = -xz;
-	mat[1][1] = xx + zz;
-	mat[2][1] = mat[1][2] = -yz;
-	mat[2][2] = xx + yy;
-
-	/* invert the inertial tensor */
-	det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
-			mat[0][1] * mat[1][2] * mat[2][0] + 
-			mat[0][2] * mat[1][0] * mat[2][1] ) -
-		( mat[0][0] * mat[1][2] * mat[2][1] + 
-		  mat[0][1] * mat[1][0] * mat[2][2] + 
-		  mat[0][2] * mat[1][1] * mat[2][0] );
-
-	inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
-	inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
-	inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
-	inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
-	inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
-	inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
-	inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
-	inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
-	inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
-
-	if( fabs(det) > ALMOST_ZERO )
-		rtensor_Scale( inv, 1./det, inv );
-	else 
-		rtensor_MakeZero( inv );
-
-	/* Compute the angular velocity about the centre of mass */
-	rtensor_MatVec( data->avcm, inv, data->amcm );  
-	data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
-
-	//free the resources
-	free (local_results);
+    xx = local_results[0];
+    xy = local_results[1];
+    xz = local_results[2];
+    yy = local_results[3];
+    yz = local_results[4];
+    zz = local_results[5];
+
+    mat[0][0] = yy + zz;     
+    mat[0][1] = mat[1][0] = -xy;
+    mat[0][2] = mat[2][0] = -xz;
+    mat[1][1] = xx + zz;
+    mat[2][1] = mat[1][2] = -yz;
+    mat[2][2] = xx + yy;
+
+    /* invert the inertial tensor */
+    det = ( mat[0][0] * mat[1][1] * mat[2][2] + 
+            mat[0][1] * mat[1][2] * mat[2][0] + 
+            mat[0][2] * mat[1][0] * mat[2][1] ) -
+        ( mat[0][0] * mat[1][2] * mat[2][1] + 
+          mat[0][1] * mat[1][0] * mat[2][2] + 
+          mat[0][2] * mat[1][1] * mat[2][0] );
+
+    inv[0][0] = mat[1][1] * mat[2][2] - mat[1][2] * mat[2][1];
+    inv[0][1] = mat[0][2] * mat[2][1] - mat[0][1] * mat[2][2];
+    inv[0][2] = mat[0][1] * mat[1][2] - mat[0][2] * mat[1][1];
+    inv[1][0] = mat[1][2] * mat[2][0] - mat[1][0] * mat[2][2];
+    inv[1][1] = mat[0][0] * mat[2][2] - mat[0][2] * mat[2][0];
+    inv[1][2] = mat[0][2] * mat[1][0] - mat[0][0] * mat[1][2];
+    inv[2][0] = mat[1][0] * mat[2][1] - mat[2][0] * mat[1][1];
+    inv[2][1] = mat[2][0] * mat[0][1] - mat[0][0] * mat[2][1];
+    inv[2][2] = mat[0][0] * mat[1][1] - mat[1][0] * mat[0][1];
+
+    if( fabs(det) > ALMOST_ZERO )
+        rtensor_Scale( inv, 1./det, inv );
+    else 
+        rtensor_MakeZero( inv );
+
+    /* Compute the angular velocity about the centre of mass */
+    rtensor_MatVec( data->avcm, inv, data->amcm );  
+    data->erot_cm = 0.5 * E_CONV * rvec_Dot( data->avcm, data->amcm );
+
+    //free the resources
+    free (local_results);
 
 #if defined(DEBUG)
-	fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
-			data->xcm[0], data->xcm[1], data->xcm[2] );
-	fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
-			data->vcm[0], data->vcm[1], data->vcm[2] );
-	fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
-			data->amcm[0], data->amcm[1], data->amcm[2] );
-	/* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
-	   mat[0][0], mat[0][1], mat[0][2], 
-	   mat[1][0], mat[1][1], mat[1][2], 
-	   mat[2][0], mat[2][1], mat[2][2] );
-	   fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
-	   inv[0][0], inv[0][1], inv[0][2], 
-	   inv[1][0], inv[1][1], inv[1][2], 
-	   inv[2][0], inv[2][1], inv[2][2] );
-	   fflush( fout ); */
-	fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
-			data->avcm[0], data->avcm[1], data->avcm[2] );
+    fprintf( stderr, "xcm:  %24.15e %24.15e %24.15e\n",  
+            data->xcm[0], data->xcm[1], data->xcm[2] );
+    fprintf( stderr, "vcm:  %24.15e %24.15e %24.15e\n", 
+            data->vcm[0], data->vcm[1], data->vcm[2] );
+    fprintf( stderr, "amcm: %24.15e %24.15e %24.15e\n", 
+            data->amcm[0], data->amcm[1], data->amcm[2] );
+    /* fprintf( fout, "mat:  %f %f %f\n     %f %f %f\n     %f %f %f\n",
+       mat[0][0], mat[0][1], mat[0][2], 
+       mat[1][0], mat[1][1], mat[1][2], 
+       mat[2][0], mat[2][1], mat[2][2] );
+       fprintf( fout, "inv:  %g %g %g\n     %g %g %g\n     %g %g %g\n",
+       inv[0][0], inv[0][1], inv[0][2], 
+       inv[1][0], inv[1][1], inv[1][2], 
+       inv[2][0], inv[2][1], inv[2][2] );
+       fflush( fout ); */
+    fprintf( stderr, "avcm:  %24.15e %24.15e %24.15e\n", 
+            data->avcm[0], data->avcm[1], data->avcm[2] );
 #endif
 }
 
@@ -492,109 +492,109 @@ void Cuda_Compute_Center_of_Mass( reax_system *system, simulation_data *data,
 
 void Compute_Kinetic_Energy( reax_system* system, simulation_data* data )
 {
-	int i;
-	rvec p;
-	real m;
+    int i;
+    rvec p;
+    real m;
 
-	data->E_Kin = 0.0;
+    data->E_Kin = 0.0;
 
-	for (i=0; i < system->N; i++) {
-		m = system->reaxprm.sbp[system->atoms[i].type].mass;
+    for (i=0; i < system->N; i++) {
+        m = system->reaxprm.sbp[system->atoms[i].type].mass;
 
-		rvec_Scale( p, m, system->atoms[i].v );
-		data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v );
+        rvec_Scale( p, m, system->atoms[i].v );
+        data->E_Kin += 0.5 * rvec_Dot( p, system->atoms[i].v );
 
-		/* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n",
-		   i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
-		   system->reaxprm.sbp[system->atoms[i].type].mass); */
-	}
+        /* fprintf(stderr,"%d, %lf, %lf, %lf %lf\n",
+           i,system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2],
+           system->reaxprm.sbp[system->atoms[i].type].mass); */
+    }
 
-	data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+    data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
 
-	if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
-		data->therm.T = ALMOST_ZERO;
+    if ( fabs(data->therm.T) < ALMOST_ZERO ) /* avoid T being an absolute zero! */
+        data->therm.T = ALMOST_ZERO;
 }
 
 
 GLOBAL void Compute_Kinetic_Energy( single_body_parameters* sbp, reax_atom* atoms, 
-		unsigned int N, real *output)
+        unsigned int N, real *output)
 {
-	extern __shared__ real sh_ekin[];
-	unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
-	rvec p;
-	real m, tmp;
-
-	tmp = 0;
-	m = 0;
-	if (index < N) {
-		m = sbp[atoms[index].type].mass;
-		rvec_Scale( p, m, atoms[index].v );
-		tmp = 0.5 * rvec_Dot( p, atoms[index].v );
-	}
-	sh_ekin[threadIdx.x] = tmp;
-	__syncthreads ();
-
-	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
-		if (threadIdx.x < offset ) {
-			index = threadIdx.x + offset;
-			sh_ekin[threadIdx.x] += sh_ekin[ index ];
-		}
-		__syncthreads ();
-	}
-
-	if (threadIdx.x == 0) {
-		output [ blockIdx.x ] = sh_ekin [ 0 ];
-	}
+    extern __shared__ real sh_ekin[];
+    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
+    rvec p;
+    real m, tmp;
+
+    tmp = 0;
+    m = 0;
+    if (index < N) {
+        m = sbp[atoms[index].type].mass;
+        rvec_Scale( p, m, atoms[index].v );
+        tmp = 0.5 * rvec_Dot( p, atoms[index].v );
+    }
+    sh_ekin[threadIdx.x] = tmp;
+    __syncthreads ();
+
+    for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
+        if (threadIdx.x < offset ) {
+            index = threadIdx.x + offset;
+            sh_ekin[threadIdx.x] += sh_ekin[ index ];
+        }
+        __syncthreads ();
+    }
+
+    if (threadIdx.x == 0) {
+        output [ blockIdx.x ] = sh_ekin [ 0 ];
+    }
 }
 
 GLOBAL void Kinetic_Energy_Reduction (simulation_data *data,
-		real *input, int n)
+        real *input, int n)
 {
-	extern __shared__ real sdata[];
-	unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-	real x = 0;
-
-	if(i < n)
-	{
-		x = input[i];
-	}
-	sdata[threadIdx.x] = x;
-	__syncthreads();
-
-	for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
-	{
-		if(threadIdx.x < offset)
-		{   
-			sdata[threadIdx.x] += sdata[threadIdx.x + offset];
-		}   
-
-		__syncthreads();
-	}
-
-	if(threadIdx.x == 0)
-	{
-		//per_block_results[blockIdx.x] = sdata[0];
-		data->E_Kin = sdata[0];
-		data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
-
-		if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! 
-			data->therm.T = ALMOST_ZERO;
-	}
+    extern __shared__ real sdata[];
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    real x = 0;
+
+    if(i < n)
+    {
+        x = input[i];
+    }
+    sdata[threadIdx.x] = x;
+    __syncthreads();
+
+    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
+    {
+        if(threadIdx.x < offset)
+        {   
+            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
+        }   
+
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        //per_block_results[blockIdx.x] = sdata[0];
+        data->E_Kin = sdata[0];
+        data->therm.T = (2. * data->E_Kin) / (data->N_f * K_B);
+
+        if ( fabs(data->therm.T) < ALMOST_ZERO ) // avoid T being an absolute zero! 
+            data->therm.T = ALMOST_ZERO;
+    }
 }
 
 void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data)
 {
-	real *results = (real *) scratch;
-	cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
-	Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-		(system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results);
-	cudaThreadSynchronize (); 
-	cudaCheckError ();
-
-	Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
-		((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2);
-	cudaThreadSynchronize (); 
-	cudaCheckError ();
+    real *results = (real *) scratch;
+    cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
+    Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+        (system->reaxprm.d_sbp, system->d_atoms, system->N, (real *) results);
+    cudaThreadSynchronize (); 
+    cudaCheckError ();
+
+    Kinetic_Energy_Reduction <<< 1, BLOCKS_POW_2, REAL_SIZE * BLOCKS_POW_2 >>>
+        ((simulation_data *)data->d_simulation_data, results, BLOCKS_POW_2);
+    cudaThreadSynchronize (); 
+    cudaCheckError ();
 }
 
 /*
@@ -632,7 +632,7 @@ void Cuda_Compute_Kinetic_Energy (reax_system *system, simulation_data *data)
    __syncthreads ();
 
 //if ((blockIdx.x == 0) && (threadIdx.x < gridDim.x)) {
-//	sh_ekin [ threadIdx.x ] = output [ threadIdx.x ];
+//    sh_ekin [ threadIdx.x ] = output [ threadIdx.x ];
 //}
 //__syncthreads ();
 
@@ -669,108 +669,108 @@ data->therm.T = ALMOST_ZERO;
  *  We may want to add that for more accuracy. 
  */
 void Compute_Pressure_Isotropic( reax_system* system, control_params *control, 
-		simulation_data* data, 
-		output_controls *out_control )
+        simulation_data* data, 
+        output_controls *out_control )
 {
-	int i;
-	reax_atom *p_atom;
-	rvec tx;
-	rvec tmp;
-	simulation_box *box = &(system->box);
-
-	/* Calculate internal pressure */
-	rvec_MakeZero( data->int_press );
-
-	// 0: both int and ext, 1: ext only, 2: int only
-	if( control->press_mode == 0 || control->press_mode == 2 ) {
-		for( i = 0; i < system->N; ++i ) {
-			p_atom = &( system->atoms[i] );
-
-			/* transform x into unitbox coordinates */
-			Transform_to_UnitBox( p_atom->x, box, 1, tx );
-
-			/* this atom's contribution to internal pressure */
-			rvec_Multiply( tmp, p_atom->f, tx );
-			rvec_Add( data->int_press, tmp );
-
-			if( out_control->debug_level > 0 ) {
-				fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
-						i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
-				fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
-						p_atom->f[0], p_atom->f[1], p_atom->f[2] );
-				fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
-						data->int_press[0],data->int_press[1],data->int_press[2]);
-			}
-		}
-	}
-
-	/* kinetic contribution */
-	data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
-
-	/* Calculate total pressure in each direction */  
-	data->tot_press[0] = data->kin_press - 
-		((data->int_press[0] + data->ext_press[0]) /
-		 (box->box_norms[1] * box->box_norms[2] * P_CONV));
-
-	data->tot_press[1] = data->kin_press - 
-		((data->int_press[1] + data->ext_press[1])/
-		 (box->box_norms[0] * box->box_norms[2] * P_CONV));
-
-	data->tot_press[2] = data->kin_press - 
-		((data->int_press[2] + data->ext_press[2])/
-		 (box->box_norms[0] * box->box_norms[1] * P_CONV));
-
-	/* Average pressure for the whole box */
-	data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
+    int i;
+    reax_atom *p_atom;
+    rvec tx;
+    rvec tmp;
+    simulation_box *box = &(system->box);
+
+    /* Calculate internal pressure */
+    rvec_MakeZero( data->int_press );
+
+    // 0: both int and ext, 1: ext only, 2: int only
+    if( control->press_mode == 0 || control->press_mode == 2 ) {
+        for( i = 0; i < system->N; ++i ) {
+            p_atom = &( system->atoms[i] );
+
+            /* transform x into unitbox coordinates */
+            Transform_to_UnitBox( p_atom->x, box, 1, tx );
+
+            /* this atom's contribution to internal pressure */
+            rvec_Multiply( tmp, p_atom->f, tx );
+            rvec_Add( data->int_press, tmp );
+
+            if( out_control->debug_level > 0 ) {
+                fprintf( out_control->prs, "%-8d%8.2f%8.2f%8.2f", 
+                        i+1, p_atom->x[0], p_atom->x[1], p_atom->x[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f", 
+                        p_atom->f[0], p_atom->f[1], p_atom->f[2] );
+                fprintf( out_control->prs, "%8.2f%8.2f%8.2f\n", 
+                        data->int_press[0],data->int_press[1],data->int_press[2]);
+            }
+        }
+    }
+
+    /* kinetic contribution */
+    data->kin_press = 2. * (E_CONV * data->E_Kin) / ( 3. * box->volume * P_CONV );
+
+    /* Calculate total pressure in each direction */  
+    data->tot_press[0] = data->kin_press - 
+        ((data->int_press[0] + data->ext_press[0]) /
+         (box->box_norms[1] * box->box_norms[2] * P_CONV));
+
+    data->tot_press[1] = data->kin_press - 
+        ((data->int_press[1] + data->ext_press[1])/
+         (box->box_norms[0] * box->box_norms[2] * P_CONV));
+
+    data->tot_press[2] = data->kin_press - 
+        ((data->int_press[2] + data->ext_press[2])/
+         (box->box_norms[0] * box->box_norms[1] * P_CONV));
+
+    /* Average pressure for the whole box */
+    data->iso_bar.P=(data->tot_press[0]+data->tot_press[1]+data->tot_press[2])/3;
 }
 
 
 void Compute_Pressure_Isotropic_Klein( reax_system* system, 
-		simulation_data* data )
+        simulation_data* data )
 {
-	int i;
-	reax_atom *p_atom;
-	rvec dx;
-
-	// IMPORTANT: This function assumes that current kinetic energy and 
-	// the center of mass of the system is already computed before.
-	data->iso_bar.P = 2.0 * data->E_Kin;
-
-	for( i = 0; i < system->N; ++i )
-	{
-		p_atom = &( system->atoms[i] );
-		rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
-		data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
-	}
-
-	data->iso_bar.P /= (3.0 * system->box.volume);
-
-	// IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
-	// to be added when there are long-range interactions or long-range 
-	// corrections to short-range interactions present.
-	// We may want to add that for more accuracy.
+    int i;
+    reax_atom *p_atom;
+    rvec dx;
+
+    // IMPORTANT: This function assumes that current kinetic energy and 
+    // the center of mass of the system is already computed before.
+    data->iso_bar.P = 2.0 * data->E_Kin;
+
+    for( i = 0; i < system->N; ++i )
+    {
+        p_atom = &( system->atoms[i] );
+        rvec_ScaledSum(dx,1.0,p_atom->x,-1.0,data->xcm);
+        data->iso_bar.P += ( -F_CONV * rvec_Dot(p_atom->f, dx) );
+    }
+
+    data->iso_bar.P /= (3.0 * system->box.volume);
+
+    // IMPORTANT: In Klein's paper, it is stated that a dU/dV term needs 
+    // to be added when there are long-range interactions or long-range 
+    // corrections to short-range interactions present.
+    // We may want to add that for more accuracy.
 }
 
 
 void Compute_Pressure( reax_system* system, simulation_data* data, 
-		static_storage *workspace )
+        static_storage *workspace )
 {
-	int i;
-	reax_atom *p_atom;
-	rtensor temp;
-
-	rtensor_MakeZero( data->flex_bar.P );
-
-	for( i = 0; i < system->N; ++i ) {
-		p_atom = &( system->atoms[i] );
-		// Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
-		rvec_OuterProduct( temp, p_atom->v, p_atom->v );
-		rtensor_ScaledAdd( data->flex_bar.P, 
-				system->reaxprm.sbp[ p_atom->type ].mass, temp );
-		// rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
-		rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
-	}
-
-	rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P );
-	data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0;
+    int i;
+    reax_atom *p_atom;
+    rtensor temp;
+
+    rtensor_MakeZero( data->flex_bar.P );
+
+    for( i = 0; i < system->N; ++i ) {
+        p_atom = &( system->atoms[i] );
+        // Distance_on_T3_Gen( data->rcm, p_atom->x, &(system->box), &dx );
+        rvec_OuterProduct( temp, p_atom->v, p_atom->v );
+        rtensor_ScaledAdd( data->flex_bar.P, 
+                system->reaxprm.sbp[ p_atom->type ].mass, temp );
+        // rvec_OuterProduct(temp, workspace->virial_forces[i], p_atom->x ); 
+        rtensor_ScaledAdd( data->flex_bar.P, -F_CONV, temp );
+    }
+
+    rtensor_Scale( data->flex_bar.P, 1.0 / system->box.volume, data->flex_bar.P );
+    data->iso_bar.P = rtensor_Trace( data->flex_bar.P ) / 3.0;
 }
diff --git a/PuReMD-GPU/src/testmd.cu b/PuReMD-GPU/src/testmd.cu
index ffca47ff..93f286cc 100644
--- a/PuReMD-GPU/src/testmd.cu
+++ b/PuReMD-GPU/src/testmd.cu
@@ -48,7 +48,7 @@ print_interaction Print_Interactions[NO_OF_INTERACTIONS];
 LR_lookup_table *LR;
 LR_lookup_table *d_LR;
 
-list		*dev_lists;
+list        *dev_lists;
 static_storage *dev_workspace;
 reax_timing d_timing;
 
@@ -70,398 +70,398 @@ cusparseMatDescr_t matdescriptor;
 
 
 void Post_Evolve( reax_system* system, control_params* control, 
-		simulation_data* data, static_storage* workspace, 
-		list** lists, output_controls *out_control )
+        simulation_data* data, static_storage* workspace, 
+        list** lists, output_controls *out_control )
 {
-	int i;
-	rvec diff, cross;
-
-	/* if velocity dependent force then
-	   {
-	   Generate_Neighbor_Lists( &system, &control, &lists );
-	   QEq(system, control, workspace, lists[FAR_NBRS]);
-	   Introduce compute_force here if we are using velocity dependent forces
-	   Compute_Forces(system,control,data,workspace,lists);
-	   } */
-
-	/* compute kinetic energy of the system */
-	Compute_Kinetic_Energy( system, data );
-
-	/* remove rotational and translational velocity of the center of mass */
-	if( control->ensemble != NVE && 
-			control->remove_CoM_vel && 
-			data->step && data->step % control->remove_CoM_vel == 0 ) {
-
-		/* compute velocity of the center of mass */
-		Compute_Center_of_Mass( system, data, out_control->prs );
-
-		for( i = 0; i < system->N; i++ ) {
-			// remove translational
-			rvec_ScaledAdd( system->atoms[i].v, -1., data->vcm ); 
-
-			// remove rotational
-			rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
-			rvec_Cross( cross, data->avcm, diff );
-			rvec_ScaledAdd( system->atoms[i].v, -1., cross );
-		}
-	}
+    int i;
+    rvec diff, cross;
+
+    /* if velocity dependent force then
+       {
+       Generate_Neighbor_Lists( &system, &control, &lists );
+       QEq(system, control, workspace, lists[FAR_NBRS]);
+       Introduce compute_force here if we are using velocity dependent forces
+       Compute_Forces(system,control,data,workspace,lists);
+       } */
+
+    /* compute kinetic energy of the system */
+    Compute_Kinetic_Energy( system, data );
+
+    /* remove rotational and translational velocity of the center of mass */
+    if( control->ensemble != NVE && 
+            control->remove_CoM_vel && 
+            data->step && data->step % control->remove_CoM_vel == 0 ) {
+
+        /* compute velocity of the center of mass */
+        Compute_Center_of_Mass( system, data, out_control->prs );
+
+        for( i = 0; i < system->N; i++ ) {
+            // remove translational
+            rvec_ScaledAdd( system->atoms[i].v, -1., data->vcm ); 
+
+            // remove rotational
+            rvec_ScaledSum( diff, 1., system->atoms[i].x, -1., data->xcm );
+            rvec_Cross( cross, data->avcm, diff );
+            rvec_ScaledAdd( system->atoms[i].v, -1., cross );
+        }
+    }
 }
 
 GLOBAL void Update_Atoms_Post_Evolve (reax_atom *atoms, simulation_data *data, int N)
 {
-	rvec diff, cross;
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i >= N) return;
-
-	//for( i = 0; i < system->N; i++ ) {
-	// remove translational
-	rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); 
-
-	// remove rotational
-	rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm );
-	rvec_Cross( cross, data->avcm, diff );
-	rvec_ScaledAdd( atoms[i].v, -1., cross );
-	//}
+    rvec diff, cross;
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= N) return;
+
+    //for( i = 0; i < system->N; i++ ) {
+    // remove translational
+    rvec_ScaledAdd( atoms[i].v, -1., data->vcm ); 
+
+    // remove rotational
+    rvec_ScaledSum( diff, 1., atoms[i].x, -1., data->xcm );
+    rvec_Cross( cross, data->avcm, diff );
+    rvec_ScaledAdd( atoms[i].v, -1., cross );
+    //}
 }
 
 void Cuda_Post_Evolve( reax_system* system, control_params* control, 
-		simulation_data* data, static_storage* workspace, 
-		list** lists, output_controls *out_control )
+        simulation_data* data, static_storage* workspace, 
+        list** lists, output_controls *out_control )
 {
-	int i;
-	rvec diff, cross;
-
-	/* compute kinetic energy of the system */
-	/*
-	   real *results = (real *) scratch;
-	   cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
-	   Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
-	   (system->reaxprm.d_sbp, system->d_atoms, system->N, 
-	   (simulation_data *)data->d_simulation_data, (real *) results);
-	   cudaThreadSynchronize ();
-	   cudaCheckError ();
-	 */
-
-	//fprintf (stderr, "Cuda_Post_Evolve: Begin\n");
-	Cuda_Compute_Kinetic_Energy (system, data);
-	//fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n");
-
-	/* remove rotational and translational velocity of the center of mass */
-	if( control->ensemble != NVE && 
-			control->remove_CoM_vel && 
-			data->step && data->step % control->remove_CoM_vel == 0 ) {
-
-		/*
-		   rvec t_xcm, t_vcm, t_avcm;
-		   rvec_MakeZero (t_xcm);
-		   rvec_MakeZero (t_vcm);
-		   rvec_MakeZero (t_avcm);
-
-		   rvec_Copy (t_xcm, data->xcm);
-		   rvec_Copy (t_vcm, data->vcm);
-		   rvec_Copy (t_avcm, data->avcm);
-		 */
-
-		/* compute velocity of the center of mass */
-		Cuda_Compute_Center_of_Mass( system, data, out_control->prs );
-		//fprintf (stderr, "Cuda_Compute_Center_of_Mass done... \n");
-		/*
-		   fprintf (stderr, "center of mass done on the device \n");
-
-		   fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm );
-		   fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm );
-		   fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm );
-
-		   if (check_zero (t_xcm, data->xcm) || 
-		   check_zero (t_vcm, data->vcm) ||
-		   check_zero (t_avcm, data->avcm)){
-		   fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n");
-		   exit (0);
-		   }
-		 */
-
-		//xcm, avcm, 
-		copy_host_device (data->vcm, ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-		copy_host_device (data->xcm, ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-		copy_host_device (data->avcm, ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
-
-		//fprintf (stderr, "data copied.... \n");
-
-		Update_Atoms_Post_Evolve  <<< BLOCKS, BLOCK_SIZE >>>
-			(system->d_atoms, (simulation_data *)data->d_simulation_data, system->N);
-		cudaThreadSynchronize ();
-		cudaCheckError ();
-
-		//fprintf (stderr, " Cuda_Post_Evolve:End \n");
-
-	}
+    int i;
+    rvec diff, cross;
+
+    /* compute kinetic energy of the system */
+    /*
+       real *results = (real *) scratch;
+       cuda_memset (results, 0, REAL_SIZE * BLOCKS_POW_2, RES_SCRATCH);
+       Compute_Kinetic_Energy <<<BLOCKS_POW_2, BLOCK_SIZE, REAL_SIZE * BLOCK_SIZE>>>
+       (system->reaxprm.d_sbp, system->d_atoms, system->N, 
+       (simulation_data *)data->d_simulation_data, (real *) results);
+       cudaThreadSynchronize ();
+       cudaCheckError ();
+     */
+
+    //fprintf (stderr, "Cuda_Post_Evolve: Begin\n");
+    Cuda_Compute_Kinetic_Energy (system, data);
+    //fprintf (stderr, " Cuda_Compute_Kinetic_Energy done.... \n");
+
+    /* remove rotational and translational velocity of the center of mass */
+    if( control->ensemble != NVE && 
+            control->remove_CoM_vel && 
+            data->step && data->step % control->remove_CoM_vel == 0 ) {
+
+        /*
+           rvec t_xcm, t_vcm, t_avcm;
+           rvec_MakeZero (t_xcm);
+           rvec_MakeZero (t_vcm);
+           rvec_MakeZero (t_avcm);
+
+           rvec_Copy (t_xcm, data->xcm);
+           rvec_Copy (t_vcm, data->vcm);
+           rvec_Copy (t_avcm, data->avcm);
+         */
+
+        /* compute velocity of the center of mass */
+        Cuda_Compute_Center_of_Mass( system, data, out_control->prs );
+        //fprintf (stderr, "Cuda_Compute_Center_of_Mass done... \n");
+        /*
+           fprintf (stderr, "center of mass done on the device \n");
+
+           fprintf (stderr, "xcm --> %4.10f %4.10f \n", t_xcm, data->xcm );
+           fprintf (stderr, "vcm --> %4.10f %4.10f \n", t_vcm, data->vcm );
+           fprintf (stderr, "avcm --> %4.10f %4.10f \n", t_avcm, data->avcm );
+
+           if (check_zero (t_xcm, data->xcm) || 
+           check_zero (t_vcm, data->vcm) ||
+           check_zero (t_avcm, data->avcm)){
+           fprintf (stderr, "SimulationData (xcm, vcm, avcm) does not match between device and host \n");
+           exit (0);
+           }
+         */
+
+        //xcm, avcm, 
+        copy_host_device (data->vcm, ((simulation_data *)data->d_simulation_data)->vcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+        copy_host_device (data->xcm, ((simulation_data *)data->d_simulation_data)->xcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+        copy_host_device (data->avcm, ((simulation_data *)data->d_simulation_data)->avcm, RVEC_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+
+        //fprintf (stderr, "data copied.... \n");
+
+        Update_Atoms_Post_Evolve  <<< BLOCKS, BLOCK_SIZE >>>
+            (system->d_atoms, (simulation_data *)data->d_simulation_data, system->N);
+        cudaThreadSynchronize ();
+        cudaCheckError ();
+
+        //fprintf (stderr, " Cuda_Post_Evolve:End \n");
+
+    }
 }
 
 
 
 
 void Read_System( char *geof, char *ff, char *ctrlf, 
-		reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		output_controls *out_control )
+        reax_system *system, control_params *control, 
+        simulation_data *data, static_storage *workspace, 
+        output_controls *out_control )
 {
-	FILE *ffield, *ctrl;
-
-	ffield = fopen( ff, "r" );
-	ctrl = fopen( ctrlf, "r" );
-
-	/* ffield file */
-	Read_Force_Field( ffield, &(system->reaxprm) );
-
-	/* control file */
-	Read_Control_File( ctrl, system, control, out_control );
-
-	/* geo file */
-	if( control->geo_format == XYZ ) {
-		fprintf( stderr, "xyz input is not implemented yet\n" );
-		exit(1);
-	}
-	else if( control->geo_format == PDB ) 
-		Read_PDB( geof, system, control, data, workspace );
-	else if( control->geo_format == BGF ) 
-		Read_BGF( geof, system, control, data, workspace );
-	else if( control->geo_format == ASCII_RESTART ) {
-		Read_ASCII_Restart( geof, system, control, data, workspace );
-		control->restart = 1;
-	}
-	else if( control->geo_format == BINARY_RESTART ) {
-		Read_Binary_Restart( geof, system, control, data, workspace );
-		control->restart = 1;
-	}
-	else {
-		fprintf( stderr, "unknown geo file format. terminating!\n" );
-		exit(1);
-	}  
+    FILE *ffield, *ctrl;
+
+    ffield = fopen( ff, "r" );
+    ctrl = fopen( ctrlf, "r" );
+
+    /* ffield file */
+    Read_Force_Field( ffield, &(system->reaxprm) );
+
+    /* control file */
+    Read_Control_File( ctrl, system, control, out_control );
+
+    /* geo file */
+    if( control->geo_format == XYZ ) {
+        fprintf( stderr, "xyz input is not implemented yet\n" );
+        exit(1);
+    }
+    else if( control->geo_format == PDB ) 
+        Read_PDB( geof, system, control, data, workspace );
+    else if( control->geo_format == BGF ) 
+        Read_BGF( geof, system, control, data, workspace );
+    else if( control->geo_format == ASCII_RESTART ) {
+        Read_ASCII_Restart( geof, system, control, data, workspace );
+        control->restart = 1;
+    }
+    else if( control->geo_format == BINARY_RESTART ) {
+        Read_Binary_Restart( geof, system, control, data, workspace );
+        control->restart = 1;
+    }
+    else {
+        fprintf( stderr, "unknown geo file format. terminating!\n" );
+        exit(1);
+    }  
 
 #if defined(DEBUG_FOCUS)
-	fprintf( stderr, "input files have been read...\n" );
-	Print_Box_Information( &(system->box), stderr );
+    fprintf( stderr, "input files have been read...\n" );
+    Print_Box_Information( &(system->box), stderr );
 #endif
 }
 
 void Init_Data_Structures (simulation_data *data)
 {
-	//data->step = 0;
-	//data->prev_steps = 0;
-	//data->time = 0;
+    //data->step = 0;
+    //data->prev_steps = 0;
+    //data->time = 0;
 
-	memset (data, 0, SIMULATION_DATA_SIZE );
+    memset (data, 0, SIMULATION_DATA_SIZE );
 }
 
 
 int main(int argc, char* argv[])
 {
-	reax_system system;
-	control_params control;
-	simulation_data data;
-	static_storage workspace;
-	list *lists;
-	output_controls out_control;
-	evolve_function Evolve;
-	evolve_function Cuda_Evolve;
-	int steps;
+    reax_system system;
+    control_params control;
+    simulation_data data;
+    static_storage workspace;
+    list *lists;
+    output_controls out_control;
+    evolve_function Evolve;
+    evolve_function Cuda_Evolve;
+    int steps;
 
-	real t_start, t_elapsed;
-	real *results = NULL;
+    real t_start, t_elapsed;
+    real *results = NULL;
 
-	lists = (list*) malloc( sizeof(list) * LIST_N );
+    lists = (list*) malloc( sizeof(list) * LIST_N );
 
-	cudaDeviceSetLimit (cudaLimitStackSize, 8192);
-	cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
-	cudaCheckError ();
+    cudaDeviceSetLimit (cudaLimitStackSize, 8192);
+    cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
+    cudaCheckError ();
 
-	cublasCheckError (cublasStatus = cublasCreate (&cublasHandle));  
+    cublasCheckError (cublasStatus = cublasCreate (&cublasHandle));  
 
-	cusparseCheckError (cusparseStatus = cusparseCreate (&cusparseHandle));
-	cusparseCheckError (cusparseCreateMatDescr (&matdescriptor));
-	cusparseSetMatType (matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
-	cusparseSetMatIndexBase (matdescriptor, CUSPARSE_INDEX_BASE_ZERO);
+    cusparseCheckError (cusparseStatus = cusparseCreate (&cusparseHandle));
+    cusparseCheckError (cusparseCreateMatDescr (&matdescriptor));
+    cusparseSetMatType (matdescriptor, CUSPARSE_MATRIX_TYPE_GENERAL);
+    cusparseSetMatIndexBase (matdescriptor, CUSPARSE_INDEX_BASE_ZERO);
 
-	dev_lists = (list *) malloc (sizeof (list) * LIST_N );
-	dev_workspace = (static_storage *) malloc (STORAGE_SIZE);
+    dev_lists = (list *) malloc (sizeof (list) * LIST_N );
+    dev_workspace = (static_storage *) malloc (STORAGE_SIZE);
 
-	//init the nbrs estimate
-	dev_workspace->realloc.estimate_nbrs = -1;
+    //init the nbrs estimate
+    dev_workspace->realloc.estimate_nbrs = -1;
 
-	//Cleanup before usage.
-	Init_Data_Structures (&data);
-	system.init_thblist = false;
+    //Cleanup before usage.
+    Init_Data_Structures (&data);
+    system.init_thblist = false;
 
-	Read_System( argv[1], argv[2], argv[3], &system, &control, 
-			&data, &workspace, &out_control );
+    Read_System( argv[1], argv[2], argv[3], &system, &control, 
+            &data, &workspace, &out_control );
 
-	compute_blocks (&BLOCKS, &BLOCK_SIZE, system.N);
-	compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2);
+    compute_blocks (&BLOCKS, &BLOCK_SIZE, system.N);
+    compute_nearest_pow_2 (BLOCKS, &BLOCKS_POW_2);
 
-	//MATVEC_BLOCKS = system.N;
-	//MATVEC_BLOCK_SIZE = 32;
+    //MATVEC_BLOCKS = system.N;
+    //MATVEC_BLOCK_SIZE = 32;
 
-	MATVEC_BLOCKS = (system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) + 
-		((system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
+    MATVEC_BLOCKS = (system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) + 
+        ((system.N * MATVEC_THREADS_PER_ROW / MATVEC_BLOCK_SIZE) == 0 ? 0 : 1);
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
-	fprintf (stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE );
-	fprintf (stderr, " Size of far neighbor data %d \n", sizeof (far_neighbor_data));
-	fprintf (stderr, " Size of reax_atom %d \n", sizeof (reax_atom));
-	fprintf (stderr, " size of sparse matrix entry %d \n", sizeof (sparse_matrix_entry));
-	fprintf (stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system.N);
+    fprintf (stderr, " MATVEC Blocks : %d, Block_Size : %d \n", MATVEC_BLOCKS, MATVEC_BLOCK_SIZE );
+    fprintf (stderr, " Blocks : %d, Blocks_Pow_2 : %d, Block_Size : %d \n", BLOCKS, BLOCKS_POW_2, BLOCK_SIZE );
+    fprintf (stderr, " Size of far neighbor data %d \n", sizeof (far_neighbor_data));
+    fprintf (stderr, " Size of reax_atom %d \n", sizeof (reax_atom));
+    fprintf (stderr, " size of sparse matrix entry %d \n", sizeof (sparse_matrix_entry));
+    fprintf (stderr, " TOTAL NUMBER OF ATOMS IN THE SYSTEM --> %d \n", system.N);
 #endif
 
 #ifdef __CUDA_MEM__
-	print_device_mem_usage ();
+    print_device_mem_usage ();
 #endif
 
 #ifdef __BUILD_DEBUG__
-	Initialize( &system, &control, &data, &workspace, &lists, 
-			&out_control, &Evolve );
+    Initialize( &system, &control, &data, &workspace, &lists, 
+            &out_control, &Evolve );
 #endif
 
-	t_start = Get_Time ();
-	Cuda_Initialize( &system, &control, &data, &workspace, &lists, 
-			&out_control, &Cuda_Evolve);
-	t_elapsed = Get_Timing_Info (t_start);
+    t_start = Get_Time ();
+    Cuda_Initialize( &system, &control, &data, &workspace, &lists, 
+            &out_control, &Cuda_Evolve);
+    t_elapsed = Get_Timing_Info (t_start);
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed );
+    fprintf (stderr, " Cuda Initialize timing ---> %f \n", t_elapsed );
 #endif
 
 
 #ifdef __CUDA_MEM__
-	print_device_mem_usage ();
+    print_device_mem_usage ();
 #endif
 
 #ifdef __BUILD_DEBUG__
-	Reset( &system, &control, &data, &workspace, &lists );
+    Reset( &system, &control, &data, &workspace, &lists );
 #endif
-	Cuda_Reset( &system, &control, &data, &workspace, &lists );
+    Cuda_Reset( &system, &control, &data, &workspace, &lists );
 
 
 
 #ifdef __BUILD_DEBUG__
-	Generate_Neighbor_Lists ( &system, &control, &data, &workspace, 
-			&lists, &out_control );
+    Generate_Neighbor_Lists ( &system, &control, &data, &workspace, 
+            &lists, &out_control );
 #endif
-	/*
-	   dim3 blockspergrid (system.g.ncell[0], system.g.ncell[1], system.g.ncell[2]);
-	   dim3 threadsperblock (system.g.max_atoms);
+    /*
+       dim3 blockspergrid (system.g.ncell[0], system.g.ncell[1], system.g.ncell[2]);
+       dim3 threadsperblock (system.g.max_atoms);
 
-	   t_start = Get_Time ();
-	   Cuda_Bin_Atoms (&system, &workspace);
-	   Cuda_Bin_Atoms_Sync ( &system );
+       t_start = Get_Time ();
+       Cuda_Bin_Atoms (&system, &workspace);
+       Cuda_Bin_Atoms_Sync ( &system );
 
-	   Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
-	   (system.d_atoms, system.d_g, system.d_box, 
-	   (control_params *)control.d_control, *(dev_lists + FAR_NBRS));
-	   cudaThreadSynchronize (); 
-	   cudaCheckError ();
-	   t_elapsed = Get_Timing_Info (t_start);
+       Generate_Neighbor_Lists <<<blockspergrid, threadsperblock >>> 
+       (system.d_atoms, system.d_g, system.d_box, 
+       (control_params *)control.d_control, *(dev_lists + FAR_NBRS));
+       cudaThreadSynchronize (); 
+       cudaCheckError ();
+       t_elapsed = Get_Timing_Info (t_start);
 
-	   d_timing.nbrs += t_elapsed;
-	 */
+       d_timing.nbrs += t_elapsed;
+     */
 
-	Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, false);
+    Cuda_Generate_Neighbor_Lists (&system, &workspace, &control, false);
 
 #ifdef __BUILD_DEBUG__
-	Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
+    Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
 #endif
-	Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
+    Cuda_Compute_Forces(&system, &control, &data, &workspace, &lists, &out_control);
 
 
 #ifdef __BUILD_DEBUG__
-	Compute_Kinetic_Energy( &system, &data );
+    Compute_Kinetic_Energy( &system, &data );
 #endif
-	Cuda_Compute_Kinetic_Energy (&system, &data);
+    Cuda_Compute_Kinetic_Energy (&system, &data);
 
 
 #ifndef __BUILD_DEBUG__
-	// Here sync the simulation data, because it has been changed.
-	Prep_Device_For_Output ( &system, &data );
-	Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+    // Here sync the simulation data, because it has been changed.
+    Prep_Device_For_Output ( &system, &data );
+    Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
 #endif
 
 #ifdef __BUILD_DEBUG__
-	if (!validate_device (&system, &data, &workspace, &lists) )
-	{
-		fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
-		exit (1);
-	}
+    if (!validate_device (&system, &data, &workspace, &lists) )
+    {
+        fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
+        exit (1);
+    }
 #endif
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "step -> %d <- done. \n", data.step);
+    fprintf (stderr, "step -> %d <- done. \n", data.step);
 #endif
 
 
-	++data.step;
+    ++data.step;
 
 
-	for( ; data.step <= control.nsteps; data.step++ ) {      
+    for( ; data.step <= control.nsteps; data.step++ ) {      
 
-		//fprintf (stderr, "Begin ... \n");
-		//to Sync step to the device.
-		//Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice );
-		copy_host_device (&data.step, &((simulation_data *)data.d_simulation_data)->step, 
-				INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
+        //fprintf (stderr, "Begin ... \n");
+        //to Sync step to the device.
+        //Sync_Host_Device (&data, (simulation_data *)data.d_simulation_data, cudaMemcpyHostToDevice );
+        copy_host_device (&data.step, &((simulation_data *)data.d_simulation_data)->step, 
+                INT_SIZE, cudaMemcpyHostToDevice, RES_SIMULATION_DATA );
 
-		//fprintf (stderr, "Synched data .... \n");
-		if( control.T_mode ) {
-			Temperature_Control( &control, &data, &out_control );
-			Sync_Host_Device (&control, (control_params *)control.d_control, cudaMemcpyHostToDevice );
-		}
-		//fprintf (stderr, "Temp. Control done ... \n");
+        //fprintf (stderr, "Synched data .... \n");
+        if( control.T_mode ) {
+            Temperature_Control( &control, &data, &out_control );
+            Sync_Host_Device (&control, (control_params *)control.d_control, cudaMemcpyHostToDevice );
+        }
+        //fprintf (stderr, "Temp. Control done ... \n");
 
 #ifdef __BUILD_DEBUG__
-		Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Evolve( &system, &control, &data, &workspace, &lists, &out_control );
 #endif
-		Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Cuda_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
 
-		//fprintf (stderr, "Evolve done \n");
+        //fprintf (stderr, "Evolve done \n");
 
 
 #ifdef __BUILD_DEBUG__
-		Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
 #endif
-		Cuda_Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
-		//fprintf (stderr, "Post Evolve done \n");
+        Cuda_Post_Evolve( &system, &control, &data, &workspace, &lists, &out_control );
+        //fprintf (stderr, "Post Evolve done \n");
 
 #ifndef __BUILD_DEBUG__
-		Prep_Device_For_Output ( &system, &data );
-		Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
-
-		/*
-		   Analysis( &system, &control, &data, &workspace, &lists, &out_control );
-		 */
-		steps = data.step - data.prev_steps;
-		if( steps && out_control.restart_freq && 
-				steps % out_control.restart_freq == 0 )
-			Write_Restart( &system, &control, &data, &workspace, &out_control );
+        Prep_Device_For_Output ( &system, &data );
+        Output_Results(&system, &control, &data, &workspace, &lists, &out_control);
+
+        /*
+           Analysis( &system, &control, &data, &workspace, &lists, &out_control );
+         */
+        steps = data.step - data.prev_steps;
+        if( steps && out_control.restart_freq && 
+                steps % out_control.restart_freq == 0 )
+            Write_Restart( &system, &control, &data, &workspace, &out_control );
 #endif
 
 #ifdef __BUILD_DEBUG__
-		if (!validate_device (&system, &data, &workspace, &lists) )
-		{
-			fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
-			exit (1);
-		}
+        if (!validate_device (&system, &data, &workspace, &lists) )
+        {
+            fprintf (stderr, " Results does not match between Device and host @ step --> %d \n", data.step);
+            exit (1);
+        }
 #endif
-		fprintf (stderr, "step -> %d <- done. \n", data.step);
-	}
+        fprintf (stderr, "step -> %d <- done. \n", data.step);
+    }
 
-	if( out_control.write_steps > 0 ) { 
-		fclose( out_control.trj );
-		//Write_PDB( &system, &control, &data, &workspace,
-		//     &(lists[BONDS]), &out_control );
-	}
+    if( out_control.write_steps > 0 ) { 
+        fclose( out_control.trj );
+        //Write_PDB( &system, &control, &data, &workspace,
+        //     &(lists[BONDS]), &out_control );
+    }
 
-	data.timing.end = Get_Time( );
-	data.timing.elapsed = Get_Timing_Info( data.timing.start );
-	fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
+    data.timing.end = Get_Time( );
+    data.timing.elapsed = Get_Timing_Info( data.timing.start );
+    fprintf( out_control.log, "total: %.2f secs\n", data.timing.elapsed );
 
-	return 0;
+    return 0;
 }
diff --git a/PuReMD-GPU/src/three_body_interactions.cu b/PuReMD-GPU/src/three_body_interactions.cu
index bc4d73cf..c2eed63b 100644
--- a/PuReMD-GPU/src/three_body_interactions.cu
+++ b/PuReMD-GPU/src/three_body_interactions.cu
@@ -30,43 +30,43 @@
 
 /* calculates the theta angle between i-j-k */
 HOST_DEVICE void Calculate_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
-		real *theta, real *cos_theta )
+        real *theta, real *cos_theta )
 {
-	(*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
-	if( *cos_theta > 1. ) *cos_theta  = 1.0;
-	if( *cos_theta < -1. ) *cos_theta  = -1.0;
+    (*cos_theta) = Dot( dvec_ji, dvec_jk, 3 ) / ( d_ji * d_jk );
+    if( *cos_theta > 1. ) *cos_theta  = 1.0;
+    if( *cos_theta < -1. ) *cos_theta  = -1.0;
 
-	(*theta) = ACOS( *cos_theta );
+    (*theta) = ACOS( *cos_theta );
 }
 
 
 /* calculates the derivative of the cosine of the angle between i-j-k */
 HOST_DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, real d_jk, 
-		rvec* dcos_theta_di, rvec* dcos_theta_dj, 
-		rvec* dcos_theta_dk )
+        rvec* dcos_theta_di, rvec* dcos_theta_dj, 
+        rvec* dcos_theta_dk )
 {
-	int  t;
-	real sqr_d_ji   = SQR(d_ji);
-	real sqr_d_jk   = SQR(d_jk);
-	real inv_dists  = 1.0 / (d_ji * d_jk);
-	real inv_dists3 = POW( inv_dists, 3 );
-	real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
-	real Cdot_inv3  = dot_dvecs * inv_dists3;
-
-	for( t = 0; t < 3; ++t ) {
-		(*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
-			Cdot_inv3 * sqr_d_jk * dvec_ji[t];
-
-		(*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
-			Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
-
-		(*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
-			Cdot_inv3 * sqr_d_ji * dvec_jk[t];
-	}
-
-	/*fprintf( stderr, 
-	  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-	  dvec_jk[t] * inv_dists*/
+    int  t;
+    real sqr_d_ji   = SQR(d_ji);
+    real sqr_d_jk   = SQR(d_jk);
+    real inv_dists  = 1.0 / (d_ji * d_jk);
+    real inv_dists3 = POW( inv_dists, 3 );
+    real dot_dvecs  = Dot( dvec_ji, dvec_jk, 3 );
+    real Cdot_inv3  = dot_dvecs * inv_dists3;
+
+    for( t = 0; t < 3; ++t ) {
+        (*dcos_theta_di)[t] = dvec_jk[t] * inv_dists - 
+            Cdot_inv3 * sqr_d_jk * dvec_ji[t];
+
+        (*dcos_theta_dj)[t] = -(dvec_jk[t] + dvec_ji[t]) * inv_dists +
+            Cdot_inv3 * ( sqr_d_jk * dvec_ji[t] + sqr_d_ji * dvec_jk[t] );
+
+        (*dcos_theta_dk)[t] = dvec_ji[t] * inv_dists - 
+            Cdot_inv3 * sqr_d_ji * dvec_jk[t];
+    }
+
+    /*fprintf( stderr, 
+      "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+      dvec_jk[t] * inv_dists*/
 }
 
 
@@ -83,508 +83,508 @@ HOST_DEVICE void Calculate_dCos_Theta( rvec dvec_ji, real d_ji, rvec dvec_jk, re
 /* this is a 3-body interaction in which the main role is 
    played by j which sits in the middle of the other two. */
 void Three_Body_Interactions( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace,
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace,
+        list **lists, output_controls *out_control )
 {
-	int  i, j, pi, k, pk, t;
-	int  type_i, type_j, type_k;
-	int  start_j, end_j, start_pk, end_pk;
-	int  flag, cnt, num_thb_intrs;
-
-	real temp, temp_bo_jt, pBOjt7;
-	real p_val1, p_val2, p_val3, p_val4, p_val5;
-	real p_val6, p_val7, p_val8, p_val9, p_val10;
-	real p_pen1, p_pen2, p_pen3, p_pen4;
-	real p_coa1, p_coa2, p_coa3, p_coa4;
-	real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-	real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-	real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
-	real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-	real CEpen1, CEpen2, CEpen3;
-	real e_ang, e_coa, e_pen;
-	real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-	real Cf7ij, Cf7jk, Cf8j, Cf9j;
-	real f7_ij, f7_jk, f8_Dj, f9_Dj;
-	real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-	real r_ij, r_jk;
-	real BOA_ij, BOA_jk;
-	real vlpadj;
-	rvec force, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-	real *total_bo;
-	three_body_header *thbh;
-	three_body_parameters *thbp;
-	three_body_interaction_data *p_ijk, *p_kji;
-	bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-	bond_order_data *bo_ij, *bo_jk, *bo_jt;
-	list *bonds, *thb_intrs;
-	bond_data *bond_list;
-	three_body_interaction_data *thb_list;
-
-	total_bo = workspace->total_bond_order;
-	bonds = (*lists) + BONDS;
-	bond_list = bonds->select.bond_list;
-	thb_intrs = (*lists) + THREE_BODIES;
-	thb_list = thb_intrs->select.three_body_list;
-
-	/* global parameters used in these calculations */
-	p_val6 = system->reaxprm.gp.l[14];
-	p_val8 = system->reaxprm.gp.l[33];
-	p_val9 = system->reaxprm.gp.l[16];
-	p_val10 = system->reaxprm.gp.l[17];
-	num_thb_intrs = 0;
-
-	for( j = 0; j < system->N; ++j ) {
-		// fprintf( out_control->eval, "j: %d\n", j );
-		type_j = system->atoms[j].type;
-		start_j = Start_Index(j, bonds);
-		end_j = End_Index(j, bonds);
-
-		p_val3 = system->reaxprm.sbp[ type_j ].p_val3;
-		p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
-
-		SBOp = 0, prod_SBO = 1;
-		for( t = start_j; t < end_j; ++t ) {
-			bo_jt = &(bond_list[t].bo_data);
-			SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-			temp = SQR( bo_jt->BO );
-			temp *= temp; 
-			temp *= temp;
-			prod_SBO *= EXP( -temp );
-		}
-
-		/* modifications to match Adri's code - 09/01/09 */
-		if( workspace->vlpex[j] >= 0 ){
-			vlpadj = 0;
-			dSBO2 = prod_SBO - 1;
-		}
-		else{
-			vlpadj = workspace->nlp[j];
-			dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-		}
-
-		SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-		dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
-
-		if( SBO <= 0 )
-			SBO2 = 0, CSBO2 = 0;
-		else if( SBO > 0 && SBO <= 1 ) {
-			SBO2 = POW( SBO, p_val9 );
-			CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-		}
-		else if( SBO > 1 && SBO < 2 ) {
-			SBO2 = 2 - POW( 2-SBO, p_val9 );
-			CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-		}
-		else 
-			SBO2 = 2, CSBO2 = 0;  
-
-		expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
-
-		/* unlike 2-body intrs where we enforce i<j, we cannot put any such 
-		   restrictions here. such a restriction would prevent us from producing 
-		   all 4-body intrs correctly */
-		for( pi = start_j; pi < end_j; ++pi ) {
-			Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-
-			pbond_ij = &(bond_list[pi]);
-			bo_ij = &(pbond_ij->bo_data);
-			BOA_ij = bo_ij->BO - control->thb_cut;
-
-
-			if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) {
-				i = pbond_ij->nbr;
-				r_ij = pbond_ij->d;	 
-				type_i = system->atoms[i].type;
-				// fprintf( out_control->eval, "i: %d\n", i );
-
-
-				/* first copy 3-body intrs from previously computed ones where i>k.
+    int  i, j, pi, k, pk, t;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j, start_pk, end_pk;
+    int  flag, cnt, num_thb_intrs;
+
+    real temp, temp_bo_jt, pBOjt7;
+    real p_val1, p_val2, p_val3, p_val4, p_val5;
+    real p_val6, p_val7, p_val8, p_val9, p_val10;
+    real p_pen1, p_pen2, p_pen3, p_pen4;
+    real p_coa1, p_coa2, p_coa3, p_coa4;
+    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
+    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    real CEpen1, CEpen2, CEpen3;
+    real e_ang, e_coa, e_pen;
+    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    real Cf7ij, Cf7jk, Cf8j, Cf9j;
+    real f7_ij, f7_jk, f8_Dj, f9_Dj;
+    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    real r_ij, r_jk;
+    real BOA_ij, BOA_jk;
+    real vlpadj;
+    rvec force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    real *total_bo;
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk, *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+    list *bonds, *thb_intrs;
+    bond_data *bond_list;
+    three_body_interaction_data *thb_list;
+
+    total_bo = workspace->total_bond_order;
+    bonds = (*lists) + BONDS;
+    bond_list = bonds->select.bond_list;
+    thb_intrs = (*lists) + THREE_BODIES;
+    thb_list = thb_intrs->select.three_body_list;
+
+    /* global parameters used in these calculations */
+    p_val6 = system->reaxprm.gp.l[14];
+    p_val8 = system->reaxprm.gp.l[33];
+    p_val9 = system->reaxprm.gp.l[16];
+    p_val10 = system->reaxprm.gp.l[17];
+    num_thb_intrs = 0;
+
+    for( j = 0; j < system->N; ++j ) {
+        // fprintf( out_control->eval, "j: %d\n", j );
+        type_j = system->atoms[j].type;
+        start_j = Start_Index(j, bonds);
+        end_j = End_Index(j, bonds);
+
+        p_val3 = system->reaxprm.sbp[ type_j ].p_val3;
+        p_val5 = system->reaxprm.sbp[ type_j ].p_val5;
+
+        SBOp = 0, prod_SBO = 1;
+        for( t = start_j; t < end_j; ++t ) {
+            bo_jt = &(bond_list[t].bo_data);
+            SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+            temp = SQR( bo_jt->BO );
+            temp *= temp; 
+            temp *= temp;
+            prod_SBO *= EXP( -temp );
+        }
+
+        /* modifications to match Adri's code - 09/01/09 */
+        if( workspace->vlpex[j] >= 0 ){
+            vlpadj = 0;
+            dSBO2 = prod_SBO - 1;
+        }
+        else{
+            vlpadj = workspace->nlp[j];
+            dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+        }
+
+        SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+        dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+
+        if( SBO <= 0 )
+            SBO2 = 0, CSBO2 = 0;
+        else if( SBO > 0 && SBO <= 1 ) {
+            SBO2 = POW( SBO, p_val9 );
+            CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
+        }
+        else if( SBO > 1 && SBO < 2 ) {
+            SBO2 = 2 - POW( 2-SBO, p_val9 );
+            CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
+        }
+        else 
+            SBO2 = 2, CSBO2 = 0;  
+
+        expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
+
+        /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
+           restrictions here. such a restriction would prevent us from producing 
+           all 4-body intrs correctly */
+        for( pi = start_j; pi < end_j; ++pi ) {
+            Set_Start_Index( pi, num_thb_intrs, thb_intrs );
+
+            pbond_ij = &(bond_list[pi]);
+            bo_ij = &(pbond_ij->bo_data);
+            BOA_ij = bo_ij->BO - control->thb_cut;
+
+
+            if( BOA_ij/*bo_ij->BO*/ > (real) 0.0 ) {
+                i = pbond_ij->nbr;
+                r_ij = pbond_ij->d;     
+                type_i = system->atoms[i].type;
+                // fprintf( out_control->eval, "i: %d\n", i );
+
+
+                /* first copy 3-body intrs from previously computed ones where i>k.
 IMPORTANT: if it is less costly to compute theta and its 
 derivative, we should definitely re-compute them, 
 instead of copying!
 in the second for-loop below, we compute only new 3-body intrs 
 where i < k */
-				for( pk = start_j; pk < pi; ++pk ) {
-					// fprintf( out_control->eval, "pk: %d\n", pk );
-					start_pk = Start_Index( pk, thb_intrs );
-					end_pk = End_Index( pk, thb_intrs );
-
-					for( t = start_pk; t < end_pk; ++t )
-						if( thb_list[t].thb == i ) {
-							p_ijk = &(thb_list[num_thb_intrs]);
-							p_kji = &(thb_list[t]);
-
-							p_ijk->thb = bond_list[pk].nbr;
-							p_ijk->pthb  = pk;
-							p_ijk->theta = p_kji->theta;			  
-							rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-							rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-							rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-
-							//if (j == 12)
-							//fprintf (stderr, "Adding one for matched atom %d \n", i);
-
-							++num_thb_intrs;
-							break;
-						}
-				}
-
-
-				/* and this is the second for loop mentioned above */
-				for( pk = pi+1; pk < end_j; ++pk ) {
-					pbond_jk = &(bond_list[pk]);
-					bo_jk    = &(pbond_jk->bo_data);
-					BOA_jk   = bo_jk->BO - control->thb_cut;
-					k        = pbond_jk->nbr;
-					type_k   = system->atoms[k].type;
-					p_ijk    = &( thb_list[num_thb_intrs] );
-
-					//TODO - CHANGE ORIGINAL
-					if (BOA_jk <= 0) continue;
-
-					Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-							pbond_jk->dvec, pbond_jk->d,
-							&theta, &cos_theta );
-
-					Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-							pbond_jk->dvec, pbond_jk->d, 
-							&(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-							&(p_ijk->dcos_dk) );
-
-					p_ijk->thb = k;
-					p_ijk->pthb = pk;
-					p_ijk->theta = theta;
-
-					//if (j == 12)
-					//fprintf (stderr, "Adding one for the rest %d \n", k);
-
-					sin_theta = SIN( theta );
-					if( sin_theta < 1.0e-5 )
-						sin_theta = 1.0e-5;
-
-					++num_thb_intrs;
-
-
-					if( BOA_jk > 0.0 && 
-							(bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
-						r_jk = pbond_jk->d;		      
-						thbh = &( system->reaxprm.thbp[ index_thbp (type_i,type_j,type_k,&system->reaxprm) ] );
-						flag = 0;
-
-						/* if( workspace->orig_id[i] < workspace->orig_id[k] )
-						   fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-						   workspace->orig_id[i], workspace->orig_id[j],
-						   workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-						   else 
-						   fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-						   workspace->orig_id[k], workspace->orig_id[j],
-						   workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
-
-
-						for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-							// fprintf( out_control->eval, 
-							// "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
-
-							if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
-								thbp = &( thbh->prm[cnt] );
-
-								/* ANGLE ENERGY */
-								p_val1 = thbp->p_val1;
-								p_val2 = thbp->p_val2;
-								p_val4 = thbp->p_val4;
-								p_val7 = thbp->p_val7;
-								theta_00 = thbp->theta_00;
-
-								exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-								f7_ij = 1.0 - exp3ij;
-								Cf7ij = p_val3 * p_val4 * 
-									POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-
-								exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-								f7_jk = 1.0 - exp3jk;
-								Cf7jk = p_val3 * p_val4 * 
-									POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-
-								expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-								trm8 = 1.0 + expval6 + expval7;
-								f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-								Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-									(p_val6 * expval6 * trm8 - 
-									 (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
-
-								theta_0 = 180.0 - 
-									theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-								theta_0 = DEG2RAD( theta_0 );		      
-
-								expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
-								if( p_val1 >= 0 )
-									expval12theta = p_val1 * (1.0 - expval2theta);
-								else // To avoid linear Me-H-Me angles (6/6/06)
-									expval12theta = p_val1 * -expval2theta;
-
-								CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-								CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-								CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-								CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-									expval2theta * (theta_0 - theta);
-
-								Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-									exp( -p_val10 * (2.0 - SBO2) );
-
-								CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-								CEval6 = CEval5 * dSBO1;
-								CEval7 = CEval5 * dSBO2;
-								CEval8 = -CEval4 / sin_theta;
-
-								data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
-								/* END ANGLE ENERGY*/
-
-
-								/* PENALTY ENERGY */
-								p_pen1 = thbp->p_pen1;
-								p_pen2 = system->reaxprm.gp.l[19];
-								p_pen3 = system->reaxprm.gp.l[20];
-								p_pen4 = system->reaxprm.gp.l[21];
-
-								exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-								exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-								exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-								exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-								trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-								f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-								Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
-										(2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-											p_pen4 * exp_pen4 )) /
-									SQR( trm_pen34 );
-
-								data->E_Pen += e_pen = 
-									p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-
-								CEpen1 = e_pen * Cf9j / f9_Dj;
-								temp   = -2.0 * p_pen2 * e_pen;
-								CEpen2 = temp * (BOA_ij - 2.0);
-								CEpen3 = temp * (BOA_jk - 2.0);
-								/* END PENALTY ENERGY */
-
-
-								/* COALITION ENERGY */
-								p_coa1 = thbp->p_coa1;
-								p_coa2 = system->reaxprm.gp.l[2];
-								p_coa3 = system->reaxprm.gp.l[38];
-								p_coa4 = system->reaxprm.gp.l[30];
-
-								exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-								data->E_Coa += e_coa = 
-									p_coa1 / (1. + exp_coa2) *
-									EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
-									EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
-									EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-									EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-
-								CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-								CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-								CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
-								CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
-								CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
-								/* END COALITION ENERGY */
-
-								/* FORCES */
-								bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4));
-								bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5));
-								workspace->CdDelta[j] += ((CEval3 + CEval7) + 
-										CEpen1 + CEcoa3);
-								workspace->CdDelta[i] += CEcoa4;
-								workspace->CdDelta[k] += CEcoa5;		      
-
-								for( t = start_j; t < end_j; ++t ) {
-									pbond_jt = &( bond_list[t] );
-									bo_jt = &(pbond_jt->bo_data);
-									temp_bo_jt = bo_jt->BO;
-									temp = CUBE( temp_bo_jt );
-									pBOjt7 = temp * temp * temp_bo_jt; 
-
-									// fprintf( out_control->eval, "%6d%12.8f\n", 
-									// workspace->orig_id[ bond_list[t].nbr ], 
-									//    (CEval6 * pBOjt7) );
-
-									bo_jt->Cdbo += (CEval6 * pBOjt7);
-									bo_jt->Cdbopi += CEval5;
-									bo_jt->Cdbopi2 += CEval5;
-								}		      
-
-
-								if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-
-									rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
-									rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
-									rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
-
-									/*
-									   if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j);
-									   if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j);
-									 */
-								}
-								else {
-									/* terms not related to bond order derivatives
-									   are added directly into 
-									   forces and pressure vector/tensor */
-									rvec_Scale( force, CEval8, p_ijk->dcos_di );
-									rvec_Add( system->atoms[i].f, force );
-									rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-									rvec_Add( data->ext_press, ext_press );
-
-									rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
-
-									rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-									rvec_Add( system->atoms[k].f, force );
-									rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-									rvec_Add( data->ext_press, ext_press );
-
-
-									/* This part is for a fully-flexible box */
-									/* rvec_OuterProduct( temp_rtensor, 
-									   p_ijk->dcos_di, system->atoms[i].x );
-									   rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
-
-									   rvec_OuterProduct( temp_rtensor, 
-									   p_ijk->dcos_dj, system->atoms[j].x );
-									   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-
-									   rvec_OuterProduct( temp_rtensor, 
-									   p_ijk->dcos_dk, system->atoms[k].x );
-									   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-
-									   if( pbond_ij->imaginary || pbond_jk->imaginary )
-									   rtensor_ScaledAdd( data->flex_bar.P, 
-									   -1.0, total_rtensor );
-									   else
-									   rtensor_Add( data->flex_bar.P, total_rtensor ); */
-								}
+                for( pk = start_j; pk < pi; ++pk ) {
+                    // fprintf( out_control->eval, "pk: %d\n", pk );
+                    start_pk = Start_Index( pk, thb_intrs );
+                    end_pk = End_Index( pk, thb_intrs );
+
+                    for( t = start_pk; t < end_pk; ++t )
+                        if( thb_list[t].thb == i ) {
+                            p_ijk = &(thb_list[num_thb_intrs]);
+                            p_kji = &(thb_list[t]);
+
+                            p_ijk->thb = bond_list[pk].nbr;
+                            p_ijk->pthb  = pk;
+                            p_ijk->theta = p_kji->theta;              
+                            rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+                            rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                            rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+
+                            //if (j == 12)
+                            //fprintf (stderr, "Adding one for matched atom %d \n", i);
+
+                            ++num_thb_intrs;
+                            break;
+                        }
+                }
+
+
+                /* and this is the second for loop mentioned above */
+                for( pk = pi+1; pk < end_j; ++pk ) {
+                    pbond_jk = &(bond_list[pk]);
+                    bo_jk    = &(pbond_jk->bo_data);
+                    BOA_jk   = bo_jk->BO - control->thb_cut;
+                    k        = pbond_jk->nbr;
+                    type_k   = system->atoms[k].type;
+                    p_ijk    = &( thb_list[num_thb_intrs] );
+
+                    //TODO - CHANGE ORIGINAL
+                    if (BOA_jk <= 0) continue;
+
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
+                            pbond_jk->dvec, pbond_jk->d,
+                            &theta, &cos_theta );
+
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
+                            pbond_jk->dvec, pbond_jk->d, 
+                            &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
+                            &(p_ijk->dcos_dk) );
+
+                    p_ijk->thb = k;
+                    p_ijk->pthb = pk;
+                    p_ijk->theta = theta;
+
+                    //if (j == 12)
+                    //fprintf (stderr, "Adding one for the rest %d \n", k);
+
+                    sin_theta = SIN( theta );
+                    if( sin_theta < 1.0e-5 )
+                        sin_theta = 1.0e-5;
+
+                    ++num_thb_intrs;
+
+
+                    if( BOA_jk > 0.0 && 
+                            (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
+                        r_jk = pbond_jk->d;              
+                        thbh = &( system->reaxprm.thbp[ index_thbp (type_i,type_j,type_k,&system->reaxprm) ] );
+                        flag = 0;
+
+                        /* if( workspace->orig_id[i] < workspace->orig_id[k] )
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           workspace->orig_id[i], workspace->orig_id[j],
+                           workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
+                           else 
+                           fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                           workspace->orig_id[k], workspace->orig_id[j],
+                           workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+
+
+                        for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
+                            // fprintf( out_control->eval, 
+                            // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
+
+                            if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                                thbp = &( thbh->prm[cnt] );
+
+                                /* ANGLE ENERGY */
+                                p_val1 = thbp->p_val1;
+                                p_val2 = thbp->p_val2;
+                                p_val4 = thbp->p_val4;
+                                p_val7 = thbp->p_val7;
+                                theta_00 = thbp->theta_00;
+
+                                exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                                f7_ij = 1.0 - exp3ij;
+                                Cf7ij = p_val3 * p_val4 * 
+                                    POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+
+                                exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                                f7_jk = 1.0 - exp3jk;
+                                Cf7jk = p_val3 * p_val4 * 
+                                    POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+
+                                expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                                trm8 = 1.0 + expval6 + expval7;
+                                f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                                Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                    (p_val6 * expval6 * trm8 - 
+                                     (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+
+                                theta_0 = 180.0 - 
+                                    theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                                theta_0 = DEG2RAD( theta_0 );              
+
+                                expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
+                                if( p_val1 >= 0 )
+                                    expval12theta = p_val1 * (1.0 - expval2theta);
+                                else // To avoid linear Me-H-Me angles (6/6/06)
+                                    expval12theta = p_val1 * -expval2theta;
+
+                                CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                                CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                                CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                                CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
+                                    expval2theta * (theta_0 - theta);
+
+                                Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
+                                    exp( -p_val10 * (2.0 - SBO2) );
+
+                                CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                                CEval6 = CEval5 * dSBO1;
+                                CEval7 = CEval5 * dSBO2;
+                                CEval8 = -CEval4 / sin_theta;
+
+                                data->E_Ang += e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                                /* END ANGLE ENERGY*/
+
+
+                                /* PENALTY ENERGY */
+                                p_pen1 = thbp->p_pen1;
+                                p_pen2 = system->reaxprm.gp.l[19];
+                                p_pen3 = system->reaxprm.gp.l[20];
+                                p_pen4 = system->reaxprm.gp.l[21];
+
+                                exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                                exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                                exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                                exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                                trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                                f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                                Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
+                                        (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
+                                            p_pen4 * exp_pen4 )) /
+                                    SQR( trm_pen34 );
+
+                                data->E_Pen += e_pen = 
+                                    p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+
+                                CEpen1 = e_pen * Cf9j / f9_Dj;
+                                temp   = -2.0 * p_pen2 * e_pen;
+                                CEpen2 = temp * (BOA_ij - 2.0);
+                                CEpen3 = temp * (BOA_jk - 2.0);
+                                /* END PENALTY ENERGY */
+
+
+                                /* COALITION ENERGY */
+                                p_coa1 = thbp->p_coa1;
+                                p_coa2 = system->reaxprm.gp.l[2];
+                                p_coa3 = system->reaxprm.gp.l[38];
+                                p_coa4 = system->reaxprm.gp.l[30];
+
+                                exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+                                data->E_Coa += e_coa = 
+                                    p_coa1 / (1. + exp_coa2) *
+                                    EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
+                                    EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
+                                    EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
+                                    EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+
+                                CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                                CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                                CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
+                                CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
+                                CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
+                                /* END COALITION ENERGY */
+
+                                /* FORCES */
+                                bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4));
+                                bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5));
+                                workspace->CdDelta[j] += ((CEval3 + CEval7) + 
+                                        CEpen1 + CEcoa3);
+                                workspace->CdDelta[i] += CEcoa4;
+                                workspace->CdDelta[k] += CEcoa5;              
+
+                                for( t = start_j; t < end_j; ++t ) {
+                                    pbond_jt = &( bond_list[t] );
+                                    bo_jt = &(pbond_jt->bo_data);
+                                    temp_bo_jt = bo_jt->BO;
+                                    temp = CUBE( temp_bo_jt );
+                                    pBOjt7 = temp * temp * temp_bo_jt; 
+
+                                    // fprintf( out_control->eval, "%6d%12.8f\n", 
+                                    // workspace->orig_id[ bond_list[t].nbr ], 
+                                    //    (CEval6 * pBOjt7) );
+
+                                    bo_jt->Cdbo += (CEval6 * pBOjt7);
+                                    bo_jt->Cdbopi += CEval5;
+                                    bo_jt->Cdbopi2 += CEval5;
+                                }              
+
+
+                                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+
+                                    rvec_ScaledAdd( system->atoms[i].f, CEval8, p_ijk->dcos_di );
+                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                    rvec_ScaledAdd( system->atoms[k].f, CEval8, p_ijk->dcos_dk );
+
+                                    /*
+                                       if (i == 0) fprintf (stderr, " atom %d adding to i (j) = 0\n", j);
+                                       if (k == 0) fprintf (stderr, " atom %d adding to i (k) = 0\n", j);
+                                     */
+                                }
+                                else {
+                                    /* terms not related to bond order derivatives
+                                       are added directly into 
+                                       forces and pressure vector/tensor */
+                                    rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                                    rvec_Add( system->atoms[i].f, force );
+                                    rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                    rvec_Add( data->ext_press, ext_press );
+
+                                    rvec_ScaledAdd( system->atoms[j].f, CEval8, p_ijk->dcos_dj );
+
+                                    rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                                    rvec_Add( system->atoms[k].f, force );
+                                    rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                    rvec_Add( data->ext_press, ext_press );
+
+
+                                    /* This part is for a fully-flexible box */
+                                    /* rvec_OuterProduct( temp_rtensor, 
+                                       p_ijk->dcos_di, system->atoms[i].x );
+                                       rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
+
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       p_ijk->dcos_dj, system->atoms[j].x );
+                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+
+                                       rvec_OuterProduct( temp_rtensor, 
+                                       p_ijk->dcos_dk, system->atoms[k].x );
+                                       rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+
+                                       if( pbond_ij->imaginary || pbond_jk->imaginary )
+                                       rtensor_ScaledAdd( data->flex_bar.P, 
+                                       -1.0, total_rtensor );
+                                       else
+                                       rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                                }
 
 #ifdef TEST_ENERGY
-								fprintf( out_control->eval, 
-										//"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-										"%6d%6d%6d%23.15e%23.15e%23.15e\n",
-										i+1, j+1, k+1,
-										//workspace->orig_id[i]+1,  
-										//workspace->orig_id[j]+1,
-										//workspace->orig_id[k]+1,
-										//workspace->Delta_boc[j], 
-										RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-										e_ang, data->E_Ang );
-
-								/*fprintf( out_control->eval, 
-								  "%23.15e%23.15e%23.15e%23.15e",
-								  p_val3, p_val4, BOA_ij, BOA_jk );
-								  fprintf( out_control->eval, 
-								  "%23.15e%23.15e%23.15e%23.15e",
-								  f7_ij, f7_jk, f8_Dj, expval12theta );
-								  fprintf( out_control->eval, 
-								  "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-								  CEval1, CEval2, CEval3, CEval4, CEval5
-								//CEval6, CEval7, CEval8  );*/
-
-								/*fprintf( out_control->eval, 
-								  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-								  -p_ijk->dcos_di[0]/sin_theta, 
-								  -p_ijk->dcos_di[1]/sin_theta, 
-								  -p_ijk->dcos_di[2]/sin_theta, 
-								  -p_ijk->dcos_dj[0]/sin_theta, 
-								  -p_ijk->dcos_dj[1]/sin_theta, 
-								  -p_ijk->dcos_dj[2]/sin_theta, 
-								  -p_ijk->dcos_dk[0]/sin_theta, 
-								  -p_ijk->dcos_dk[1]/sin_theta, 
-								  -p_ijk->dcos_dk[2]/sin_theta );*/
-
-								/* fprintf( out_control->epen, 
-								   "%23.15e%23.15e%23.15e\n", 
-								   CEpen1, CEpen2, CEpen3 );
-								   fprintf( out_control->epen, 
-								   "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-								   workspace->orig_id[i],  workspace->orig_id[j],
-								   workspace->orig_id[k], RAD2DEG(theta), 
-								   BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
-
-								fprintf( out_control->ecoa, 
-										"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-										workspace->orig_id[i], 
-										workspace->orig_id[j],
-										workspace->orig_id[k], 
-										RAD2DEG(theta), BOA_ij, BOA_jk, 
-										e_coa, data->E_Coa );
+                                fprintf( out_control->eval, 
+                                        //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                                        "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                                        i+1, j+1, k+1,
+                                        //workspace->orig_id[i]+1,  
+                                        //workspace->orig_id[j]+1,
+                                        //workspace->orig_id[k]+1,
+                                        //workspace->Delta_boc[j], 
+                                        RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                                        e_ang, data->E_Ang );
+
+                                /*fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e",
+                                  p_val3, p_val4, BOA_ij, BOA_jk );
+                                  fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e",
+                                  f7_ij, f7_jk, f8_Dj, expval12theta );
+                                  fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                  CEval1, CEval2, CEval3, CEval4, CEval5
+                                //CEval6, CEval7, CEval8  );*/
+
+                                /*fprintf( out_control->eval, 
+                                  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                  -p_ijk->dcos_di[0]/sin_theta, 
+                                  -p_ijk->dcos_di[1]/sin_theta, 
+                                  -p_ijk->dcos_di[2]/sin_theta, 
+                                  -p_ijk->dcos_dj[0]/sin_theta, 
+                                  -p_ijk->dcos_dj[1]/sin_theta, 
+                                  -p_ijk->dcos_dj[2]/sin_theta, 
+                                  -p_ijk->dcos_dk[0]/sin_theta, 
+                                  -p_ijk->dcos_dk[1]/sin_theta, 
+                                  -p_ijk->dcos_dk[2]/sin_theta );*/
+
+                                /* fprintf( out_control->epen, 
+                                   "%23.15e%23.15e%23.15e\n", 
+                                   CEpen1, CEpen2, CEpen3 );
+                                   fprintf( out_control->epen, 
+                                   "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                   workspace->orig_id[i],  workspace->orig_id[j],
+                                   workspace->orig_id[k], RAD2DEG(theta), 
+                                   BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
+
+                                fprintf( out_control->ecoa, 
+                                        "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                        workspace->orig_id[i], 
+                                        workspace->orig_id[j],
+                                        workspace->orig_id[k], 
+                                        RAD2DEG(theta), BOA_ij, BOA_jk, 
+                                        e_coa, data->E_Coa );
 #endif
 
 #ifdef TEST_FORCES            /* angle forces */
-								Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-								Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-								Add_dDelta( system, lists, 
-										j, CEval3 + CEval7, workspace->f_ang );
-
-								for( t = start_j; t < end_j; ++t ) {
-									pbond_jt = &( bond_list[t] );
-									bo_jt = &(pbond_jt->bo_data);
-									temp_bo_jt = bo_jt->BO;
-									temp = CUBE( temp_bo_jt );
-									pBOjt7 = temp * temp * temp_bo_jt; 
-
-									Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-											workspace->f_ang );
-									Add_dBOpinpi2( system, lists, j, t, 
-											CEval5, CEval5, 
-											workspace->f_ang, workspace->f_ang );
-								}
-
-								rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-								rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-								rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-								/* end angle forces */
-
-								/* penalty forces */
-								Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-								Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-								Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-								/* end penalty forces */
-
-								/* coalition forces */
-								Add_dBO( system, lists, 
-										j, pi, CEcoa1-CEcoa4, workspace->f_coa );
-								Add_dBO( system, lists, 
-										j, pk, CEcoa2-CEcoa5, workspace->f_coa );
-								Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-								Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-								Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-								/* end coalition forces */
+                                Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                                Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                                Add_dDelta( system, lists, 
+                                        j, CEval3 + CEval7, workspace->f_ang );
+
+                                for( t = start_j; t < end_j; ++t ) {
+                                    pbond_jt = &( bond_list[t] );
+                                    bo_jt = &(pbond_jt->bo_data);
+                                    temp_bo_jt = bo_jt->BO;
+                                    temp = CUBE( temp_bo_jt );
+                                    pBOjt7 = temp * temp * temp_bo_jt; 
+
+                                    Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
+                                            workspace->f_ang );
+                                    Add_dBOpinpi2( system, lists, j, t, 
+                                            CEval5, CEval5, 
+                                            workspace->f_ang, workspace->f_ang );
+                                }
+
+                                rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                                rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                                /* end angle forces */
+
+                                /* penalty forces */
+                                Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                                Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                                Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                                /* end penalty forces */
+
+                                /* coalition forces */
+                                Add_dBO( system, lists, 
+                                        j, pi, CEcoa1-CEcoa4, workspace->f_coa );
+                                Add_dBO( system, lists, 
+                                        j, pk, CEcoa2-CEcoa5, workspace->f_coa );
+                                Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                                Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                                Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                                /* end coalition forces */
 #endif
-							}
-						}
-					}
-				}
-			}
-
-			Set_End_Index(pi, num_thb_intrs, thb_intrs );
-		}
-	}
-
-	if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
-		workspace->realloc.num_3body = num_thb_intrs;
-		if( num_thb_intrs > thb_intrs->num_intrs ) {
-			fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
-					data->step, num_thb_intrs, thb_intrs->num_intrs );
-			exit( INSUFFICIENT_SPACE );
-		}
-	}
-
-	//fprintf( stderr,"%d: Number of angle interactions: %d\n", 
-	// data->step, num_thb_intrs );
+                            }
+                        }
+                    }
+                }
+            }
+
+            Set_End_Index(pi, num_thb_intrs, thb_intrs );
+        }
+    }
+
+    if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+        workspace->realloc.num_3body = num_thb_intrs;
+        if( num_thb_intrs > thb_intrs->num_intrs ) {
+            fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
+                    data->step, num_thb_intrs, thb_intrs->num_intrs );
+            exit( INSUFFICIENT_SPACE );
+        }
+    }
+
+    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
+    // data->step, num_thb_intrs );
 #ifdef TEST_ENERGY
-	fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
+    fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
 
-	fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
-			data->E_Ang, data->E_Pen, data->E_Coa );
+    fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
+            data->E_Ang, data->E_Pen, data->E_Coa );
 
-	fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
-			data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+    fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
+            data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 #endif
 }
 
@@ -597,598 +597,598 @@ where i < k */
 /* this is a 3-body interaction in which the main role is 
    played by j which sits in the middle of the other two. */
 GLOBAL void Three_Body_Interactions( reax_atom *atoms,
-		single_body_parameters *sbp,
-		three_body_header *d_thbp,
-		global_parameters g_params,
-		control_params *control,
-		simulation_data *data,
-		static_storage p_workspace, 
-		list p_bonds, list p_thb_intrs,
-		int N, int num_atom_types,
-		real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press )
+        single_body_parameters *sbp,
+        three_body_header *d_thbp,
+        global_parameters g_params,
+        control_params *control,
+        simulation_data *data,
+        static_storage p_workspace, 
+        list p_bonds, list p_thb_intrs,
+        int N, int num_atom_types,
+        real *E_Ang, real *E_Pen, real *E_Coa, rvec *aux_ext_press )
 {
-	int  i, j, pi, k, pk, t;
-	int  type_i, type_j, type_k;
-	int  start_j, end_j, start_pk, end_pk;
-	int  flag, cnt, num_thb_intrs;
-
-	real temp, temp_bo_jt, pBOjt7;
-	real p_val1, p_val2, p_val3, p_val4, p_val5;
-	real p_val6, p_val7, p_val8, p_val9, p_val10;
-	real p_pen1, p_pen2, p_pen3, p_pen4;
-	real p_coa1, p_coa2, p_coa3, p_coa4;
-	real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
-	real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
-	real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
-	real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
-	real CEpen1, CEpen2, CEpen3;
-	real e_ang, e_coa, e_pen;
-	real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
-	real Cf7ij, Cf7jk, Cf8j, Cf9j;
-	real f7_ij, f7_jk, f8_Dj, f9_Dj;
-	real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
-	real r_ij, r_jk;
-	real BOA_ij, BOA_jk;
-	real vlpadj;
-	rvec force, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-	real *total_bo;
-	three_body_header *thbh;
-	three_body_parameters *thbp;
-	three_body_interaction_data *p_ijk, *p_kji;
-	bond_data *pbond_ij, *pbond_jk, *pbond_jt;
-	bond_order_data *bo_ij, *bo_jk, *bo_jt;
-	list *bonds, *thb_intrs;
-	bond_data *bond_list;
-	three_body_interaction_data *thb_list;
-	static_storage *workspace = &p_workspace;
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= N) return;
-
-
-	total_bo = workspace->total_bond_order;
-	bonds = &p_bonds;
-	bond_list = bonds->select.bond_list;
-	thb_intrs = &p_thb_intrs;
-	thb_list = thb_intrs->select.three_body_list;
-
-	/* global parameters used in these calculations */
-	p_val6 = g_params.l[14];
-	p_val8 = g_params.l[33];
-	p_val9 = g_params.l[16];
-	p_val10 = g_params.l[17];
-
-	//TODO check this, initially this was zero, 
-	// I am changing it to the starting index for this atom.
-	//num_thb_intrs = j * MAX_TH_BODY;
-
-	//for( j = 0; j < system->N; ++j ) {
-	// fprintf( out_control->eval, "j: %d\n", j );
-	type_j = atoms[j].type;
-	start_j = Start_Index(j, bonds);
-	end_j = End_Index(j, bonds);
-
-	p_val3 = sbp[ type_j ].p_val3;
-	p_val5 = sbp[ type_j ].p_val5;
-
-	SBOp = 0, prod_SBO = 1;
-	for( t = start_j; t < end_j; ++t ) {
-		bo_jt = &(bond_list[t].bo_data);
-		SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
-		temp = SQR( bo_jt->BO );
-		temp *= temp; 
-		temp *= temp;
-		prod_SBO *= EXP( -temp );
-	}
-
-	/* modifications to match Adri's code - 09/01/09 */
-	if( workspace->vlpex[j] >= 0 ){
-		vlpadj = 0;
-		dSBO2 = prod_SBO - 1;
-	}
-	else{
-		vlpadj = workspace->nlp[j];
-		dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
-	}
-
-	SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
-	dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
-
-	if( SBO <= 0 )
-		SBO2 = 0, CSBO2 = 0;
-	else if( SBO > 0 && SBO <= 1 ) {
-		SBO2 = POW( SBO, p_val9 );
-		CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
-	}
-	else if( SBO > 1 && SBO < 2 ) {
-		SBO2 = 2 - POW( 2-SBO, p_val9 );
-		CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
-	}
-	else 
-		SBO2 = 2, CSBO2 = 0;  
-
-	expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
-
-	/* unlike 2-body intrs where we enforce i<j, we cannot put any such 
-	   restrictions here. such a restriction would prevent us from producing 
-	   all 4-body intrs correctly */
-	for( pi = start_j; pi < end_j; ++pi ) {
-
-		//TODO
-		//num_thb_intrs = pi * MAX_THREE_BODIES;
-		//TODO
-
-		//Set_Start_Index( pi, num_thb_intrs, thb_intrs );
-		num_thb_intrs = Start_Index (pi, thb_intrs);
-
-		pbond_ij = &(bond_list[pi]);
-		bo_ij = &(pbond_ij->bo_data);
-		BOA_ij = bo_ij->BO - control->thb_cut;
-
-
-		if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
-			i = pbond_ij->nbr;
-			r_ij = pbond_ij->d;	 
-			type_i = atoms[i].type;
-			// fprintf( out_control->eval, "i: %d\n", i );
-
-
-			/* first copy 3-body intrs from previously computed ones where i>k.
+    int  i, j, pi, k, pk, t;
+    int  type_i, type_j, type_k;
+    int  start_j, end_j, start_pk, end_pk;
+    int  flag, cnt, num_thb_intrs;
+
+    real temp, temp_bo_jt, pBOjt7;
+    real p_val1, p_val2, p_val3, p_val4, p_val5;
+    real p_val6, p_val7, p_val8, p_val9, p_val10;
+    real p_pen1, p_pen2, p_pen3, p_pen4;
+    real p_coa1, p_coa2, p_coa3, p_coa4;
+    real trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
+    real exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
+    real dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO;
+    real CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
+    real CEpen1, CEpen2, CEpen3;
+    real e_ang, e_coa, e_pen;
+    real CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
+    real Cf7ij, Cf7jk, Cf8j, Cf9j;
+    real f7_ij, f7_jk, f8_Dj, f9_Dj;
+    real Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
+    real r_ij, r_jk;
+    real BOA_ij, BOA_jk;
+    real vlpadj;
+    rvec force, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    real *total_bo;
+    three_body_header *thbh;
+    three_body_parameters *thbp;
+    three_body_interaction_data *p_ijk, *p_kji;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt;
+    list *bonds, *thb_intrs;
+    bond_data *bond_list;
+    three_body_interaction_data *thb_list;
+    static_storage *workspace = &p_workspace;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+
+
+    total_bo = workspace->total_bond_order;
+    bonds = &p_bonds;
+    bond_list = bonds->select.bond_list;
+    thb_intrs = &p_thb_intrs;
+    thb_list = thb_intrs->select.three_body_list;
+
+    /* global parameters used in these calculations */
+    p_val6 = g_params.l[14];
+    p_val8 = g_params.l[33];
+    p_val9 = g_params.l[16];
+    p_val10 = g_params.l[17];
+
+    //TODO check this, initially this was zero, 
+    // I am changing it to the starting index for this atom.
+    //num_thb_intrs = j * MAX_TH_BODY;
+
+    //for( j = 0; j < system->N; ++j ) {
+    // fprintf( out_control->eval, "j: %d\n", j );
+    type_j = atoms[j].type;
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
+
+    p_val3 = sbp[ type_j ].p_val3;
+    p_val5 = sbp[ type_j ].p_val5;
+
+    SBOp = 0, prod_SBO = 1;
+    for( t = start_j; t < end_j; ++t ) {
+        bo_jt = &(bond_list[t].bo_data);
+        SBOp += (bo_jt->BO_pi + bo_jt->BO_pi2);
+        temp = SQR( bo_jt->BO );
+        temp *= temp; 
+        temp *= temp;
+        prod_SBO *= EXP( -temp );
+    }
+
+    /* modifications to match Adri's code - 09/01/09 */
+    if( workspace->vlpex[j] >= 0 ){
+        vlpadj = 0;
+        dSBO2 = prod_SBO - 1;
+    }
+    else{
+        vlpadj = workspace->nlp[j];
+        dSBO2 = (prod_SBO - 1) * (1 - p_val8 * workspace->dDelta_lp[j]);
+    }
+
+    SBO = SBOp + (1 - prod_SBO) * (-workspace->Delta_boc[j] - p_val8 * vlpadj);
+    dSBO1 = -8 * prod_SBO * ( workspace->Delta_boc[j] + p_val8 * vlpadj );
+
+    if( SBO <= 0 )
+        SBO2 = 0, CSBO2 = 0;
+    else if( SBO > 0 && SBO <= 1 ) {
+        SBO2 = POW( SBO, p_val9 );
+        CSBO2 = p_val9 * POW( SBO, p_val9 - 1 );
+    }
+    else if( SBO > 1 && SBO < 2 ) {
+        SBO2 = 2 - POW( 2-SBO, p_val9 );
+        CSBO2 = p_val9 * POW( 2 - SBO, p_val9 - 1 );
+    }
+    else 
+        SBO2 = 2, CSBO2 = 0;  
+
+    expval6 = EXP( p_val6 * workspace->Delta_boc[j] );
+
+    /* unlike 2-body intrs where we enforce i<j, we cannot put any such 
+       restrictions here. such a restriction would prevent us from producing 
+       all 4-body intrs correctly */
+    for( pi = start_j; pi < end_j; ++pi ) {
+
+        //TODO
+        //num_thb_intrs = pi * MAX_THREE_BODIES;
+        //TODO
+
+        //Set_Start_Index( pi, num_thb_intrs, thb_intrs );
+        num_thb_intrs = Start_Index (pi, thb_intrs);
+
+        pbond_ij = &(bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut;
+
+
+        if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
+            i = pbond_ij->nbr;
+            r_ij = pbond_ij->d;     
+            type_i = atoms[i].type;
+            // fprintf( out_control->eval, "i: %d\n", i );
+
+
+            /* first copy 3-body intrs from previously computed ones where i>k.
 IMPORTANT: if it is less costly to compute theta and its 
 derivative, we should definitely re-compute them, 
 instead of copying!
 in the second for-loop below, we compute only new 3-body intrs 
 where i < k */
-			for( pk = start_j; pk < pi; ++pk ) {
-				// fprintf( out_control->eval, "pk: %d\n", pk );
-				start_pk = Start_Index( pk, thb_intrs );
-				end_pk = End_Index( pk, thb_intrs );
-
-				for( t = start_pk; t < end_pk; ++t )
-					if( thb_list[t].thb == i ) {
-						p_ijk = &(thb_list[num_thb_intrs]);
-						p_kji = &(thb_list[t]);
-
-						p_ijk->thb = bond_list[pk].nbr;
-						p_ijk->pthb  = pk;
-						p_ijk->theta = p_kji->theta;			  
-						rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
-						rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
-						rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
-
-						++num_thb_intrs;
-						break;
-					}
-			}
-
-
-			/* and this is the second for loop mentioned above */
-			for( pk = pi+1; pk < end_j; ++pk ) {
-				pbond_jk = &(bond_list[pk]);
-				bo_jk    = &(pbond_jk->bo_data);
-				BOA_jk   = bo_jk->BO - control->thb_cut;
-				k        = pbond_jk->nbr;
-				type_k   = atoms[k].type;
-				p_ijk    = &( thb_list[num_thb_intrs] );
-
-				//CHANGE ORIGINAL
-				if (BOA_jk <= 0) continue;
-				//CHANGE ORIGINAL
-
-				Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
-						pbond_jk->dvec, pbond_jk->d,
-						&theta, &cos_theta );
-
-				Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
-						pbond_jk->dvec, pbond_jk->d, 
-						&(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
-						&(p_ijk->dcos_dk) );
-
-				p_ijk->thb = k;
-				p_ijk->pthb = pk;
-				p_ijk->theta = theta;
-
-				sin_theta = SIN( theta );
-				if( sin_theta < 1.0e-5 )
-					sin_theta = 1.0e-5;
-
-				++num_thb_intrs;
-
-
-				if( BOA_jk > 0.0 && 
-						(bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
-					r_jk = pbond_jk->d;		      
-					thbh = &( d_thbp[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
-					flag = 0;
-
-					/* if( workspace->orig_id[i] < workspace->orig_id[k] )
-					   fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-					   workspace->orig_id[i], workspace->orig_id[j],
-					   workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
-					   else 
-					   fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
-					   workspace->orig_id[k], workspace->orig_id[j],
-					   workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
-
-					//TODO:
-					//pbond_jk->scratch = thbh->cnt;
-
-					for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
-						// fprintf( out_control->eval, 
-						// "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
-
-						if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
-							thbp = &( thbh->prm[cnt] );
-
-							/* ANGLE ENERGY */
-							p_val1 = thbp->p_val1;
-							p_val2 = thbp->p_val2;
-							p_val4 = thbp->p_val4;
-							p_val7 = thbp->p_val7;
-							theta_00 = thbp->theta_00;
-
-							exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
-							f7_ij = 1.0 - exp3ij;
-							Cf7ij = p_val3 * p_val4 * 
-								POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
-
-							exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
-							f7_jk = 1.0 - exp3jk;
-							Cf7jk = p_val3 * p_val4 * 
-								POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
-
-							expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
-							trm8 = 1.0 + expval6 + expval7;
-							f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
-							Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
-								(p_val6 * expval6 * trm8 - 
-								 (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
-
-							theta_0 = 180.0 - 
-								theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
-							theta_0 = DEG2RAD( theta_0 );		      
-
-							expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
-							if( p_val1 >= 0 )
-								expval12theta = p_val1 * (1.0 - expval2theta);
-							else // To avoid linear Me-H-Me angles (6/6/06)
-								expval12theta = p_val1 * -expval2theta;
-
-							CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
-							CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
-							CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
-							CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
-								expval2theta * (theta_0 - theta);
-
-							Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
-								exp( -p_val10 * (2.0 - SBO2) );
-
-							CEval5 = -CEval4 * Ctheta_0 * CSBO2;
-							CEval6 = CEval5 * dSBO1;
-							CEval7 = CEval5 * dSBO2;
-							CEval8 = -CEval4 / sin_theta;
-
-							e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
-							//PERFORMANCE IMPACT
-							//atomicAdd (&data->E_Ang, e_ang);
-							E_Ang [j] += e_ang;
-							/* END ANGLE ENERGY*/
-
-
-							/* PENALTY ENERGY */
-							p_pen1 = thbp->p_pen1;
-							p_pen2 = g_params.l[19];
-							p_pen3 = g_params.l[20];
-							p_pen4 = g_params.l[21];
-
-							exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
-							exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
-							exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
-							exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
-							trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
-							f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
-							Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
-									(2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
-										p_pen4 * exp_pen4 )) /
-								SQR( trm_pen34 );
-
-							e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
-							//PERFORMANCE IMPACT
-							//atomicAdd (&data->E_Pen, e_pen);
-							E_Pen [j] += e_pen;
-
-
-							CEpen1 = e_pen * Cf9j / f9_Dj;
-							temp   = -2.0 * p_pen2 * e_pen;
-							CEpen2 = temp * (BOA_ij - 2.0);
-							CEpen3 = temp * (BOA_jk - 2.0);
-							/* END PENALTY ENERGY */
-
-
-							/* COALITION ENERGY */
-							p_coa1 = thbp->p_coa1;
-							p_coa2 = g_params.l[2];
-							p_coa3 = g_params.l[38];
-							p_coa4 = g_params.l[30];
-
-							exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
-							e_coa = 
-								p_coa1 / (1. + exp_coa2) *
-								EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
-								EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
-								EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
-								EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
-
-							//PERFORMANCE IMPACT
-							//atomicAdd (&data->E_Coa, e_coa);
-							E_Coa [j] += e_coa;
-
-							CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
-							CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
-							CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
-							CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
-							CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
-							/* END COALITION ENERGY */
-
-							/* FORCES */
-							/*
-							   atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) );
-							   atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) );
-							   atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) );
-							   atomicAdd (&workspace->CdDelta[i], CEcoa4 );
-							   atomicAdd (&workspace->CdDelta[k], CEcoa5 );		      
-							 */
-
-							bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ;
-							bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ;
-							workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ;
-							//atomicAdd (&workspace->CdDelta[i], CEcoa4 );
-							pbond_ij->CdDelta_ij += CEcoa4 ;
-							//atomicAdd (&workspace->CdDelta[k], CEcoa5 );		      
-							pbond_jk->CdDelta_ij += CEcoa5;
-
-							for( t = start_j; t < end_j; ++t ) {
-								pbond_jt = &( bond_list[t] );
-								bo_jt = &(pbond_jt->bo_data);
-								temp_bo_jt = bo_jt->BO;
-								temp = CUBE( temp_bo_jt );
-								pBOjt7 = temp * temp * temp_bo_jt; 
-
-								// fprintf( out_control->eval, "%6d%12.8f\n", 
-								// workspace->orig_id[ bond_list[t].nbr ], 
-								//    (CEval6 * pBOjt7) );
-
-								/*
-								   atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) );
-								   atomicAdd (&bo_jt->Cdbopi, CEval5 );
-								   atomicAdd (&bo_jt->Cdbopi2, CEval5 );
-								 */
-								bo_jt->Cdbo		+= (CEval6 * pBOjt7) ;
-								bo_jt->Cdbopi	+= CEval5 ;
-								bo_jt->Cdbopi2	+= CEval5 ;
-							}		      
-
-
-							if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-								/*
-								   atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di );
-								   atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-								   atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk );
-								 */
-								rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di );
-								rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-								rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk );
-
-
-							}
-							else {
-								/* terms not related to bond order derivatives
-								   are added directly into 
-								   forces and pressure vector/tensor */
-								rvec_Scale( force, CEval8, p_ijk->dcos_di );
-								//atomic_rvecAdd( atoms[i].f, force );
-								rvec_Add( pbond_ij->f, force );
-
-								rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-								//atomic_rvecAdd( data->ext_press, ext_press );
-								rvec_Add( aux_ext_press [j], ext_press );
-
-								//atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-								rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
-
-								rvec_Scale( force, CEval8, p_ijk->dcos_dk );
-								//atomic_rvecAdd( atoms[k].f, force );
-								rvec_Add( pbond_jk->f, force );
-								rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
-								//atomic_rvecAdd( data->ext_press, ext_press );
-								rvec_Add( aux_ext_press [j], ext_press );
-
-
-								/* This part is for a fully-flexible box */
-								/* rvec_OuterProduct( temp_rtensor, 
-								   p_ijk->dcos_di, system->atoms[i].x );
-								   rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
-
-								   rvec_OuterProduct( temp_rtensor, 
-								   p_ijk->dcos_dj, system->atoms[j].x );
-								   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-
-								   rvec_OuterProduct( temp_rtensor, 
-								   p_ijk->dcos_dk, system->atoms[k].x );
-								   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
-
-								   if( pbond_ij->imaginary || pbond_jk->imaginary )
-								   rtensor_ScaledAdd( data->flex_bar.P, 
-								   -1.0, total_rtensor );
-								   else
-								   rtensor_Add( data->flex_bar.P, total_rtensor ); */
-							}
+            for( pk = start_j; pk < pi; ++pk ) {
+                // fprintf( out_control->eval, "pk: %d\n", pk );
+                start_pk = Start_Index( pk, thb_intrs );
+                end_pk = End_Index( pk, thb_intrs );
+
+                for( t = start_pk; t < end_pk; ++t )
+                    if( thb_list[t].thb == i ) {
+                        p_ijk = &(thb_list[num_thb_intrs]);
+                        p_kji = &(thb_list[t]);
+
+                        p_ijk->thb = bond_list[pk].nbr;
+                        p_ijk->pthb  = pk;
+                        p_ijk->theta = p_kji->theta;              
+                        rvec_Copy( p_ijk->dcos_di, p_kji->dcos_dk );
+                        rvec_Copy( p_ijk->dcos_dj, p_kji->dcos_dj );
+                        rvec_Copy( p_ijk->dcos_dk, p_kji->dcos_di );
+
+                        ++num_thb_intrs;
+                        break;
+                    }
+            }
+
+
+            /* and this is the second for loop mentioned above */
+            for( pk = pi+1; pk < end_j; ++pk ) {
+                pbond_jk = &(bond_list[pk]);
+                bo_jk    = &(pbond_jk->bo_data);
+                BOA_jk   = bo_jk->BO - control->thb_cut;
+                k        = pbond_jk->nbr;
+                type_k   = atoms[k].type;
+                p_ijk    = &( thb_list[num_thb_intrs] );
+
+                //CHANGE ORIGINAL
+                if (BOA_jk <= 0) continue;
+                //CHANGE ORIGINAL
+
+                Calculate_Theta( pbond_ij->dvec, pbond_ij->d, 
+                        pbond_jk->dvec, pbond_jk->d,
+                        &theta, &cos_theta );
+
+                Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, 
+                        pbond_jk->dvec, pbond_jk->d, 
+                        &(p_ijk->dcos_di), &(p_ijk->dcos_dj), 
+                        &(p_ijk->dcos_dk) );
+
+                p_ijk->thb = k;
+                p_ijk->pthb = pk;
+                p_ijk->theta = theta;
+
+                sin_theta = SIN( theta );
+                if( sin_theta < 1.0e-5 )
+                    sin_theta = 1.0e-5;
+
+                ++num_thb_intrs;
+
+
+                if( BOA_jk > 0.0 && 
+                        (bo_ij->BO * bo_jk->BO) > SQR(control->thb_cut)/*0*/) {
+                    r_jk = pbond_jk->d;              
+                    thbh = &( d_thbp[ index_thbp (type_i,type_j,type_k,num_atom_types) ] );
+                    flag = 0;
+
+                    /* if( workspace->orig_id[i] < workspace->orig_id[k] )
+                       fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                       workspace->orig_id[i], workspace->orig_id[j],
+                       workspace->orig_id[k], bo_ij->BO, bo_jk->BO, p_ijk->theta );
+                       else 
+                       fprintf( stdout, "%6d %6d %6d %7.3f %7.3f %7.3f\n", 
+                       workspace->orig_id[k], workspace->orig_id[j],
+                       workspace->orig_id[i], bo_jk->BO, bo_ij->BO, p_ijk->theta ); */
+
+                    //TODO:
+                    //pbond_jk->scratch = thbh->cnt;
+
+                    for( cnt = 0; cnt < thbh->cnt; ++cnt ) {
+                        // fprintf( out_control->eval, 
+                        // "%6d%6d%6d -- exists in thbp\n", i+1, j+1, k+1 );
+
+                        if( fabs(thbh->prm[cnt].p_val1) > 0.001 ) {
+                            thbp = &( thbh->prm[cnt] );
+
+                            /* ANGLE ENERGY */
+                            p_val1 = thbp->p_val1;
+                            p_val2 = thbp->p_val2;
+                            p_val4 = thbp->p_val4;
+                            p_val7 = thbp->p_val7;
+                            theta_00 = thbp->theta_00;
+
+                            exp3ij = EXP( -p_val3 * POW( BOA_ij, p_val4 ) );
+                            f7_ij = 1.0 - exp3ij;
+                            Cf7ij = p_val3 * p_val4 * 
+                                POW( BOA_ij, p_val4 - 1.0 ) * exp3ij;
+
+                            exp3jk = EXP( -p_val3 * POW( BOA_jk, p_val4 ) );
+                            f7_jk = 1.0 - exp3jk;
+                            Cf7jk = p_val3 * p_val4 * 
+                                POW( BOA_jk, p_val4 - 1.0 ) * exp3jk;
+
+                            expval7 = EXP( -p_val7 * workspace->Delta_boc[j] );
+                            trm8 = 1.0 + expval6 + expval7;
+                            f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
+                            Cf8j = ( (1.0 - p_val5) / SQR(trm8) ) *
+                                (p_val6 * expval6 * trm8 - 
+                                 (2.0 + expval6) * ( p_val6 * expval6 - p_val7 * expval7 ));
+
+                            theta_0 = 180.0 - 
+                                theta_00 * (1.0 - EXP(-p_val10 * (2.0 - SBO2)));
+                            theta_0 = DEG2RAD( theta_0 );              
+
+                            expval2theta  = EXP(-p_val2 * SQR(theta_0-theta));
+                            if( p_val1 >= 0 )
+                                expval12theta = p_val1 * (1.0 - expval2theta);
+                            else // To avoid linear Me-H-Me angles (6/6/06)
+                                expval12theta = p_val1 * -expval2theta;
+
+                            CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
+                            CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
+                            CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
+                            CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * 
+                                expval2theta * (theta_0 - theta);
+
+                            Ctheta_0 = p_val10 * DEG2RAD(theta_00) * 
+                                exp( -p_val10 * (2.0 - SBO2) );
+
+                            CEval5 = -CEval4 * Ctheta_0 * CSBO2;
+                            CEval6 = CEval5 * dSBO1;
+                            CEval7 = CEval5 * dSBO2;
+                            CEval8 = -CEval4 / sin_theta;
+
+                            e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
+                            //PERFORMANCE IMPACT
+                            //atomicAdd (&data->E_Ang, e_ang);
+                            E_Ang [j] += e_ang;
+                            /* END ANGLE ENERGY*/
+
+
+                            /* PENALTY ENERGY */
+                            p_pen1 = thbp->p_pen1;
+                            p_pen2 = g_params.l[19];
+                            p_pen3 = g_params.l[20];
+                            p_pen4 = g_params.l[21];
+
+                            exp_pen2ij = EXP( -p_pen2 * SQR( BOA_ij - 2.0 ) );
+                            exp_pen2jk = EXP( -p_pen2 * SQR( BOA_jk - 2.0 ) );
+                            exp_pen3 = EXP( -p_pen3 * workspace->Delta[j] );
+                            exp_pen4 = EXP(  p_pen4 * workspace->Delta[j] );
+                            trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
+                            f9_Dj = ( 2.0 + exp_pen3 ) / trm_pen34;
+                            Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - 
+                                    (2.0 + exp_pen3) * ( -p_pen3 * exp_pen3 +
+                                        p_pen4 * exp_pen4 )) /
+                                SQR( trm_pen34 );
+
+                            e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
+                            //PERFORMANCE IMPACT
+                            //atomicAdd (&data->E_Pen, e_pen);
+                            E_Pen [j] += e_pen;
+
+
+                            CEpen1 = e_pen * Cf9j / f9_Dj;
+                            temp   = -2.0 * p_pen2 * e_pen;
+                            CEpen2 = temp * (BOA_ij - 2.0);
+                            CEpen3 = temp * (BOA_jk - 2.0);
+                            /* END PENALTY ENERGY */
+
+
+                            /* COALITION ENERGY */
+                            p_coa1 = thbp->p_coa1;
+                            p_coa2 = g_params.l[2];
+                            p_coa3 = g_params.l[38];
+                            p_coa4 = g_params.l[30];
+
+                            exp_coa2 = EXP( p_coa2 * workspace->Delta_boc[j] );
+                            e_coa = 
+                                p_coa1 / (1. + exp_coa2) *
+                                EXP( -p_coa3 * SQR(total_bo[i] - BOA_ij) ) * 
+                                EXP( -p_coa3 * SQR(total_bo[k] - BOA_jk) ) * 
+                                EXP( -p_coa4 * SQR(BOA_ij - 1.5) ) * 
+                                EXP( -p_coa4 * SQR(BOA_jk - 1.5) );
+
+                            //PERFORMANCE IMPACT
+                            //atomicAdd (&data->E_Coa, e_coa);
+                            E_Coa [j] += e_coa;
+
+                            CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
+                            CEcoa2 = -2 * p_coa4 * (BOA_jk - 1.5) * e_coa;
+                            CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1+exp_coa2);
+                            CEcoa4 = -2*p_coa3 * (total_bo[i]-BOA_ij) * e_coa;
+                            CEcoa5 = -2*p_coa3 * (total_bo[k]-BOA_jk) * e_coa;
+                            /* END COALITION ENERGY */
+
+                            /* FORCES */
+                            /*
+                               atomicAdd (&bo_ij->Cdbo, (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) );
+                               atomicAdd (&bo_jk->Cdbo, (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) );
+                               atomicAdd (&workspace->CdDelta[j], ((CEval3 + CEval7) + CEpen1 + CEcoa3) );
+                               atomicAdd (&workspace->CdDelta[i], CEcoa4 );
+                               atomicAdd (&workspace->CdDelta[k], CEcoa5 );              
+                             */
+
+                            bo_ij->Cdbo += (CEval1 + CEpen2 + (CEcoa1-CEcoa4)) ;
+                            bo_jk->Cdbo += (CEval2 + CEpen3 + (CEcoa2-CEcoa5)) ;
+                            workspace->CdDelta[j] += ((CEval3 + CEval7) + CEpen1 + CEcoa3) ;
+                            //atomicAdd (&workspace->CdDelta[i], CEcoa4 );
+                            pbond_ij->CdDelta_ij += CEcoa4 ;
+                            //atomicAdd (&workspace->CdDelta[k], CEcoa5 );              
+                            pbond_jk->CdDelta_ij += CEcoa5;
+
+                            for( t = start_j; t < end_j; ++t ) {
+                                pbond_jt = &( bond_list[t] );
+                                bo_jt = &(pbond_jt->bo_data);
+                                temp_bo_jt = bo_jt->BO;
+                                temp = CUBE( temp_bo_jt );
+                                pBOjt7 = temp * temp * temp_bo_jt; 
+
+                                // fprintf( out_control->eval, "%6d%12.8f\n", 
+                                // workspace->orig_id[ bond_list[t].nbr ], 
+                                //    (CEval6 * pBOjt7) );
+
+                                /*
+                                   atomicAdd (&bo_jt->Cdbo, (CEval6 * pBOjt7) );
+                                   atomicAdd (&bo_jt->Cdbopi, CEval5 );
+                                   atomicAdd (&bo_jt->Cdbopi2, CEval5 );
+                                 */
+                                bo_jt->Cdbo        += (CEval6 * pBOjt7) ;
+                                bo_jt->Cdbopi    += CEval5 ;
+                                bo_jt->Cdbopi2    += CEval5 ;
+                            }              
+
+
+                            if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                                /*
+                                   atomic_rvecScaledAdd( atoms[i].f, CEval8, p_ijk->dcos_di );
+                                   atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                   atomic_rvecScaledAdd( atoms[k].f, CEval8, p_ijk->dcos_dk );
+                                 */
+                                rvec_ScaledAdd( pbond_ij->f, CEval8, p_ijk->dcos_di );
+                                rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( pbond_jk->f, CEval8, p_ijk->dcos_dk );
+
+
+                            }
+                            else {
+                                /* terms not related to bond order derivatives
+                                   are added directly into 
+                                   forces and pressure vector/tensor */
+                                rvec_Scale( force, CEval8, p_ijk->dcos_di );
+                                //atomic_rvecAdd( atoms[i].f, force );
+                                rvec_Add( pbond_ij->f, force );
+
+                                rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                                //atomic_rvecAdd( data->ext_press, ext_press );
+                                rvec_Add( aux_ext_press [j], ext_press );
+
+                                //atomic_rvecScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+                                rvec_ScaledAdd( atoms[j].f, CEval8, p_ijk->dcos_dj );
+
+                                rvec_Scale( force, CEval8, p_ijk->dcos_dk );
+                                //atomic_rvecAdd( atoms[k].f, force );
+                                rvec_Add( pbond_jk->f, force );
+                                rvec_iMultiply( ext_press, pbond_jk->rel_box, force );
+                                //atomic_rvecAdd( data->ext_press, ext_press );
+                                rvec_Add( aux_ext_press [j], ext_press );
+
+
+                                /* This part is for a fully-flexible box */
+                                /* rvec_OuterProduct( temp_rtensor, 
+                                   p_ijk->dcos_di, system->atoms[i].x );
+                                   rtensor_Scale( total_rtensor, +CEval8, temp_rtensor );
+
+                                   rvec_OuterProduct( temp_rtensor, 
+                                   p_ijk->dcos_dj, system->atoms[j].x );
+                                   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+
+                                   rvec_OuterProduct( temp_rtensor, 
+                                   p_ijk->dcos_dk, system->atoms[k].x );
+                                   rtensor_ScaledAdd(total_rtensor, CEval8, temp_rtensor);
+
+                                   if( pbond_ij->imaginary || pbond_jk->imaginary )
+                                   rtensor_ScaledAdd( data->flex_bar.P, 
+                                   -1.0, total_rtensor );
+                                   else
+                                   rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                            }
 
 #ifdef TEST_ENERGY
-							//TODO -- check this
-							//		fprintf( out_control->eval, 
-							//"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
-							//			 "%6d%6d%6d%23.15e%23.15e%23.15e\n",
-							//			 i+1, j+1, k+1,
-							//workspace->orig_id[i]+1,  
-							//workspace->orig_id[j]+1,
-							//workspace->orig_id[k]+1,
-							//workspace->Delta_boc[j], 
-							//			 RAD2DEG(theta), /*BOA_ij, BOA_jk, */
-							//			 e_ang, data->E_Ang );
-
-							/*fprintf( out_control->eval, 
-							  "%23.15e%23.15e%23.15e%23.15e",
-							  p_val3, p_val4, BOA_ij, BOA_jk );
-							  fprintf( out_control->eval, 
-							  "%23.15e%23.15e%23.15e%23.15e",
-							  f7_ij, f7_jk, f8_Dj, expval12theta );
-							  fprintf( out_control->eval, 
-							  "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-							  CEval1, CEval2, CEval3, CEval4, CEval5
-							//CEval6, CEval7, CEval8  );*/
-
-							/*fprintf( out_control->eval, 
-							  "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-							  -p_ijk->dcos_di[0]/sin_theta, 
-							  -p_ijk->dcos_di[1]/sin_theta, 
-							  -p_ijk->dcos_di[2]/sin_theta, 
-							  -p_ijk->dcos_dj[0]/sin_theta, 
-							  -p_ijk->dcos_dj[1]/sin_theta, 
-							  -p_ijk->dcos_dj[2]/sin_theta, 
-							  -p_ijk->dcos_dk[0]/sin_theta, 
-							  -p_ijk->dcos_dk[1]/sin_theta, 
-							  -p_ijk->dcos_dk[2]/sin_theta );*/
-
-							/* fprintf( out_control->epen, 
-							   "%23.15e%23.15e%23.15e\n", 
-							   CEpen1, CEpen2, CEpen3 );
-							   fprintf( out_control->epen, 
-							   "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-							   workspace->orig_id[i],  workspace->orig_id[j],
-							   workspace->orig_id[k], RAD2DEG(theta), 
-							   BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
-
-							//		fprintf( out_control->ecoa, 
-							//			 "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-							//			 workspace->orig_id[i], 
-							//			 workspace->orig_id[j],
-							//			 workspace->orig_id[k], 
-							//			 RAD2DEG(theta), BOA_ij, BOA_jk, 
-							//			 e_coa, data->E_Coa );
+                            //TODO -- check this
+                            //        fprintf( out_control->eval, 
+                            //"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e",
+                            //             "%6d%6d%6d%23.15e%23.15e%23.15e\n",
+                            //             i+1, j+1, k+1,
+                            //workspace->orig_id[i]+1,  
+                            //workspace->orig_id[j]+1,
+                            //workspace->orig_id[k]+1,
+                            //workspace->Delta_boc[j], 
+                            //             RAD2DEG(theta), /*BOA_ij, BOA_jk, */
+                            //             e_ang, data->E_Ang );
+
+                            /*fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e",
+                              p_val3, p_val4, BOA_ij, BOA_jk );
+                              fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e",
+                              f7_ij, f7_jk, f8_Dj, expval12theta );
+                              fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                              CEval1, CEval2, CEval3, CEval4, CEval5
+                            //CEval6, CEval7, CEval8  );*/
+
+                            /*fprintf( out_control->eval, 
+                              "%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                              -p_ijk->dcos_di[0]/sin_theta, 
+                              -p_ijk->dcos_di[1]/sin_theta, 
+                              -p_ijk->dcos_di[2]/sin_theta, 
+                              -p_ijk->dcos_dj[0]/sin_theta, 
+                              -p_ijk->dcos_dj[1]/sin_theta, 
+                              -p_ijk->dcos_dj[2]/sin_theta, 
+                              -p_ijk->dcos_dk[0]/sin_theta, 
+                              -p_ijk->dcos_dk[1]/sin_theta, 
+                              -p_ijk->dcos_dk[2]/sin_theta );*/
+
+                            /* fprintf( out_control->epen, 
+                               "%23.15e%23.15e%23.15e\n", 
+                               CEpen1, CEpen2, CEpen3 );
+                               fprintf( out_control->epen, 
+                               "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                               workspace->orig_id[i],  workspace->orig_id[j],
+                               workspace->orig_id[k], RAD2DEG(theta), 
+                               BOA_ij, BOA_jk, e_pen, data->E_Pen ); */
+
+                            //        fprintf( out_control->ecoa, 
+                            //             "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                            //             workspace->orig_id[i], 
+                            //             workspace->orig_id[j],
+                            //             workspace->orig_id[k], 
+                            //             RAD2DEG(theta), BOA_ij, BOA_jk, 
+                            //             e_coa, data->E_Coa );
 #endif
 
 #ifdef TEST_FORCES            /* angle forces */
-							//TODO -- check this
-							/*
-							   Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
-							   Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
-							   Add_dDelta( system, lists, 
-							   j, CEval3 + CEval7, workspace->f_ang );
-
-							   for( t = start_j; t < end_j; ++t ) {
-							   pbond_jt = &( bond_list[t] );
-							   bo_jt = &(pbond_jt->bo_data);
-							   temp_bo_jt = bo_jt->BO;
-							   temp = CUBE( temp_bo_jt );
-							   pBOjt7 = temp * temp * temp_bo_jt; 
-
-							   Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
-							   workspace->f_ang );
-							   Add_dBOpinpi2( system, lists, j, t, 
-							   CEval5, CEval5, 
-							   workspace->f_ang, workspace->f_ang );
-							   }
-
-							   rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
-							   rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
-							   rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
-							// end angle forces 
-
-							// penalty forces 
-							Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
-							Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
-							Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
-							// end penalty forces 
-
-							// coalition forces 
-							Add_dBO( system, lists, 
-							j, pi, CEcoa1-CEcoa4, workspace->f_coa );
-							Add_dBO( system, lists, 
-							j, pk, CEcoa2-CEcoa5, workspace->f_coa );
-							Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
-							Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
-							Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
-							// end coalition forces 
-
-							 */
+                            //TODO -- check this
+                            /*
+                               Add_dBO( system, lists, j, pi, CEval1, workspace->f_ang );
+                               Add_dBO( system, lists, j, pk, CEval2, workspace->f_ang );
+                               Add_dDelta( system, lists, 
+                               j, CEval3 + CEval7, workspace->f_ang );
+
+                               for( t = start_j; t < end_j; ++t ) {
+                               pbond_jt = &( bond_list[t] );
+                               bo_jt = &(pbond_jt->bo_data);
+                               temp_bo_jt = bo_jt->BO;
+                               temp = CUBE( temp_bo_jt );
+                               pBOjt7 = temp * temp * temp_bo_jt; 
+
+                               Add_dBO( system, lists, j, t, pBOjt7 * CEval6,
+                               workspace->f_ang );
+                               Add_dBOpinpi2( system, lists, j, t, 
+                               CEval5, CEval5, 
+                               workspace->f_ang, workspace->f_ang );
+                               }
+
+                               rvec_ScaledAdd( workspace->f_ang[i], CEval8, p_ijk->dcos_di );
+                               rvec_ScaledAdd( workspace->f_ang[j], CEval8, p_ijk->dcos_dj );
+                               rvec_ScaledAdd( workspace->f_ang[k], CEval8, p_ijk->dcos_dk );
+                            // end angle forces 
+
+                            // penalty forces 
+                            Add_dDelta( system, lists, j, CEpen1, workspace->f_pen );
+                            Add_dBO( system, lists, j, pi, CEpen2, workspace->f_pen );
+                            Add_dBO( system, lists, j, pk, CEpen3, workspace->f_pen );
+                            // end penalty forces 
+
+                            // coalition forces 
+                            Add_dBO( system, lists, 
+                            j, pi, CEcoa1-CEcoa4, workspace->f_coa );
+                            Add_dBO( system, lists, 
+                            j, pk, CEcoa2-CEcoa5, workspace->f_coa );
+                            Add_dDelta( system, lists, j, CEcoa3, workspace->f_coa );
+                            Add_dDelta( system, lists, i, CEcoa4, workspace->f_coa );
+                            Add_dDelta( system, lists, k, CEcoa5, workspace->f_coa );
+                            // end coalition forces 
+
+                             */
 #endif
-						}
-					}
-				}
-			}
-		}
+                        }
+                    }
+                }
+            }
+        }
 
-		Set_End_Index(pi, num_thb_intrs, thb_intrs );
-	}
-	//  } // end of the main for loop here
+        Set_End_Index(pi, num_thb_intrs, thb_intrs );
+    }
+    //  } // end of the main for loop here
 
 
-	//TODO - to be done on the CPU
-	/*
+    //TODO - to be done on the CPU
+    /*
 
-	   if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
-	   workspace->realloc.num_3body = num_thb_intrs;
-	   if( num_thb_intrs > thb_intrs->num_intrs ) {
-	   fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
-	   data->step, num_thb_intrs, thb_intrs->num_intrs );
-	   exit( INSUFFICIENT_SPACE );
-	   }
-	   }
-	 */
+       if( num_thb_intrs >= thb_intrs->num_intrs * DANGER_ZONE ) {
+       workspace->realloc.num_3body = num_thb_intrs;
+       if( num_thb_intrs > thb_intrs->num_intrs ) {
+       fprintf( stderr, "step%d-ran out of space on angle_list: top=%d, max=%d",
+       data->step, num_thb_intrs, thb_intrs->num_intrs );
+       exit( INSUFFICIENT_SPACE );
+       }
+       }
+     */
 
-	//fprintf( stderr,"%d: Number of angle interactions: %d\n", 
-	// data->step, num_thb_intrs );
+    //fprintf( stderr,"%d: Number of angle interactions: %d\n", 
+    // data->step, num_thb_intrs );
 
 #ifdef TEST_ENERGY
-	/*
-	   fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
+    /*
+       fprintf( stderr,"Number of angle interactions: %d\n", num_thb_intrs );
 
-	   fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
-	   data->E_Ang, data->E_Pen, data->E_Coa );
+       fprintf( stderr,"Angle Energy:%g\t Penalty Energy:%g\t Coalition Energy:%g\n",
+       data->E_Ang, data->E_Pen, data->E_Coa );
 
-	   fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
-	   data->ext_press[0], data->ext_press[1], data->ext_press[2] );
-	 */
+       fprintf( stderr,"3body: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+     */
 #endif
 }
 
 
-GLOBAL void Three_Body_Interactions_results ( 	reax_atom *atoms, control_params *control,
-		static_storage p_workspace, 
-		list p_bonds, int N )
+GLOBAL void Three_Body_Interactions_results (     reax_atom *atoms, control_params *control, // kernel: folds per-bond partials (CdDelta_ij, f) into per-atom arrays; control is not read here
+        static_storage p_workspace, // passed by value; CdDelta[] is updated through it
+        list p_bonds, int N ) // bond list passed by value; threads with index >= N exit early
 {
-	int i, pj;
+    int i, pj; // i: atom handled by this thread; pj: cursor over i's bond range
 
-	bond_data *pbond;
-	bond_data *sym_index_bond;
-	list *bonds = &p_bonds;
-	static_storage *workspace = &p_workspace;
+    bond_data *pbond; // bond entry pj of atom i
+    bond_data *sym_index_bond; // entry at pbond->sym_index; presumably the neighbour's record of the same bond -- TODO confirm
+    list *bonds = &p_bonds; // pointer to the local copy, as taken by Start_Index/End_Index
+    static_storage *workspace = &p_workspace; // pointer alias of the by-value workspace copy
 
-	i = blockIdx.x * blockDim.x + threadIdx.x;
+    i = blockIdx.x * blockDim.x + threadIdx.x; // flat 1D global thread index, used as the atom index
 
-	if ( i >= N) return;
+    if ( i >= N) return; // guard for the partially filled last block
 
-	for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+    for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){ // visit every bond slot of atom i
 
-		pbond = &(bonds->select.bond_list[pj]);
-		sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+        pbond = &(bonds->select.bond_list[pj]);
+        sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] ); // jump to the paired entry named by sym_index
 
-		workspace->CdDelta [i] += sym_index_bond->CdDelta_ij;
+        workspace->CdDelta [i] += sym_index_bond->CdDelta_ij; // only this thread writes CdDelta[i], so no atomic is used
 
-		rvec_Add (atoms[i].f, sym_index_bond->f );
-	}
+        rvec_Add (atoms[i].f, sym_index_bond->f ); // add the force stored on the paired entry to atom i's force
+    }
 }
 
 
@@ -1201,78 +1201,78 @@ GLOBAL void Three_Body_Interactions_results ( 	reax_atom *atoms, control_params
 /* this is a 3-body interaction in which the main role is 
    played by j which sits in the middle of the other two. */
 GLOBAL void Three_Body_Estimate ( reax_atom *atoms, 
-		control_params *control,
-		list p_bonds, int N, 
-		int *count)
+        control_params *control, // only control->thb_cut is read in this kernel
+        list p_bonds, int N, // bond list passed by value; threads with index >= N exit early
+        int *count) // out: count[pi] = number of j-k partners found for bond slot pi (presumably used to size the three-body list -- TODO confirm)
 {
-	int  i, j, pi, k, pk, t;
-	int  type_i, type_j, type_k;
-	int  start_j, end_j ;
-	int  flag, cnt, num_thb_intrs;
+    int  i, j, pi, k, pk, t; // NOTE(review): k and t are never used in this kernel
+    int  type_i, type_j, type_k; // NOTE(review): type_i/type_j are assigned but never read; type_k is unused
+    int  start_j, end_j ; // bounds of atom j's bond range
+    int  flag, cnt, num_thb_intrs; // NOTE(review): flag and cnt are unused; num_thb_intrs is the per-bond tally
 
-	real r_ij, r_jk;
-	real BOA_ij, BOA_jk;
-	list *bonds;
+    real r_ij, r_jk; // NOTE(review): r_ij is assigned but never read; r_jk is unused
+    real BOA_ij, BOA_jk; // bond order minus thb_cut for bonds i-j and j-k
+    list *bonds;
 
-	bond_order_data *bo_ij, *bo_jk, *bo_jt;
-	bond_data *bond_list;
-	bond_data *pbond_ij, *pbond_jk, *pbond_jt;
+    bond_order_data *bo_ij, *bo_jk, *bo_jt; // NOTE(review): bo_jt is unused
+    bond_data *bond_list;
+    bond_data *pbond_ij, *pbond_jk, *pbond_jt; // NOTE(review): pbond_jt is unused
 
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= N) return;
+    j = blockIdx.x * blockDim.x + threadIdx.x; // flat 1D global thread index, used as the central atom j
+    if (j >= N) return; // guard for the partially filled last block
 
-	bonds = &p_bonds;
-	bond_list = bonds->select.bond_list;
+    bonds = &p_bonds; // pointer to the local copy, as taken by Start_Index/End_Index
+    bond_list = bonds->select.bond_list;
 
-	type_j = atoms[j].type;
-	start_j = Start_Index(j, bonds);
-	end_j = End_Index(j, bonds);
+    type_j = atoms[j].type;
+    start_j = Start_Index(j, bonds);
+    end_j = End_Index(j, bonds);
 
 
-	for( pi = start_j; pi < end_j; ++pi ) {
+    for( pi = start_j; pi < end_j; ++pi ) { // each bond slot pi of atom j gets its own count
 
-		num_thb_intrs = 0;
-		count [pi] = 0;
+        num_thb_intrs = 0;
+        count [pi] = 0; // redundant: count[pi] is overwritten unconditionally at the end of this iteration
 
-		pbond_ij = &(bond_list[pi]);
-		bo_ij = &(pbond_ij->bo_data);
-		BOA_ij = bo_ij->BO - control->thb_cut;
+        pbond_ij = &(bond_list[pi]);
+        bo_ij = &(pbond_ij->bo_data);
+        BOA_ij = bo_ij->BO - control->thb_cut; // positive only when the bond order exceeds the cutoff
 
-		if( BOA_ij/*bo_ij->BO*/ > 0.0 ) {
-			i = pbond_ij->nbr;
-			r_ij = pbond_ij->d;	 
-			type_i = atoms[i].type;
+        if( BOA_ij/*bo_ij->BO*/ > 0.0 ) { // bonds at or below the cutoff keep a count of 0
+            i = pbond_ij->nbr;
+            r_ij = pbond_ij->d;     
+            type_i = atoms[i].type;
 
-			/*
-			   for( pk = start_j; pk < pi; ++pk ) {
-			   start_pk = Start_Index( pk, thb_intrs );
-			   end_pk = End_Index( pk, thb_intrs );
+            /*
+               for( pk = start_j; pk < pi; ++pk ) {
+               start_pk = Start_Index( pk, thb_intrs );
+               end_pk = End_Index( pk, thb_intrs );
 
-			   for( t = start_pk; t < end_pk; ++t )
-			   if( thb_list[t].thb == i ) {
+               for( t = start_pk; t < end_pk; ++t )
+               if( thb_list[t].thb == i ) {
 
-			   ++num_thb_intrs;
-			   break;
-			   }
-			   }
-			 */
+               ++num_thb_intrs;
+               break;
+               }
+               }
+             */
 
-			/* and this is the second for loop mentioned above */
-			for( pk = start_j; pk < end_j; ++pk ) {
-				if (pk == pi) continue;
+            /* count every bond j-k (pk != pi) whose bond order also exceeds thb_cut */
+            for( pk = start_j; pk < end_j; ++pk ) { // scans all of j's bonds, both before and after pi
+                if (pk == pi) continue; // a bond cannot pair with itself
 
-				pbond_jk = &(bond_list[pk]);
-				bo_jk    = &(pbond_jk->bo_data);
-				BOA_jk   = bo_jk->BO - control->thb_cut;
+                pbond_jk = &(bond_list[pk]);
+                bo_jk    = &(pbond_jk->bo_data);
+                BOA_jk   = bo_jk->BO - control->thb_cut;
 
-				if (BOA_jk <= 0) continue;
+                if (BOA_jk <= 0) continue; // partner bond is at or below the cutoff
 
-				++num_thb_intrs;
-			}
-		}
+                ++num_thb_intrs;
+            }
+        }
 
-		count [pi] = num_thb_intrs;
-	}
+        count [pi] = num_thb_intrs; // publish the tally for bond slot pi
+    }
 }
 
 
@@ -1295,224 +1295,224 @@ GLOBAL void Three_Body_Estimate ( reax_atom *atoms,
 
 
 void Hydrogen_Bonds( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, j, k, pi, pk, itr, top;
-	int type_i, type_j, type_k;
-	int start_j, end_j, hb_start_j, hb_end_j;
-	int hblist[MAX_BONDS];
-	int num_hb_intrs = 0;
-	real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-	real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-	rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-	rvec dvec_jk, force, ext_press;
-	ivec rel_jk;
-	// rtensor temp_rtensor, total_rtensor;
-	hbond_parameters *hbp;
-	bond_order_data *bo_ij;
-	bond_data *pbond_ij;
-	far_neighbor_data *nbr_jk;
-	list *bonds, *hbonds;
-	bond_data *bond_list;
-	hbond_data *hbond_list;
-
-	bonds = (*lists) + BONDS;
-	bond_list = bonds->select.bond_list;
-
-	hbonds = (*lists) + HBONDS;
-	hbond_list = hbonds->select.hbond_list;
-
-	/* loops below discover the Hydrogen bonds between i-j-k triplets.
-	   here j is H atom and there has to be some bond between i and j.
-	   Hydrogen bond is between j and k.
-	   so in this function i->X, j->H, k->Z when we map 
-	   variables onto the ones in the handout.*/
-	for( j = 0; j < system->N; ++j )
-		if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
-			/*set j's variables */
-			type_j  = system->atoms[j].type;
-			start_j = Start_Index(j, bonds);
-			end_j   = End_Index(j, bonds);
-			hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-			hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-
-			top = 0;
-			for( pi = start_j; pi < end_j; ++pi ) {
-				pbond_ij = &( bond_list[pi] );
-				i = pbond_ij->nbr;
-				bo_ij = &(pbond_ij->bo_data);
-				type_i = system->atoms[i].type;
-
-				if( system->reaxprm.sbp[type_i].p_hbond == 2 && 
-						bo_ij->BO >= HB_THRESHOLD )
-					hblist[top++] = pi;
-			}
-
-			// fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-			//          j, top, hb_start_j, hb_end_j );
-
-			for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
-				/* set k's varibles */
-				k = hbond_list[pk].nbr;
-				type_k = system->atoms[k].type;
-				nbr_jk = hbond_list[pk].ptr;
-				r_jk = nbr_jk->d;
-				rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-
-				for( itr=0; itr < top; ++itr ) {
-					pi = hblist[itr];
-					pbond_ij = &( bond_list[pi] );
-					i = pbond_ij->nbr;
-
-					if( i != k ) {
-						bo_ij = &(pbond_ij->bo_data);
-						type_i = system->atoms[i].type;
-						r_ij = pbond_ij->d;	     
-						hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, &system->reaxprm) ]);
-						++num_hb_intrs;
-
-						Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-								&theta, &cos_theta );
-						/* the derivative of cos(theta) */
-						Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-								&dcos_theta_di, &dcos_theta_dj, 
-								&dcos_theta_dk );
-
-						/* hydrogen bond energy*/
-						sin_theta2 = SIN( theta/2.0 );
-						sin_xhz4 = SQR(sin_theta2);
-						sin_xhz4 *= sin_xhz4;
-						cos_xhz1 = ( 1.0 - cos_theta );
-						exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-						exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-									r_jk / hbp->r0_hb - 2.0 ) );
-
-						data->E_HB += e_hb = 
-							hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-
-						CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-						CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-						CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-								1.0 / hbp->r0_hb);
-
-						/* hydrogen bond forces */
-						bo_ij->Cdbo += CEhb1;   // dbo term
-
-						if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-							rvec_ScaledAdd( system->atoms[i].f, 
-									+CEhb2, dcos_theta_di ); //dcos terms
-							rvec_ScaledAdd( system->atoms[j].f, 
-									+CEhb2, dcos_theta_dj );
-
-
-
-
-							//TODO
-							rvec_ScaledAdd( system->atoms[k].f, 
-									+CEhb2, dcos_theta_dk );
-
-							//dr terms
-							rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
-
-							//TODO
-							rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
-						}
-						else
-						{
-							/* for pressure coupling, terms that are not related 
-							   to bond order derivatives are added directly into 
-							   pressure vector/tensor */
-							rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-							rvec_Add( system->atoms[i].f, force );
-							rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-							rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
-
-							rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj );
-
-							ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-							rvec_Scale( force, +CEhb2, dcos_theta_dk );
-
-
-
-							//TODO
-							rvec_Add( system->atoms[k].f, force );
-
-
+    int i, j, k, pi, pk, itr, top;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    int num_hb_intrs = 0;
+    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk, force, ext_press;
+    ivec rel_jk;
+    // rtensor temp_rtensor, total_rtensor;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list;
+
+    bonds = (*lists) + BONDS;
+    bond_list = bonds->select.bond_list;
+
+    hbonds = (*lists) + HBONDS;
+    hbond_list = hbonds->select.hbond_list;
+
+    /* The loops below discover the hydrogen bonds between i-j-k triplets.
+       Here j is the H atom, and there has to be some bond between i and j.
+       The hydrogen bond itself is between j and k.
+       So in this function i->X, j->H, k->Z when we map
+       variables onto the ones in the handout. */
+    for( j = 0; j < system->N; ++j )
+        if( system->reaxprm.sbp[system->atoms[j].type].p_hbond==1 ) {// j must be H
+            /*set j's variables */
+            type_j  = system->atoms[j].type;
+            start_j = Start_Index(j, bonds);
+            end_j   = End_Index(j, bonds);
+            hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+            hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+
+            top = 0;
+            for( pi = start_j; pi < end_j; ++pi ) {
+                pbond_ij = &( bond_list[pi] );
+                i = pbond_ij->nbr;
+                bo_ij = &(pbond_ij->bo_data);
+                type_i = system->atoms[i].type;
+
+                if( system->reaxprm.sbp[type_i].p_hbond == 2 && 
+                        bo_ij->BO >= HB_THRESHOLD )
+                    hblist[top++] = pi;
+            }
+
+            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+            //          j, top, hb_start_j, hb_end_j );
+
+            for( pk = hb_start_j; pk < hb_end_j; ++pk ) {
+                /* set k's variables */
+                k = hbond_list[pk].nbr;
+                type_k = system->atoms[k].type;
+                nbr_jk = hbond_list[pk].ptr;
+                r_jk = nbr_jk->d;
+                rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+
+                for( itr=0; itr < top; ++itr ) {
+                    pi = hblist[itr];
+                    pbond_ij = &( bond_list[pi] );
+                    i = pbond_ij->nbr;
+
+                    if( i != k ) {
+                        bo_ij = &(pbond_ij->bo_data);
+                        type_i = system->atoms[i].type;
+                        r_ij = pbond_ij->d;         
+                        hbp = &(system->reaxprm.hbp[ index_hbp(type_i, type_j, type_k, &system->reaxprm) ]);
+                        ++num_hb_intrs;
+
+                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                &theta, &cos_theta );
+                        /* the derivative of cos(theta) */
+                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                &dcos_theta_di, &dcos_theta_dj, 
+                                &dcos_theta_dk );
+
+                        /* hydrogen bond energy*/
+                        sin_theta2 = SIN( theta/2.0 );
+                        sin_xhz4 = SQR(sin_theta2);
+                        sin_xhz4 *= sin_xhz4;
+                        cos_xhz1 = ( 1.0 - cos_theta );
+                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                    r_jk / hbp->r0_hb - 2.0 ) );
+
+                        data->E_HB += e_hb = 
+                            hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+
+                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
+                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
+                                1.0 / hbp->r0_hb);
+
+                        /* hydrogen bond forces */
+                        bo_ij->Cdbo += CEhb1;   // dbo term
+
+                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
+                            rvec_ScaledAdd( system->atoms[i].f, 
+                                    +CEhb2, dcos_theta_di ); //dcos terms
+                            rvec_ScaledAdd( system->atoms[j].f, 
+                                    +CEhb2, dcos_theta_dj );
+
+
+
+
+                            //TODO
+                            rvec_ScaledAdd( system->atoms[k].f, 
+                                    +CEhb2, dcos_theta_dk );
+
+                            //dr terms
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
+
+
+                            //TODO
+                            rvec_ScaledAdd( system->atoms[k].f, +CEhb3/r_jk, dvec_jk );
+                        }
+                        else
+                        {
+                            /* for pressure coupling, terms that are not related 
+                               to bond order derivatives are added directly into 
+                               pressure vector/tensor */
+                            rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                            rvec_Add( system->atoms[i].f, force );
+                            rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+
+                            rvec_ScaledAdd( system->atoms[j].f, +CEhb2, dcos_theta_dj );
+
+                            ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                            rvec_Scale( force, +CEhb2, dcos_theta_dk );
+
+
+
+                            //TODO
+                            rvec_Add( system->atoms[k].f, force );
+
+
 
-							rvec_iMultiply( ext_press, rel_jk, force );
-							rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                            rvec_iMultiply( ext_press, rel_jk, force );
+                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
 
-							//dr terms
-							rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            //dr terms
+                            rvec_ScaledAdd( system->atoms[j].f, -CEhb3/r_jk, dvec_jk );
 
-							rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-							rvec_Add( system->atoms[k].f, force );
-							rvec_iMultiply( ext_press, rel_jk, force );
-							rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
+                            rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                            rvec_Add( system->atoms[k].f, force );
+                            rvec_iMultiply( ext_press, rel_jk, force );
+                            rvec_ScaledAdd( data->ext_press, 1.0, ext_press );
 
-							/* This part is intended for a fully-flexible box */
-							/* rvec_OuterProduct( temp_rtensor, 
-							   dcos_theta_di, system->atoms[i].x );
-							   rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
+                            /* This part is intended for a fully-flexible box */
+                            /* rvec_OuterProduct( temp_rtensor, 
+                               dcos_theta_di, system->atoms[i].x );
+                               rtensor_Scale( total_rtensor, -CEhb2, temp_rtensor );
 
-							   rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
-							   -CEhb3/r_jk, pbond_jk->dvec );
-							   rvec_OuterProduct( temp_rtensor, 
-							   temp_rvec, system->atoms[j].x );
-							   rtensor_Add( total_rtensor, temp_rtensor );
+                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dj,
+                               -CEhb3/r_jk, pbond_jk->dvec );
+                               rvec_OuterProduct( temp_rtensor, 
+                               temp_rvec, system->atoms[j].x );
+                               rtensor_Add( total_rtensor, temp_rtensor );
 
-							   rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
-							   +CEhb3/r_jk, pbond_jk->dvec );
-							   rvec_OuterProduct( temp_rtensor, 
-							   temp_rvec, system->atoms[k].x );
-							   rtensor_Add( total_rtensor, temp_rtensor );
+                               rvec_ScaledSum( temp_rvec, -CEhb2, dcos_theta_dk,
+                               +CEhb3/r_jk, pbond_jk->dvec );
+                               rvec_OuterProduct( temp_rtensor, 
+                               temp_rvec, system->atoms[k].x );
+                               rtensor_Add( total_rtensor, temp_rtensor );
 
-							   if( pbond_ij->imaginary || pbond_jk->imaginary )
-							   rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-							   else
-							   rtensor_Add( data->flex_bar.P, total_rtensor ); */
-						}
+                               if( pbond_ij->imaginary || pbond_jk->imaginary )
+                               rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                               else
+                               rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                        }
 
 #ifdef TEST_ENERGY
-						/*fprintf( out_control->ehb, 
-						  "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
-						  dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
-						  dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
-						  dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
-						  fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
-						  CEhb1, CEhb2, CEhb3 ); */
-						fprintf( stderr, //out_control->ehb, 
-								"%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
-								workspace->orig_id[i], 
-								workspace->orig_id[j], 
-								workspace->orig_id[k], 
-								r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
+                        /*fprintf( out_control->ehb, 
+                          "%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n%23.15e%23.15e%23.15e\n",
+                          dcos_theta_di[0], dcos_theta_di[1], dcos_theta_di[2], 
+                          dcos_theta_dj[0], dcos_theta_dj[1], dcos_theta_dj[2], 
+                          dcos_theta_dk[0], dcos_theta_dk[1], dcos_theta_dk[2]);
+                          fprintf( out_control->ehb, "%23.15e%23.15e%23.15e\n",
+                          CEhb1, CEhb2, CEhb3 ); */
+                        fprintf( stderr, //out_control->ehb, 
+                                "%6d%6d%6d%23.15e%23.15e%23.15e%23.15e%23.15e\n",
+                                workspace->orig_id[i], 
+                                workspace->orig_id[j], 
+                                workspace->orig_id[k], 
+                                r_jk, theta, bo_ij->BO, e_hb, data->E_HB );
 
 #endif
 #ifdef TEST_FORCES
-						// dbo term
-						Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
-						// dcos terms
-						rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); 
-						rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
-						rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
-						// dr terms
-						rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk );
-						rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
+                        // dbo term
+                        Add_dBO( system, lists, j, pi, +CEhb1, workspace->f_hb );
+                        // dcos terms
+                        rvec_ScaledAdd( workspace->f_hb[i], +CEhb2, dcos_theta_di ); 
+                        rvec_ScaledAdd( workspace->f_hb[j], +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb2, dcos_theta_dk );
+                        // dr terms
+                        rvec_ScaledAdd( workspace->f_hb[j], -CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( workspace->f_hb[k], +CEhb3/r_jk, dvec_jk );
 #endif
-					}
-				}
-			}
-		}
+                    }
+                }
+            }
+        }
 
-	/* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", 
-	   data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
+    /* fprintf( stderr, "hydbonds: ext_press (%23.15e %23.15e %23.15e)\n", 
+       data->ext_press[0], data->ext_press[1], data->ext_press[2] ); */
 
 #ifdef TEST_FORCES
-	fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs );
-	fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB );
+    fprintf( stderr, "Number of hydrogen bonds: %d\n", num_hb_intrs );
+    fprintf( stderr, "Hydrogen Bond Energy: %g\n", data->E_HB );
 #endif
 }
 
@@ -1525,740 +1525,740 @@ void Hydrogen_Bonds( reax_system *system, control_params *control,
 // Cuda Function
 ////////////////////////////////////////////////////////////////////
 
-GLOBAL void Hydrogen_Bonds (	reax_atom *atoms,
-		single_body_parameters *sbp,
-		hbond_parameters *d_hbp,
-		control_params *control,
-		simulation_data *data,
-		static_storage p_workspace, 
-		list p_bonds, list p_hbonds,
-		int N, int num_atom_types, 
-		real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
+GLOBAL void Hydrogen_Bonds (    reax_atom *atoms,
+        single_body_parameters *sbp,
+        hbond_parameters *d_hbp,
+        control_params *control,
+        simulation_data *data,
+        static_storage p_workspace, 
+        list p_bonds, list p_hbonds,
+        int N, int num_atom_types, 
+        real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
 {
-	extern __shared__ real t_hb[];
-	extern __shared__ real t_f[];
-	//extern __shared__ rvec t_cdbo[];
-	//extern __shared__ rvec t_hf [];
-
-	real *sh_hb = t_hb;
-	rvec *sh_atomf = (rvec *)(t_hb + blockDim.x);
-	//real *sh_cdbo = t_hb + blockDim.x;
-	//rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
-
-	int i, j, k, pi, pk, itr, top;
-	int type_i, type_j, type_k;
-	int start_j, end_j, hb_start_j, hb_end_j;
-	int hblist[MAX_BONDS];
-	int num_hb_intrs = 0;
-	real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-	real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-	rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-	rvec dvec_jk, force, ext_press;
-	ivec rel_jk;
-	// rtensor temp_rtensor, total_rtensor;
-	hbond_parameters *hbp;
-	bond_order_data *bo_ij;
-	bond_data *pbond_ij;
-	far_neighbor_data *nbr_jk;
-	list *bonds, *hbonds;
-	bond_data *bond_list;
-	hbond_data *hbond_list, *hbond_jk;
-	static_storage *workspace = &p_workspace;
-
-	j = blockIdx.x * blockDim.x + threadIdx.x;
-	if (j >= N) return;
-
-	//j = blockIdx.x;
-
-	bonds = &p_bonds;
-	bond_list = bonds->select.bond_list;
-
-	hbonds = &p_hbonds;
-	hbond_list = hbonds->select.hbond_list;
-
-	// loops below discover the Hydrogen bonds between i-j-k triplets.
-	// here j is H atom and there has to be some bond between i and j.
-	// Hydrogen bond is between j and k.
-	// so in this function i->X, j->H, k->Z when we map 
-	// variables onto the ones in the handout.
-
-	//for( j = 0; j < system->N; ++j )
-	sh_hb [threadIdx.x] = 0;
-	rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-
-	if( sbp[atoms[j].type].p_hbond==1) {// j must be H
-		//set j's variables 
-		type_j  = atoms[j].type;
-		start_j = Start_Index(j, bonds);
-		end_j   = End_Index(j, bonds);
-		hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-		hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-
-		top = 0;
-		for( pi = start_j; pi < end_j; ++pi ) {
-			pbond_ij = &( bond_list[pi] );
-			i = pbond_ij->nbr;
-			bo_ij = &(pbond_ij->bo_data);
-			type_i = atoms[i].type;
-
-			if( sbp[type_i].p_hbond == 2 && 
-					bo_ij->BO >= HB_THRESHOLD )
-				hblist[top++] = pi;
-		}
-
-		// fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-		//          j, top, hb_start_j, hb_end_j );
-
-		for( pk = hb_start_j; pk < hb_end_j; ++pk )
-			//pk = hb_start_j + threadIdx.x;
-			//while (pk < hb_end_j)
-		{
-			// set k's varibles 
-			//TODO
-			hbond_jk = &( hbond_list[pk] );
-			//TODO
-			k = hbond_list[pk].nbr;
-			type_k = atoms[k].type;
-			nbr_jk = hbond_list[pk].ptr;
-			r_jk = nbr_jk->d;
-			rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-
-			//TODO Double check this Hydrogen Bonds fix
-			//rvec_MakeZero ( nbr_jk->h_f );
-			rvec_MakeZero ( hbond_jk->h_f );
-			//TODO Double check this Hydrogen Bonds fix
-
-			//sh_hb [threadIdx.x] = 0;
-
-
-			//itr = threadIdx.x;
-			for( itr=0; itr < top; ++itr ) {
-				//while (itr < top) {
-				pi = hblist[itr];
-				pbond_ij = &( bond_list[pi] );
-				i = pbond_ij->nbr;
-
-				//TODO
-				//rvec_MakeZero (sh_hf [threadIdx.x]);
-				//sh_cdbo [threadIdx.x] = 0;
-
-				//rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-
-
-				if( i != k ) {
-					bo_ij = &(pbond_ij->bo_data);
-					type_i = atoms[i].type;
-					r_ij = pbond_ij->d;	     
-					hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
-					++num_hb_intrs;
-
-					Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-							&theta, &cos_theta );
-					// the derivative of cos(theta)
-					Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-							&dcos_theta_di, &dcos_theta_dj, 
-							&dcos_theta_dk );
-
-					// hydrogen bond energy
-					sin_theta2 = SIN( theta/2.0 );
-					sin_xhz4 = SQR(sin_theta2);
-					sin_xhz4 *= sin_xhz4;
-					cos_xhz1 = ( 1.0 - cos_theta );
-					exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-					exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-								r_jk / hbp->r0_hb - 2.0 ) );
-
-					//PERFORMANCE IMPACT
-					e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-					//atomicAdd ( &data->E_HB, e_hb );
-					//E_HB [j] += e_hb;
-					sh_hb [threadIdx.x] += e_hb;
-
-					CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-					CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-					CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-							1.0 / hbp->r0_hb);
-
-					//this is the problem here
-					//TODO
-					// hydrogen bond forces
-					bo_ij->Cdbo += CEhb1;   // dbo term
-					//sh_cdbo[threadIdx.x] += CEhb1;
-					//TODO
-
-
-					if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-
-						//PERFORMANCE IMPACT
-						/*
-						   atomic_rvecScaledAdd( atoms[i].f, 
-						   +CEhb2, dcos_theta_di ); //dcos terms
-						   atomic_rvecScaledAdd( atoms[j].f, 
-						   +CEhb2, dcos_theta_dj );
-						   atomic_rvecScaledAdd( atoms[k].f, 
-						   +CEhb2, dcos_theta_dk );
-						//dr terms
-						atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-						atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk );
-						 */
-
-						//PERFORMANCE IMPACT
-						rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms
-						//rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms
-
-						//rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-						rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
-
-						//TODO you forgot here
-						//TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
-						rvec_ScaledAdd( hbond_jk->h_f, 
-								+CEhb2, dcos_theta_dk );
-
-						//rvec_ScaledAdd( nbr_jk->h_f, 
-						//     +CEhb2, dcos_theta_dk );
-
-						//dr terms
-						//rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-						rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
-
-						//atoms_f [j] ++;
-
-						//TODO you forgot 
-						rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
-						//rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk );
-					}
-					else
-					{
-						// for pressure coupling, terms that are not related 
-						// to bond order derivatives are added directly into 
-						// pressure vector/tensor 
-						rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-						rvec_Add( pbond_ij->h_f, force );
-						rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-						//rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-						//rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press );
-
-						rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-
-						ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-						rvec_Scale( force, +CEhb2, dcos_theta_dk );
-
-						//rvec_Add( nbr_jk->h_f, force );
-						rvec_Add( hbond_jk->h_f, force );
-
-						rvec_iMultiply( ext_press, rel_jk, force );
-						//rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-						//rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-
-						//dr terms
-						rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
-						rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-						rvec_Add( hbond_jk->h_f, force );
-						rvec_iMultiply( ext_press, rel_jk, force );
-						//rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-						//rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-
-					}
-
-					//do the reduction for the bond_ij here
-					/*
-					   if (threadIdx.x < 16){
-					   sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
-					   rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
-
-					   sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-					   rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-					   }
-					   if (threadIdx.x < 8){ 
-					//sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
-					//rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
-
-					sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-					//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-					}
-					if (threadIdx.x < 4){
-					//sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
-					//rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
-
-					sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-					//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-					}
-					if (threadIdx.x < 2){
-					//sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
-					//rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
-
-					sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-					//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-					}
-					if (threadIdx.x < 1){
-					//sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-					//rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
-
-					sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-					//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-					}
-					if (threadIdx.x == 0){
-					//bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-					//rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
-
-					E_HB [j] += sh_hb [threadIdx.x];
-					//rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-					}
-					 */
-
-
-				} // i != k if statement
-
-
-				//itr += blockDim.x;
-
-			} //itr for statement
-
-			/*
-			   __syncthreads ();
-
-			   for (int x = 1; x < blockDim.x; x++)
-			   sh_hb [0] += sh_hb [x];	
-
-			   E_HB [j] += sh_hb[0];
-			   if (threadIdx.x < 16) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-			   if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-			   if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-			   if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-			   if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-			   if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x];
-			 */
-
-
-			//pk += blockDim.x;
-
-			}  // pk for statement
-		} // main if statment
-
-		//do the reduction for the bond_ij here
-		/*
-		   if (threadIdx.x < 16){
-		   sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-		//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-		}
-		if (threadIdx.x < 8){ 
-		sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-		//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-		}
-		if (threadIdx.x < 4){
-		sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-		//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-		}
-		if (threadIdx.x < 2){
-		sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-		//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-		}
-		if (threadIdx.x < 1){
-		sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-		//rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
-		}
-		if (threadIdx.x == 0){
-		E_HB [j] += sh_hb [threadIdx.x];
-		//rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-		}
-		 */
-
-		E_HB [j]  += sh_hb [threadIdx.x];
-		rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-
-		//rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
-	}
-
-
-	DEVICE void warpReduce(volatile real* sdata, int tid) 
-	{
-		if (tid < 16) sdata[tid] += sdata[tid + 16]; 
-		if (tid < 8) sdata[tid] += sdata[tid + 8]; 
-		if (tid < 4) sdata[tid] += sdata[tid + 4]; 
-		if (tid < 2) sdata[tid] += sdata[tid + 2]; 
-		if (tid < 1) sdata[tid] += sdata[tid + 1]; 
-	}
-
-
-
-
-	GLOBAL void Hydrogen_Bonds_HB (	reax_atom *atoms,
-			single_body_parameters *sbp,
-			hbond_parameters *d_hbp,
-			control_params *control,
-			simulation_data *data,
-			static_storage p_workspace, 
-			list p_bonds, list p_hbonds,
-			int N, int num_atom_types, 
-			real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
-	{
-		extern __shared__ real t_hb[];
-		extern __shared__ rvec t__f[];
-		extern __shared__ rvec t_cdbo[];
-		extern __shared__ rvec t_hf [];
-
-		real *sh_hb = t_hb;
-		real *sh_cdbo = t_hb + blockDim.x;
-		rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
-		rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
-
-		int __THREADS_PER_ATOM__ = HBONDS_THREADS_PER_ATOM;
-
-		int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-		int warp_id = thread_id / __THREADS_PER_ATOM__;
-		int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
-		int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
-
-		if (warp_id >= N ) return;
-
-
-		int i, j, k, pi, pk, itr, top;
-		int type_i, type_j, type_k;
-		int start_j, end_j, hb_start_j, hb_end_j;
-		int hblist[MAX_BONDS];
-		int num_hb_intrs = 0;
-		real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
-		real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
-		rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
-		rvec dvec_jk, force, ext_press;
-		ivec rel_jk;
-		// rtensor temp_rtensor, total_rtensor;
-		hbond_parameters *hbp;
-		bond_order_data *bo_ij;
-		bond_data *pbond_ij;
-		far_neighbor_data *nbr_jk;
-		list *bonds, *hbonds;
-		bond_data *bond_list;
-		hbond_data *hbond_list, *hbond_jk;
-		static_storage *workspace = &p_workspace;
-
-		/*
-		   j = blockIdx.x * blockDim.x + threadIdx.x;
-		   if (j >= N) return;
-		 */
-
-		// 	j = blockIdx.x;
-
-		j = warp_id;
-
-		bonds = &p_bonds;
-		bond_list = bonds->select.bond_list;
-
-		hbonds = &p_hbonds;
-		hbond_list = hbonds->select.hbond_list;
-
-		// loops below discover the Hydrogen bonds between i-j-k triplets.
-		// here j is H atom and there has to be some bond between i and j.
-		// Hydrogen bond is between j and k.
-		// so in this function i->X, j->H, k->Z when we map 
-		// variables onto the ones in the handout.
-
-		//for( j = 0; j < system->N; ++j )
-		sh_hb [threadIdx.x] = 0;
-		rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-
-		if( sbp[atoms[j].type].p_hbond==1) {// j must be H
-			//set j's variables 
-			type_j  = atoms[j].type;
-			start_j = Start_Index(j, bonds);
-			end_j   = End_Index(j, bonds);
-			hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-			hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-
-			top = 0;
-			for( pi = start_j; pi < end_j; ++pi ) {
-				pbond_ij = &( bond_list[pi] );
-				i = pbond_ij->nbr;
-				bo_ij = &(pbond_ij->bo_data);
-				type_i = atoms[i].type;
-
-				if( sbp[type_i].p_hbond == 2 && 
-						bo_ij->BO >= HB_THRESHOLD ) {
-					hblist[top++] = pi;
-				}
-			}
-
-			// fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
-			//          j, top, hb_start_j, hb_end_j );
-
-			for( itr=0; itr < top; ++itr ) {
-				pi = hblist[itr];
-				pbond_ij = &( bond_list[pi] );
-				i = pbond_ij->nbr;
-
-				//TODO
-				rvec_MakeZero (sh_hf [threadIdx.x]);
-				sh_cdbo [threadIdx.x] = 0;
-
-
-				//for( pk = hb_start_j; pk < hb_end_j; ++pk )
-				int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 0 : 1);
-				int count = 0;
-				//jpk = hb_start_j + threadIdx.x;
-				pk = hb_start_j + lane_id;
-				//while (pk < hb_end_j)
-				while (count < loopcount)
-				{
-
-					if (pk < hb_end_j)
-					{
-						// set k's varibles 
-						//TODO
-						hbond_jk = &( hbond_list[pk] );
-						//TODO
-						k = hbond_list[pk].nbr;
-						type_k = atoms[k].type;
-						nbr_jk = hbond_list[pk].ptr;
-						r_jk = nbr_jk->d;
-						rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
-					}
-					else k = -1;
-
-					//TODO Double check this Hydrogen Bonds fix
-					//rvec_MakeZero ( nbr_jk->h_f );
-					//rvec_MakeZero ( hbond_jk->h_f );
-					//TODO Double check this Hydrogen Bonds fix
-
-					//sh_hb [threadIdx.x] = 0;
-					//rvec_MakeZero ( sh_atomf[ threadIdx.x] );
-					//__syncthreads ();
-
-
-					if(( i != k ) && (k != -1)) {
-						bo_ij = &(pbond_ij->bo_data);
-						type_i = atoms[i].type;
-						r_ij = pbond_ij->d;	     
-						hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
-						++num_hb_intrs;
-
-						Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-								&theta, &cos_theta );
-						// the derivative of cos(theta)
-						Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
-								&dcos_theta_di, &dcos_theta_dj, 
-								&dcos_theta_dk );
-
-						// hydrogen bond energy
-						sin_theta2 = SIN( theta/2.0 );
-						sin_xhz4 = SQR(sin_theta2);
-						sin_xhz4 *= sin_xhz4;
-						cos_xhz1 = ( 1.0 - cos_theta );
-						exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
-						exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
-									r_jk / hbp->r0_hb - 2.0 ) );
-
-						//PERFORMANCE IMPACT
-						e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
-						//atomicAdd ( &data->E_HB, e_hb );
-						//E_HB [j] += e_hb;
-						sh_hb [threadIdx.x] += e_hb;
-
-						CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
-						CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
-						CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
-								1.0 / hbp->r0_hb);
-
-						//this is the problem here
-						//TODO
-						// hydrogen bond forces
-						//bo_ij->Cdbo += CEhb1;   // dbo term
-						sh_cdbo[threadIdx.x] += CEhb1;
-						//TODO
-						//warpReduce (sh_cdbo, threadIdx.x);
-						//if (threadIdx.x == 0)
-						//	bo_ij->Cdbo += sh_cdbo [0];
-
-
-
-						if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-
-							//PERFORMANCE IMPACT
-							/*
-							   atomic_rvecScaledAdd( atoms[i].f, 
-							   +CEhb2, dcos_theta_di ); //dcos terms
-							   atomic_rvecScaledAdd( atoms[j].f, 
-							   +CEhb2, dcos_theta_dj );
-							   atomic_rvecScaledAdd( atoms[k].f, 
-							   +CEhb2, dcos_theta_dk );
-							//dr terms
-							atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-							atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk );
-							 */
-
-							//PERFORMANCE IMPACT
-							//rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms
-							rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms
-
-							//rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-							rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
-
-
-							//TODO you forgot here
-							//TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
-							rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk );
-
-							//rvec_ScaledAdd( nbr_jk->h_f, 
-							//     +CEhb2, dcos_theta_dk );
-
-							//dr terms
-							//rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-							rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
-
-							//TODO you forgot 
-							rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
-							//rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk );
-						}
-						else
-						{
-							// for pressure coupling, terms that are not related 
-							// to bond order derivatives are added directly into 
-							// pressure vector/tensor 
-							//rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
-							//rvec_Add( pbond_ij->h_f, force );
-							//rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
-							//rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-							//rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press );
-
-							//rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
-
-							//ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
-							//rvec_Scale( force, +CEhb2, dcos_theta_dk );
-
-							//rvec_Add( nbr_jk->h_f, force );
-							//rvec_Add( hbond_jk->h_f, force );
-
-							//rvec_iMultiply( ext_press, rel_jk, force );
-							//rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-							//rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-
-							//dr terms
-							//rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
-
-							//rvec_Scale( force, CEhb3/r_jk, dvec_jk );
-							//rvec_Add( hbond_jk->h_f, force );
-							//rvec_iMultiply( ext_press, rel_jk, force );
-							//rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
-							//rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
-
-						}
-
-					} // i != k if statement
-
-					pk += __THREADS_PER_ATOM__;
-					count ++;
-
-				}  // pk for statement
-
-				//__syncthreads ();
-
-				//at this point done with one bond....
-				//do the reduction now
-				//if (threadIdx.x == 0){
-				if (lane_id < 16) {
-					sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
-					rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
-				}
-				if (lane_id < 8) {
-					sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
-					rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
-				}
-				if (lane_id < 4) {
-					sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
-					rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
-				}
-				if (lane_id < 2) {
-					sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
-					rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
-				}
-				if (lane_id < 1) {
-					sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
-					rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
-
-					bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-					rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
-				}
-				/*
-				   if (lane_id == 0){
-				   for (i = 1; i < 32; i++)
-				   {
-				//sh_cdbo [threadIdx.x] += sh_cdbo [i];
-				//rvec_Add (sh_hf [threadIdx.x], sh_hf [i]);
+    extern __shared__ real t_hb[];
+    extern __shared__ real t_f[];
+    //extern __shared__ rvec t_cdbo[];
+    //extern __shared__ rvec t_hf [];
+
+    real *sh_hb = t_hb;
+    rvec *sh_atomf = (rvec *)(t_hb + blockDim.x);
+    //real *sh_cdbo = t_hb + blockDim.x;
+    //rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
+
+    int i, j, k, pi, pk, itr, top;
+    int type_i, type_j, type_k;
+    int start_j, end_j, hb_start_j, hb_end_j;
+    int hblist[MAX_BONDS];
+    int num_hb_intrs = 0;
+    real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+    real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+    rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+    rvec dvec_jk, force, ext_press;
+    ivec rel_jk;
+    // rtensor temp_rtensor, total_rtensor;
+    hbond_parameters *hbp;
+    bond_order_data *bo_ij;
+    bond_data *pbond_ij;
+    far_neighbor_data *nbr_jk;
+    list *bonds, *hbonds;
+    bond_data *bond_list;
+    hbond_data *hbond_list, *hbond_jk;
+    static_storage *workspace = &p_workspace;
+
+    j = blockIdx.x * blockDim.x + threadIdx.x;
+    if (j >= N) return;
+
+    //j = blockIdx.x;
+
+    bonds = &p_bonds;
+    bond_list = bonds->select.bond_list;
+
+    hbonds = &p_hbonds;
+    hbond_list = hbonds->select.hbond_list;
+
+    // loops below discover the Hydrogen bonds between i-j-k triplets.
+    // here j is H atom and there has to be some bond between i and j.
+    // Hydrogen bond is between j and k.
+    // so in this function i->X, j->H, k->Z when we map 
+    // variables onto the ones in the handout.
+
+    //for( j = 0; j < system->N; ++j )
+    sh_hb [threadIdx.x] = 0;
+    rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+
+    if( sbp[atoms[j].type].p_hbond==1) {// j must be H
+        //set j's variables 
+        type_j  = atoms[j].type;
+        start_j = Start_Index(j, bonds);
+        end_j   = End_Index(j, bonds);
+        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+
+        top = 0;
+        for( pi = start_j; pi < end_j; ++pi ) {
+            pbond_ij = &( bond_list[pi] );
+            i = pbond_ij->nbr;
+            bo_ij = &(pbond_ij->bo_data);
+            type_i = atoms[i].type;
+
+            if( sbp[type_i].p_hbond == 2 && 
+                    bo_ij->BO >= HB_THRESHOLD )
+                hblist[top++] = pi;
+        }
+
+        // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+        //          j, top, hb_start_j, hb_end_j );
+
+        for( pk = hb_start_j; pk < hb_end_j; ++pk )
+            //pk = hb_start_j + threadIdx.x;
+            //while (pk < hb_end_j)
+        {
+            // set k's variables 
+            //TODO
+            hbond_jk = &( hbond_list[pk] );
+            //TODO
+            k = hbond_list[pk].nbr;
+            type_k = atoms[k].type;
+            nbr_jk = hbond_list[pk].ptr;
+            r_jk = nbr_jk->d;
+            rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+
+            //TODO Double check this Hydrogen Bonds fix
+            //rvec_MakeZero ( nbr_jk->h_f );
+            rvec_MakeZero ( hbond_jk->h_f );
+            //TODO Double check this Hydrogen Bonds fix
+
+            //sh_hb [threadIdx.x] = 0;
+
+
+            //itr = threadIdx.x;
+            for( itr=0; itr < top; ++itr ) {
+                //while (itr < top) {
+                pi = hblist[itr];
+                pbond_ij = &( bond_list[pi] );
+                i = pbond_ij->nbr;
+
+                //TODO
+                //rvec_MakeZero (sh_hf [threadIdx.x]);
+                //sh_cdbo [threadIdx.x] = 0;
+
+                //rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+
+
+                if( i != k ) {
+                    bo_ij = &(pbond_ij->bo_data);
+                    type_i = atoms[i].type;
+                    r_ij = pbond_ij->d;         
+                    hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
+                    ++num_hb_intrs;
+
+                    Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &theta, &cos_theta );
+                    // the derivative of cos(theta)
+                    Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                            &dcos_theta_di, &dcos_theta_dj, 
+                            &dcos_theta_dk );
+
+                    // hydrogen bond energy
+                    sin_theta2 = SIN( theta/2.0 );
+                    sin_xhz4 = SQR(sin_theta2);
+                    sin_xhz4 *= sin_xhz4;
+                    cos_xhz1 = ( 1.0 - cos_theta );
+                    exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                    exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                r_jk / hbp->r0_hb - 2.0 ) );
+
+                    //PERFORMANCE IMPACT
+                    e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                    //atomicAdd ( &data->E_HB, e_hb );
+                    //E_HB [j] += e_hb;
+                    sh_hb [threadIdx.x] += e_hb;
+
+                    CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
+                    CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                    CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
+                            1.0 / hbp->r0_hb);
+
+                    //this is the problem here
+                    //TODO
+                    // hydrogen bond forces
+                    bo_ij->Cdbo += CEhb1;   // dbo term
+                    //sh_cdbo[threadIdx.x] += CEhb1;
+                    //TODO
+
+
+                    if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+
+                        //PERFORMANCE IMPACT
+                        /*
+                           atomic_rvecScaledAdd( atoms[i].f, 
+                           +CEhb2, dcos_theta_di ); //dcos terms
+                           atomic_rvecScaledAdd( atoms[j].f, 
+                           +CEhb2, dcos_theta_dj );
+                           atomic_rvecScaledAdd( atoms[k].f, 
+                           +CEhb2, dcos_theta_dk );
+                        //dr terms
+                        atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                        atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk );
+                         */
+
+                        //PERFORMANCE IMPACT
+                        rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms
+                        //rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms
+
+                        //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
+
+                        //TODO you forgot here
+                        //TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
+                        rvec_ScaledAdd( hbond_jk->h_f, 
+                                +CEhb2, dcos_theta_dk );
+
+                        //rvec_ScaledAdd( nbr_jk->h_f, 
+                        //     +CEhb2, dcos_theta_dk );
+
+                        //dr terms
+                        //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                        rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
+
+                        //atoms_f [j] ++;
+
+                        //TODO you forgot 
+                        rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
+                        //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk );
+                    }
+                    else
+                    {
+                        // for pressure coupling, terms that are not related 
+                        // to bond order derivatives are added directly into 
+                        // pressure vector/tensor 
+                        rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                        rvec_Add( pbond_ij->h_f, force );
+                        rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                        //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press );
+
+                        rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
+
+                        ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                        rvec_Scale( force, +CEhb2, dcos_theta_dk );
+
+                        //rvec_Add( nbr_jk->h_f, force );
+                        rvec_Add( hbond_jk->h_f, force );
+
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                        //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
+
+                        //dr terms
+                        rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+
+                        rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                        rvec_Add( hbond_jk->h_f, force );
+                        rvec_iMultiply( ext_press, rel_jk, force );
+                        //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                        //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
+
+                    }
+
+                    //do the reduction for the bond_ij here
+                    /*
+                       if (threadIdx.x < 16){
+                       sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
+                       rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
+
+                       sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
+                       rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
+                       }
+                       if (threadIdx.x < 8){ 
+                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
+                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
+
+                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
+                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
+                    }
+                    if (threadIdx.x < 4){
+                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
+                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
+
+                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
+                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
+                    }
+                    if (threadIdx.x < 2){
+                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
+                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
+
+                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
+                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
+                    }
+                    if (threadIdx.x < 1){
+                    //sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
+                    //rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
+
+                    sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
+                    //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
+                    }
+                    if (threadIdx.x == 0){
+                    //bo_ij->Cdbo += sh_cdbo [threadIdx.x];
+                    //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
+
+                    E_HB [j] += sh_hb [threadIdx.x];
+                    //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+                    }
+                     */
+
+
+                } // i != k if statement
+
+
+                //itr += blockDim.x;
+
+            } //itr for statement
+
+            /*
+               __syncthreads ();
+
+               for (int x = 1; x < blockDim.x; x++)
+               sh_hb [0] += sh_hb [x];    
+
+               E_HB [j] += sh_hb[0];
+               if (threadIdx.x < 16) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
+               if (threadIdx.x < 8) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
+               if (threadIdx.x < 4) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
+               if (threadIdx.x < 2) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
+               if (threadIdx.x < 1) sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
+               if (threadIdx.x == 0) E_HB [j] += sh_hb [threadIdx.x];
+             */
+
+
+            //pk += blockDim.x;
+
+            }  // pk for statement
+        } // main if statment
+
+        //do the reduction for the bond_ij here
+        /*
+           if (threadIdx.x < 16){
+           sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
+        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
+        }
+        if (threadIdx.x < 8){ 
+        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
+        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
+        }
+        if (threadIdx.x < 4){
+        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
+        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
+        }
+        if (threadIdx.x < 2){
+        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
+        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
+        }
+        if (threadIdx.x < 1){
+        sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
+        //rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
+        }
+        if (threadIdx.x == 0){
+        E_HB [j] += sh_hb [threadIdx.x];
+        //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+        }
+         */
+
+        E_HB [j]  += sh_hb [threadIdx.x];
+        rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+
+        //rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
+    }
+
+
+    DEVICE void warpReduce(volatile real* sdata, int tid) // in-place halving sum over sdata; reads indices up to 31, so sdata needs >= 32 valid entries
+    { // NOTE(review): no barrier between steps -- presumably relies on tids 0..31 running in lockstep; confirm for the target GPU
+        if (tid < 16) sdata[tid] += sdata[tid + 16]; // tids 0..15 fold in the entry 16 slots above
+        if (tid < 8) sdata[tid] += sdata[tid + 8]; // tids 0..7 fold in the entry 8 slots above
+        if (tid < 4) sdata[tid] += sdata[tid + 4]; // tids 0..3 fold in the entry 4 slots above
+        if (tid < 2) sdata[tid] += sdata[tid + 2]; // tids 0..1 fold in the entry 2 slots above
+        if (tid < 1) sdata[tid] += sdata[tid + 1]; // tid 0 adds sdata[1]; sdata[0] is the total only if every earlier step had completed
+    } // nothing is returned; the result is left in sdata
+
+
+
+
+    GLOBAL void Hydrogen_Bonds_HB (    reax_atom *atoms,
+            single_body_parameters *sbp,
+            hbond_parameters *d_hbp,
+            control_params *control,
+            simulation_data *data,
+            static_storage p_workspace, 
+            list p_bonds, list p_hbonds,
+            int N, int num_atom_types, 
+            real *E_HB, rvec *aux_ext_press, rvec *atoms_f )
+    {
+        extern __shared__ real t_hb[];
+        extern __shared__ rvec t__f[];
+        extern __shared__ rvec t_cdbo[];
+        extern __shared__ rvec t_hf [];
+
+        real *sh_hb = t_hb;
+        real *sh_cdbo = t_hb + blockDim.x;
+        rvec *sh_atomf = (rvec *)(sh_cdbo + blockDim.x);
+        rvec *sh_hf = (rvec *) (sh_atomf + blockDim.x);
+
+        int __THREADS_PER_ATOM__ = HBONDS_THREADS_PER_ATOM;
+
+        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+        int warp_id = thread_id / __THREADS_PER_ATOM__;
+        int lane_id = thread_id & (__THREADS_PER_ATOM__ -1); 
+        int my_bucket = threadIdx.x / __THREADS_PER_ATOM__;
+
+        if (warp_id >= N ) return;
+
+
+        int i, j, k, pi, pk, itr, top;
+        int type_i, type_j, type_k;
+        int start_j, end_j, hb_start_j, hb_end_j;
+        int hblist[MAX_BONDS];
+        int num_hb_intrs = 0;
+        real r_ij, r_jk, theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2;
+        real e_hb, exp_hb2, exp_hb3, CEhb1, CEhb2, CEhb3;
+        rvec dcos_theta_di, dcos_theta_dj, dcos_theta_dk;
+        rvec dvec_jk, force, ext_press;
+        ivec rel_jk;
+        // rtensor temp_rtensor, total_rtensor;
+        hbond_parameters *hbp;
+        bond_order_data *bo_ij;
+        bond_data *pbond_ij;
+        far_neighbor_data *nbr_jk;
+        list *bonds, *hbonds;
+        bond_data *bond_list;
+        hbond_data *hbond_list, *hbond_jk;
+        static_storage *workspace = &p_workspace;
+
+        /*
+           j = blockIdx.x * blockDim.x + threadIdx.x;
+           if (j >= N) return;
+         */
+
+        //     j = blockIdx.x;
+
+        j = warp_id;
+
+        bonds = &p_bonds;
+        bond_list = bonds->select.bond_list;
+
+        hbonds = &p_hbonds;
+        hbond_list = hbonds->select.hbond_list;
+
+        // loops below discover the Hydrogen bonds between i-j-k triplets.
+        // here j is H atom and there has to be some bond between i and j.
+        // Hydrogen bond is between j and k.
+        // so in this function i->X, j->H, k->Z when we map 
+        // variables onto the ones in the handout.
+
+        //for( j = 0; j < system->N; ++j )
+        sh_hb [threadIdx.x] = 0;
+        rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+
+        if( sbp[atoms[j].type].p_hbond==1) {// j must be H
+            //set j's variables 
+            type_j  = atoms[j].type;
+            start_j = Start_Index(j, bonds);
+            end_j   = End_Index(j, bonds);
+            hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+            hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+
+            top = 0;
+            for( pi = start_j; pi < end_j; ++pi ) {
+                pbond_ij = &( bond_list[pi] );
+                i = pbond_ij->nbr;
+                bo_ij = &(pbond_ij->bo_data);
+                type_i = atoms[i].type;
+
+                if( sbp[type_i].p_hbond == 2 && 
+                        bo_ij->BO >= HB_THRESHOLD ) {
+                    hblist[top++] = pi;
+                }
+            }
+
+            // fprintf( stderr, "j: %d, top: %d, hb_start_j: %d, hb_end_j:%d\n", 
+            //          j, top, hb_start_j, hb_end_j );
+
+            for( itr=0; itr < top; ++itr ) {
+                pi = hblist[itr];
+                pbond_ij = &( bond_list[pi] );
+                i = pbond_ij->nbr;
+
+                //TODO
+                rvec_MakeZero (sh_hf [threadIdx.x]);
+                sh_cdbo [threadIdx.x] = 0;
+
+
+                //for( pk = hb_start_j; pk < hb_end_j; ++pk )
+                int loopcount = (hb_end_j - hb_start_j) / HBONDS_THREADS_PER_ATOM + (((hb_end_j - hb_start_j)%HBONDS_THREADS_PER_ATOM == 0) ? 0 : 1);
+                int count = 0;
+                //jpk = hb_start_j + threadIdx.x;
+                pk = hb_start_j + lane_id;
+                //while (pk < hb_end_j)
+                while (count < loopcount)
+                {
+
+                    if (pk < hb_end_j)
+                    {
+                        // set k's variables 
+                        //TODO
+                        hbond_jk = &( hbond_list[pk] );
+                        //TODO
+                        k = hbond_list[pk].nbr;
+                        type_k = atoms[k].type;
+                        nbr_jk = hbond_list[pk].ptr;
+                        r_jk = nbr_jk->d;
+                        rvec_Scale( dvec_jk, hbond_list[pk].scl, nbr_jk->dvec );
+                    }
+                    else k = -1;
+
+                    //TODO Double check this Hydrogen Bonds fix
+                    //rvec_MakeZero ( nbr_jk->h_f );
+                    //rvec_MakeZero ( hbond_jk->h_f );
+                    //TODO Double check this Hydrogen Bonds fix
+
+                    //sh_hb [threadIdx.x] = 0;
+                    //rvec_MakeZero ( sh_atomf[ threadIdx.x] );
+                    //__syncthreads ();
+
+
+                    if(( i != k ) && (k != -1)) {
+                        bo_ij = &(pbond_ij->bo_data);
+                        type_i = atoms[i].type;
+                        r_ij = pbond_ij->d;         
+                        hbp = &(d_hbp[ index_hbp(type_i, type_j, type_k, num_atom_types) ]);
+                        ++num_hb_intrs;
+
+                        Calculate_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                &theta, &cos_theta );
+                        // the derivative of cos(theta)
+                        Calculate_dCos_Theta( pbond_ij->dvec, pbond_ij->d, dvec_jk, r_jk,
+                                &dcos_theta_di, &dcos_theta_dj, 
+                                &dcos_theta_dk );
+
+                        // hydrogen bond energy
+                        sin_theta2 = SIN( theta/2.0 );
+                        sin_xhz4 = SQR(sin_theta2);
+                        sin_xhz4 *= sin_xhz4;
+                        cos_xhz1 = ( 1.0 - cos_theta );
+                        exp_hb2 = EXP( -hbp->p_hb2 * bo_ij->BO );
+                        exp_hb3 = EXP( -hbp->p_hb3 * ( hbp->r0_hb / r_jk + 
+                                    r_jk / hbp->r0_hb - 2.0 ) );
+
+                        //PERFORMANCE IMPACT
+                        e_hb = hbp->p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
+                        //atomicAdd ( &data->E_HB, e_hb );
+                        //E_HB [j] += e_hb;
+                        sh_hb [threadIdx.x] += e_hb;
+
+                        CEhb1 = hbp->p_hb1*hbp->p_hb2 * exp_hb2*exp_hb3 * sin_xhz4;
+                        CEhb2 = -hbp->p_hb1/2.0*(1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
+                        CEhb3 = -hbp->p_hb3 * e_hb * (-hbp->r0_hb / SQR(r_jk) + 
+                                1.0 / hbp->r0_hb);
+
+                        //this is the problem here
+                        //TODO
+                        // hydrogen bond forces
+                        //bo_ij->Cdbo += CEhb1;   // dbo term
+                        sh_cdbo[threadIdx.x] += CEhb1;
+                        //TODO
+                        //warpReduce (sh_cdbo, threadIdx.x);
+                        //if (threadIdx.x == 0)
+                        //    bo_ij->Cdbo += sh_cdbo [0];
+
+
+
+                        if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
+
+                            //PERFORMANCE IMPACT
+                            /*
+                               atomic_rvecScaledAdd( atoms[i].f, 
+                               +CEhb2, dcos_theta_di ); //dcos terms
+                               atomic_rvecScaledAdd( atoms[j].f, 
+                               +CEhb2, dcos_theta_dj );
+                               atomic_rvecScaledAdd( atoms[k].f, 
+                               +CEhb2, dcos_theta_dk );
+                            //dr terms
+                            atomic_rvecScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            atomic_rvecScaledAdd( atoms[k].f, +CEhb3/r_jk, dvec_jk );
+                             */
+
+                            //PERFORMANCE IMPACT
+                            //rvec_ScaledAdd( pbond_ij->h_f, +CEhb2, dcos_theta_di ); //dcos terms
+                            rvec_ScaledAdd( sh_hf [threadIdx.x], +CEhb2, dcos_theta_di ); //dcos terms
+
+                            //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
+                            rvec_ScaledAdd( sh_atomf [threadIdx.x], +CEhb2, dcos_theta_dj );
+
+
+                            //TODO you forgot here
+                            //TODO Hydrogen bonds fix. -- BE VERY CAREFUL *****
+                            rvec_ScaledAdd( hbond_jk->h_f, +CEhb2, dcos_theta_dk );
+
+                            //rvec_ScaledAdd( nbr_jk->h_f, 
+                            //     +CEhb2, dcos_theta_dk );
+
+                            //dr terms
+                            //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+                            rvec_ScaledAdd( sh_atomf [threadIdx.x], -CEhb3/r_jk, dvec_jk );
+
+                            //TODO you forgot 
+                            rvec_ScaledAdd( hbond_jk->h_f, +CEhb3/r_jk, dvec_jk );
+                            //rvec_ScaledAdd( nbr_jk->h_f, +CEhb3/r_jk, dvec_jk );
+                        }
+                        else
+                        {
+                            // for pressure coupling, terms that are not related 
+                            // to bond order derivatives are added directly into 
+                            // pressure vector/tensor 
+                            //rvec_Scale( force, +CEhb2, dcos_theta_di ); // dcos terms
+                            //rvec_Add( pbond_ij->h_f, force );
+                            //rvec_iMultiply( ext_press, pbond_ij->rel_box, force );
+                            //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                            //rvec_ScaledAdd (sh_press [threadIdx.x], 1.0, ext_press );
+
+                            //rvec_ScaledAdd( atoms[j].f, +CEhb2, dcos_theta_dj );
+
+                            //ivec_Scale( rel_jk, hbond_list[pk].scl, nbr_jk->rel_box );
+                            //rvec_Scale( force, +CEhb2, dcos_theta_dk );
+
+                            //rvec_Add( nbr_jk->h_f, force );
+                            //rvec_Add( hbond_jk->h_f, force );
+
+                            //rvec_iMultiply( ext_press, rel_jk, force );
+                            //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                            //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
+
+                            //dr terms
+                            //rvec_ScaledAdd( atoms[j].f, -CEhb3/r_jk, dvec_jk );
+
+                            //rvec_Scale( force, CEhb3/r_jk, dvec_jk );
+                            //rvec_Add( hbond_jk->h_f, force );
+                            //rvec_iMultiply( ext_press, rel_jk, force );
+                            //rvec_ScaledAdd( aux_ext_press [j], 1.0, ext_press );
+                            //rvec_ScaledAdd( sh_press [threadIdx.x], 1.0, ext_press );
+
+                        }
+
+                    } // i != k if statement
+
+                    pk += __THREADS_PER_ATOM__;
+                    count ++;
+
+                }  // pk for statement
+
+                //__syncthreads ();
+
+                //at this point done with one bond....
+                //do the reduction now
+                //if (threadIdx.x == 0){
+                if (lane_id < 16) {
+                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 16];
+                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 16]);
+                }
+                if (lane_id < 8) {
+                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 8];
+                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 8]);
+                }
+                if (lane_id < 4) {
+                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 4];
+                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 4]);
+                }
+                if (lane_id < 2) {
+                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 2];
+                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 2]);
+                }
+                if (lane_id < 1) {
+                    sh_cdbo [threadIdx.x] += sh_cdbo [threadIdx.x + 1];
+                    rvec_Add (sh_hf [threadIdx.x], sh_hf [threadIdx.x + 1]);
+
+                    bo_ij->Cdbo += sh_cdbo [threadIdx.x];
+                    rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
+                }
+                /*
+                   if (lane_id == 0){
+                   for (i = 1; i < 32; i++)
+                   {
+                //sh_cdbo [threadIdx.x] += sh_cdbo [i];
+                //rvec_Add (sh_hf [threadIdx.x], sh_hf [i]);
 
-				sh_cdbo [lane_id] += sh_cdbo [lane_id + i];
-				rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]);
-				}
+                sh_cdbo [lane_id] += sh_cdbo [lane_id + i];
+                rvec_Add (sh_hf [lane_id], sh_hf [lane_id + i]);
+                }
 
-				//bo_ij->Cdbo += sh_cdbo [threadIdx.x];
-				//rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
+                //bo_ij->Cdbo += sh_cdbo [threadIdx.x];
+                //rvec_Add (pbond_ij->h_f, sh_hf [threadIdx.x]);
 
-				bo_ij->Cdbo += sh_cdbo [lane_id];
-				rvec_Add (pbond_ij->h_f, sh_hf [lane_id]);
-				}
-				 */
+                bo_ij->Cdbo += sh_cdbo [lane_id];
+                rvec_Add (pbond_ij->h_f, sh_hf [lane_id]);
+                }
+                 */
 
-			} //itr for statement
+            } //itr for statement
 
-			//__syncthreads ();
-			} // main if statment
+            //__syncthreads ();
+            } // main if statement
 
-			//__syncthreads ();
+            //__syncthreads ();
 
 
-			//do the reduction for the bond_ij here
-			if (lane_id < 16){
-				sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
-				rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
-			}
-			if (lane_id < 8){ 
-				sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
-				rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
-			}
-			if (lane_id < 4){
-				sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
-				rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
-			}
-			if (lane_id < 2){
-				sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
-				rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
-			}
-			if (lane_id < 1){
-				sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
-				rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
+            //do the reduction for the bond_ij here
+            if (lane_id < 16){
+                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 16];
+                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 16] );
+            }
+            if (lane_id < 8){ 
+                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 8];
+                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 8] );
+            }
+            if (lane_id < 4){
+                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 4];
+                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 4] );
+            }
+            if (lane_id < 2){
+                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 2];
+                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 2] );
+            }
+            if (lane_id < 1){
+                sh_hb [threadIdx.x] += sh_hb [threadIdx.x + 1];
+                rvec_Add ( sh_atomf [threadIdx.x], sh_atomf [threadIdx.x + 1] );
 
-				E_HB [j] += sh_hb [threadIdx.x];
-				rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-			}
-			/*
-			   if (lane == 0){
-			//E_HB [j] += sh_hb [threadIdx.x];
-			rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-			rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
-			}
-			 */
-			//if (threadIdx.x == 0){
-			/*
-			   if (lane_id == 0){
-			   for (i = 1; i < 32; i++)
-			   {
-			//sh_hb [threadIdx.x] += sh_hb [i];
-			//rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]);
-			sh_hb [lane_id] += sh_hb [lane_id + i];
-			rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]);
-			}
+                E_HB [j] += sh_hb [threadIdx.x];
+                rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+            }
+            /*
+               if (lane == 0){
+            //E_HB [j] += sh_hb [threadIdx.x];
+            rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+            rvec_Copy (atoms_f [j], sh_atomf [threadIdx.x]);
+            }
+             */
+            //if (threadIdx.x == 0){
+            /*
+               if (lane_id == 0){
+               for (i = 1; i < 32; i++)
+               {
+            //sh_hb [threadIdx.x] += sh_hb [i];
+            //rvec_Add (sh_atomf [threadIdx.x], sh_atomf [i]);
+            sh_hb [lane_id] += sh_hb [lane_id + i];
+            rvec_Add (sh_atomf [lane_id], sh_atomf [lane_id + i]);
+            }
 
-			//E_HB [j] += sh_hb [threadIdx.x];
-			//rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+            //E_HB [j] += sh_hb [threadIdx.x];
+            //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
 
-			E_HB [j] += sh_hb [lane_id];
-			rvec_Add (atoms[j].f, sh_atomf [lane_id]);
-			//rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]);
-			}
-			 */
+            E_HB [j] += sh_hb [lane_id];
+            rvec_Add (atoms[j].f, sh_atomf [lane_id]);
+            //rvec_Copy (atoms_f[j], sh_atomf [threadIdx.x]);
+            }
+             */
 
-			//E_HB [j]  += sh_hb [threadIdx.x];
-			//rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
-		}
+            //E_HB [j]  += sh_hb [threadIdx.x];
+            //rvec_Add (atoms[j].f, sh_atomf [threadIdx.x]);
+        }
 
 
 
@@ -2309,154 +2309,154 @@ GLOBAL void Hydrogen_Bonds (	reax_atom *atoms,
 
 
 
-		GLOBAL void Hydrogen_Bonds_Postprocess ( 	reax_atom *atoms, 
-				single_body_parameters *sbp,
-				static_storage p_workspace,
-				list p_bonds, list p_hbonds, list p_far_nbrs, int N, 
-				real *e_hb)
-		{
+        GLOBAL void Hydrogen_Bonds_Postprocess (     reax_atom *atoms, 
+                single_body_parameters *sbp,
+                static_storage p_workspace,
+                list p_bonds, list p_hbonds, list p_far_nbrs, int N, 
+                real *e_hb)
+        {
 
-			int i, pj, hj, nbr, k, j;
-			int start, end;
+            int i, pj, hj, nbr, k, j;
+            int start, end;
 
-			bond_data *pbond;
-			bond_data *sym_index_bond;
-			far_neighbor_data *nbr_pj, *sym_index_nbr;
+            bond_data *pbond;
+            bond_data *sym_index_bond;
+            far_neighbor_data *nbr_pj, *sym_index_nbr;
 
-			list *bonds = &p_bonds;
-			list *far_nbrs = &p_far_nbrs;
+            list *bonds = &p_bonds;
+            list *far_nbrs = &p_far_nbrs;
 
-			i = blockIdx.x * blockDim.x + threadIdx.x;
+            i = blockIdx.x * blockDim.x + threadIdx.x;
 
-			if ( i >= N) return;
+            if ( i >= N) return;
 
-			// For processing ij information
-			start = Start_Index(i, bonds);
-			end = End_Index(i, bonds); 
+            // For processing ij information
+            start = Start_Index(i, bonds);
+            end = End_Index(i, bonds); 
 
-			//rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f);
+            //rvec_Scale (atoms[i].f, e_hb[i], atoms[i].f);
 
-			for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
+            for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ){
 
-				pbond = &(bonds->select.bond_list[pj]);
-				sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
+                pbond = &(bonds->select.bond_list[pj]);
+                sym_index_bond = &( bonds->select.bond_list[ pbond->sym_index ] );
 
-				rvec_Add (atoms[i].f, sym_index_bond->h_f );
-			}
+                rvec_Add (atoms[i].f, sym_index_bond->h_f );
+            }
 
-			/*
-			   for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++)
-			   {
-			// check if the neighbor is of h_type
-			nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
+            /*
+               for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++)
+               {
+            // check if the neighbor is of h_type
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
 
-			sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
-			rvec_Add (atoms[i].f, sym_index_nbr->h_f );
-			}
-			 */
+            sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
+            rvec_Add (atoms[i].f, sym_index_nbr->h_f );
+            }
+             */
 
-			//	if (workspace->hbond_index [j] != -1)
-			//	{
-			//		hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-			//		hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+            //    if (workspace->hbond_index [j] != -1)
+            //    {
+            //        hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+            //        hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
 
-			//		for ( hj = hb_start_j; hj < hb_end_j; hj ++ )
-			//		{
-			//			h_bond_data = &( hbonds->select.hbond_list [hj] );
-			//		 	nbr = h_bond_data->nbr;
+            //        for ( hj = hb_start_j; hj < hb_end_j; hj ++ )
+            //        {
+            //            h_bond_data = &( hbonds->select.hbond_list [hj] );
+            //             nbr = h_bond_data->nbr;
 
-			//			if (nbr == i) {
-			//		 			rvec_Add (atoms[i].f, h_bond_data->h_f );
-			//			}
-			//		}
-			//	}
-		}
+            //            if (nbr == i) {
+            //                     rvec_Add (atoms[i].f, h_bond_data->h_f );
+            //            }
+            //        }
+            //    }
+        }
 
-		GLOBAL void Hydrogen_Bonds_Far_Nbrs ( 	reax_atom *atoms, 
-				single_body_parameters *sbp,
-				static_storage p_workspace,
-				list p_bonds, list p_hbonds, list p_far_nbrs, int N )
-		{
+        GLOBAL void Hydrogen_Bonds_Far_Nbrs (     reax_atom *atoms, 
+                single_body_parameters *sbp,
+                static_storage p_workspace,
+                list p_bonds, list p_hbonds, list p_far_nbrs, int N )
+        {
 
-			extern __shared__ rvec __f[];
-			int i, pj,j;
-			int start, end;
+            extern __shared__ rvec __f[];
+            int i, pj,j;
+            int start, end;
 
-			far_neighbor_data *nbr_pj, *sym_index_nbr;
-			list *far_nbrs = &p_far_nbrs;
+            far_neighbor_data *nbr_pj, *sym_index_nbr;
+            list *far_nbrs = &p_far_nbrs;
 
-			i = blockIdx.x;
+            i = blockIdx.x;
 
-			start = Start_Index (i, far_nbrs);
-			end = End_Index (i, far_nbrs);
-			pj = start + threadIdx.x;
+            start = Start_Index (i, far_nbrs);
+            end = End_Index (i, far_nbrs);
+            pj = start + threadIdx.x;
 
-			rvec_MakeZero (__f[threadIdx.x]);
+            rvec_MakeZero (__f[threadIdx.x]);
 
-			while (pj < end)
-			{
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j = nbr_pj->nbr;
+            while (pj < end)
+            {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
 
-				//sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
-				//
-				//rvec_Add (atoms[i].f, sym_index_nbr->h_f );
-				//
-				//rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
+                //sym_index_nbr = & (far_nbrs->select.far_nbr_list[ nbr_pj->sym_index ]);
+                //
+                //rvec_Add (atoms[i].f, sym_index_nbr->h_f );
+                //
+                //rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
 
-				pj += blockDim.x;
-			}
+                pj += blockDim.x;
+            }
 
-			if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-			if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-			if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-			if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-			if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
+            if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
+            if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
+            if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
+            if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
+            if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
 
-			if (threadIdx.x == 0)
-				rvec_Add (atoms[i].f, __f[0]);
-		}
+            if (threadIdx.x == 0)
+                rvec_Add (atoms[i].f, __f[0]);
+        }
 
-		GLOBAL void Hydrogen_Bonds_HNbrs ( 	reax_atom *atoms, 
-				single_body_parameters *sbp,
-				static_storage p_workspace,
-				list p_bonds, list p_hbonds, list p_far_nbrs, int N )
-		{
+        GLOBAL void Hydrogen_Bonds_HNbrs (     reax_atom *atoms, 
+                single_body_parameters *sbp,
+                static_storage p_workspace,
+                list p_bonds, list p_hbonds, list p_far_nbrs, int N )
+        {
 
-			extern __shared__ rvec __f[];
-			int i, pj,j;
-			int start, end;
+            extern __shared__ rvec __f[];
+            int i, pj,j;
+            int start, end;
 
-			hbond_data *nbr_pj, *sym_index_nbr;
-			list *hbonds = &p_hbonds;
-
-			i = blockIdx.x;
+            hbond_data *nbr_pj, *sym_index_nbr;
+            list *hbonds = &p_hbonds;
+
+            i = blockIdx.x;
 
-			start = Start_Index (i, hbonds);
-			end = End_Index (i, hbonds);
-			pj = start + threadIdx.x;
-
-			rvec_MakeZero (__f[threadIdx.x]);
-
-			while (pj < end)
-			{
-				nbr_pj = &( hbonds->select.hbond_list[pj] );
-				j = nbr_pj->nbr;
-
-				sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
-				rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
-
-				pj += blockDim.x;
-			}
-
-			if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
-			if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
-			if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
-			if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
-			if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
-
-			if (threadIdx.x == 0)
-				rvec_Add (atoms[i].f, __f[0]);
-		}
+            start = Start_Index (i, hbonds);
+            end = End_Index (i, hbonds);
+            pj = start + threadIdx.x;
+
+            rvec_MakeZero (__f[threadIdx.x]);
+
+            while (pj < end)
+            {
+                nbr_pj = &( hbonds->select.hbond_list[pj] );
+                j = nbr_pj->nbr;
+
+                sym_index_nbr = & (hbonds->select.hbond_list[ nbr_pj->sym_index ]);
+                rvec_Add (__f[threadIdx.x], sym_index_nbr->h_f );
+
+                pj += blockDim.x;
+            }
+
+            if (threadIdx.x < 16) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 16]);
+            if (threadIdx.x < 8) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 8]);
+            if (threadIdx.x < 4) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 4]);
+            if (threadIdx.x < 2) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 2]);
+            if (threadIdx.x < 1) rvec_Add (__f[threadIdx.x], __f[threadIdx.x + 1]);
+
+            if (threadIdx.x == 0)
+                rvec_Add (atoms[i].f, __f[0]);
+        }
 
diff --git a/PuReMD-GPU/src/traj.cu b/PuReMD-GPU/src/traj.cu
index 5575c61a..97496e7f 100644
--- a/PuReMD-GPU/src/traj.cu
+++ b/PuReMD-GPU/src/traj.cu
@@ -27,418 +27,418 @@
 /************************************************/
 
 int Write_Custom_Header(reax_system *system, control_params *control, 
-		static_storage *workspace, output_controls *out_control)
+        static_storage *workspace, output_controls *out_control)
 {
-	int i, header_len, control_block_len, frame_format_len;
-	// char buffer[2048];
-	char control_block[2048];
-	char frame_format[2048];
-	char atom_format[100], bond_format[100], angle_format[100];
-
-	sprintf( control_block, CONTROL_BLOCK,
-			system->N,
-			control->restart,
-			control->restart_from,
-			control->random_vel,
-			out_control->restart_freq,
-			control->ensemble,
-			control->nsteps,
-			control->dt,
-			control->reposition_atoms,
-			control->restrict_bonds,
-			control->tabulate,
-			control->nbr_cut,
-			control->r_cut,
-			control->bg_cut,
-			control->bo_cut,
-			control->thb_cut,
-			control->hb_cut,
-			control->q_err,
-			control->T_init,
-			control->T_final,
-			control->Tau_T,
-			control->T_mode,
-			control->T_rate,
-			control->T_freq,
-			control->P[0], control->P[1], control->P[2], 
-			control->Tau_P[0], control->Tau_P[1], control->Tau_P[2],
-			control->compressibility,
-			control->press_mode,
-			control->remove_CoM_vel,
-			out_control->write_steps,
-			out_control->traj_compress,
-			out_control->traj_format,
-			out_control->atom_format,
-			out_control->bond_info,
-			out_control->angle_info,
-			out_control->energy_update_freq,
-			control->molec_anal,
-			control->freq_molec_anal );
-
-			control_block_len = strlen( control_block );
-
-
-			sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", 
-					NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
-
-			atom_format[0] = OPT_NOATOM;
-			switch( out_control->atom_format )
-			{
-				case OPT_ATOM_BASIC: sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
-						     break;
-				case OPT_ATOM_wF: sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
-						  break;
-				case OPT_ATOM_wV: sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
-						  break;
-				case OPT_ATOM_FULL: sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
-						    break;
-			}
-			strcat( frame_format, atom_format );
-
-			bond_format[0] = OPT_NOBOND;
-			if( out_control->bond_info == OPT_BOND_BASIC )
-				sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
-			else if( out_control->bond_info == OPT_BOND_FULL )
-				sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
-			strcat( frame_format, bond_format );
-
-			angle_format[0] = OPT_NOANGLE;
-			if( out_control->angle_info == OPT_ANGLE_BASIC )
-				sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
-			strcat( frame_format, angle_format );
-
-			frame_format_len = strlen( frame_format );
-
-
-			header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2)+ 
-				(frame_format_len + SIZE_INFO_LEN2) + 
-				(ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2);
-
-			out_control->write( out_control->trj, HEADER_INIT, 
-					header_len, HEADER_INIT_LEN, out_control->traj_title );
-
-			out_control->write( out_control->trj, SIZE_INFO_LINE2,
-					control_block_len + (frame_format_len + SIZE_INFO_LEN2) + 
-					(ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
-					control_block_len );
-			out_control->write( out_control->trj, "%s", control_block );
-
-			out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-					frame_format_len + 
-					(ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
-					frame_format_len );
-			out_control->write( out_control->trj, "%s", frame_format );
-
-			out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-					ATOM_MAPPING_LEN * system->N, 
-					ATOM_MAPPING_LEN * system->N );
-
-			for( i = 0; i < system->N; ++i )
-				out_control->write( out_control->trj, ATOM_MAPPING,  
-						workspace->orig_id[i], 
-						system->atoms[i].type, 
-						system->atoms[i].name, 
-						system->reaxprm.sbp[ system->atoms[i].type ].mass ); 
-
-			fflush( out_control->trj );
-
-			return 0;
+    int i, header_len, control_block_len, frame_format_len;
+    // char buffer[2048];
+    char control_block[2048];
+    char frame_format[2048];
+    char atom_format[100], bond_format[100], angle_format[100];
+
+    sprintf( control_block, CONTROL_BLOCK,
+            system->N,
+            control->restart,
+            control->restart_from,
+            control->random_vel,
+            out_control->restart_freq,
+            control->ensemble,
+            control->nsteps,
+            control->dt,
+            control->reposition_atoms,
+            control->restrict_bonds,
+            control->tabulate,
+            control->nbr_cut,
+            control->r_cut,
+            control->bg_cut,
+            control->bo_cut,
+            control->thb_cut,
+            control->hb_cut,
+            control->q_err,
+            control->T_init,
+            control->T_final,
+            control->Tau_T,
+            control->T_mode,
+            control->T_rate,
+            control->T_freq,
+            control->P[0], control->P[1], control->P[2], 
+            control->Tau_P[0], control->Tau_P[1], control->Tau_P[2],
+            control->compressibility,
+            control->press_mode,
+            control->remove_CoM_vel,
+            out_control->write_steps,
+            out_control->traj_compress,
+            out_control->traj_format,
+            out_control->atom_format,
+            out_control->bond_info,
+            out_control->angle_info,
+            out_control->energy_update_freq,
+            control->molec_anal,
+            control->freq_molec_anal );
+
+            control_block_len = strlen( control_block );
+
+
+            sprintf( frame_format, "Frame Format: %d\n%s\n%s\n", 
+                    NUM_FRAME_GLOBALS, FRAME_GLOBALS_FORMAT, FRAME_GLOBAL_NAMES );
+
+            atom_format[0] = OPT_NOATOM;
+            switch( out_control->atom_format )
+            {
+                case OPT_ATOM_BASIC: sprintf( atom_format, "Atom_Basic: %s", ATOM_BASIC );
+                             break;
+                case OPT_ATOM_wF: sprintf( atom_format, "Atom_wF: %s", ATOM_wF );
+                          break;
+                case OPT_ATOM_wV: sprintf( atom_format, "Atom_wV: %s", ATOM_wV );
+                          break;
+                case OPT_ATOM_FULL: sprintf( atom_format, "Atom_Full: %s", ATOM_FULL );
+                            break;
+            }
+            strcat( frame_format, atom_format );
+
+            bond_format[0] = OPT_NOBOND;
+            if( out_control->bond_info == OPT_BOND_BASIC )
+                sprintf( bond_format, "Bond_Line: %s", BOND_BASIC );
+            else if( out_control->bond_info == OPT_BOND_FULL )
+                sprintf( bond_format, "Bond_Line_Full: %s", BOND_FULL );
+            strcat( frame_format, bond_format );
+
+            angle_format[0] = OPT_NOANGLE;
+            if( out_control->angle_info == OPT_ANGLE_BASIC )
+                sprintf( angle_format, "Angle_Line: %s", ANGLE_BASIC );
+            strcat( frame_format, angle_format );
+
+            frame_format_len = strlen( frame_format );
+
+
+            header_len = HEADER_INIT_LEN + (control_block_len + SIZE_INFO_LEN2)+ 
+                (frame_format_len + SIZE_INFO_LEN2) + 
+                (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2);
+
+            out_control->write( out_control->trj, HEADER_INIT, 
+                    header_len, HEADER_INIT_LEN, out_control->traj_title );
+
+            out_control->write( out_control->trj, SIZE_INFO_LINE2,
+                    control_block_len + (frame_format_len + SIZE_INFO_LEN2) + 
+                    (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
+                    control_block_len );
+            out_control->write( out_control->trj, "%s", control_block );
+
+            out_control->write( out_control->trj, SIZE_INFO_LINE2, 
+                    frame_format_len + 
+                    (ATOM_MAPPING_LEN * system->N + SIZE_INFO_LEN2), 
+                    frame_format_len );
+            out_control->write( out_control->trj, "%s", frame_format );
+
+            out_control->write( out_control->trj, SIZE_INFO_LINE2, 
+                    ATOM_MAPPING_LEN * system->N, 
+                    ATOM_MAPPING_LEN * system->N );
+
+            for( i = 0; i < system->N; ++i )
+                out_control->write( out_control->trj, ATOM_MAPPING,  
+                        workspace->orig_id[i], 
+                        system->atoms[i].type, 
+                        system->atoms[i].name, 
+                        system->reaxprm.sbp[ system->atoms[i].type ].mass ); 
+
+            fflush( out_control->trj );
+
+            return 0;
 }
 
 
 int Append_Custom_Frame( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, j, pi, pk, pk_j;
-	int write_atoms, write_bonds, write_angles;
-	int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len;
-	int frame_globals_len, num_bonds, num_thb_intrs;
-	real P;
-	char buffer[2048];
-	list *bonds = (*lists) + BONDS;
-	list *thb_intrs =  (*lists) + THREE_BODIES;
-	bond_data *bo_ij;
-
-
-	/* IMPORTANT: This whole part will go to init_trj after finalized! */
-	switch( out_control->atom_format )
-	{
-		case OPT_ATOM_BASIC: 
-			atom_line_len = ATOM_BASIC_LEN;
-			write_atoms = 1;
-			break;
-		case OPT_ATOM_wF: 
-			atom_line_len = ATOM_wF_LEN; 
-			write_atoms = 1;
-			break;
-		case OPT_ATOM_wV: 
-			atom_line_len = ATOM_wV_LEN; 
-			write_atoms = 1;
-			break;
-		case OPT_ATOM_FULL: 
-			atom_line_len = ATOM_FULL_LEN; 
-			write_atoms = 1;
-			break;
-		default: 
-			atom_line_len = 0;
-			write_atoms = 0;
-	}
-
-
-	/* bond preparations */
-	bond_line_len = write_bonds = 0;
-	if( out_control->bond_info == OPT_BOND_BASIC )
-	{
-		bond_line_len = BOND_BASIC_LEN;
-		write_bonds = 1;
-	}
-	else if( out_control->bond_info == OPT_BOND_FULL )
-	{
-		bond_line_len = BOND_FULL_LEN;
-		write_bonds = 1;
-	}
+    int i, j, pi, pk, pk_j;
+    int write_atoms, write_bonds, write_angles;
+    int frame_len, atom_line_len, bond_line_len, angle_line_len, rest_of_frame_len;
+    int frame_globals_len, num_bonds, num_thb_intrs;
+    real P;
+    char buffer[2048];
+    list *bonds = (*lists) + BONDS;
+    list *thb_intrs =  (*lists) + THREE_BODIES;
+    bond_data *bo_ij;
+
+
+    /* IMPORTANT: This whole part will move to init_trj once it is finalized! */
+    switch( out_control->atom_format )
+    {
+        case OPT_ATOM_BASIC: 
+            atom_line_len = ATOM_BASIC_LEN;
+            write_atoms = 1;
+            break;
+        case OPT_ATOM_wF: 
+            atom_line_len = ATOM_wF_LEN; 
+            write_atoms = 1;
+            break;
+        case OPT_ATOM_wV: 
+            atom_line_len = ATOM_wV_LEN; 
+            write_atoms = 1;
+            break;
+        case OPT_ATOM_FULL: 
+            atom_line_len = ATOM_FULL_LEN; 
+            write_atoms = 1;
+            break;
+        default: 
+            atom_line_len = 0;
+            write_atoms = 0;
+    }
+
+
+    /* bond preparations */
+    bond_line_len = write_bonds = 0;
+    if( out_control->bond_info == OPT_BOND_BASIC )
+    {
+        bond_line_len = BOND_BASIC_LEN;
+        write_bonds = 1;
+    }
+    else if( out_control->bond_info == OPT_BOND_FULL )
+    {
+        bond_line_len = BOND_FULL_LEN;
+        write_bonds = 1;
+    }
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds);
+    fprintf (stderr, "Append Custom Frame -- write_bonds --> %d \n", write_bonds);
 #endif
 
-	num_bonds = 0;
-	if( write_bonds )
-	{
+    num_bonds = 0;
+    if( write_bonds )
+    {
 
 #ifndef __PRINT_CPU_RESULTS__
-		//fprintf (stderr, "Synching bonds from device for printing ....\n");
-		Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+        //fprintf (stderr, "Synching bonds from device for printing ....\n");
+        Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
 #endif
 
-		for( i = 0; i < system->N; ++i )
-			for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-				if( i < bonds->select.bond_list[j].nbr && 
-						bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
-					++num_bonds;
-	}
-
-
-	/* angle preparations */
-	if( out_control->angle_info == OPT_ANGLE_BASIC )
-	{
-		angle_line_len = ANGLE_BASIC_LEN;
-		write_angles = 1;
-	}
-	else 
-	{
-		angle_line_len = 0;
-		write_angles = 0;
-	}
+        for( i = 0; i < system->N; ++i )
+            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+                if( i < bonds->select.bond_list[j].nbr && 
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut )
+                    ++num_bonds;
+    }
+
+
+    /* angle preparations */
+    if( out_control->angle_info == OPT_ANGLE_BASIC )
+    {
+        angle_line_len = ANGLE_BASIC_LEN;
+        write_angles = 1;
+    }
+    else 
+    {
+        angle_line_len = 0;
+        write_angles = 0;
+    }
 
 #ifdef __DEBUG_CUDA__
-	fprintf (stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles );
+    fprintf (stderr, "Append Custom Frame -- write-angles --> %d \n", write_angles );
 #endif
 
-	num_thb_intrs = 0;
-	if( write_angles ) {
+    num_thb_intrs = 0;
+    if( write_angles ) {
 
 #ifndef __PRINT_CPU_RESULTS__
-		//fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
-		Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
-		if ( !write_bonds) {
-			//fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n");
-			Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
-		}
+        //fprintf (stderr, "Synching three bodies from deivce for printing ... \n");
+        Sync_Host_Device (thb_intrs, dev_lists + THREE_BODIES, TYP_THREE_BODY );
+        if ( !write_bonds) {
+            //fprintf (stderr, "Synching bonds for three bodies from device for printing ... \n");
+            Sync_Host_Device (bonds, (dev_lists + BONDS), TYP_BOND );
+        }
 #endif 
 
-		for( j = 0; j < system->N; ++j )
-			for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
-				if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
-					// physical j&i bond
-					for( pk = Start_Index( pi, thb_intrs ); 
-							pk < End_Index( pi, thb_intrs ); ++pk )
-						if( bonds->select.bond_list[pi].nbr < 
-								thb_intrs->select.three_body_list[pk].thb ) {
-							// get k's pointer on j's bond list
-							pk_j = thb_intrs->select.three_body_list[pk].pthb;
-
-							if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
-								// physical j&k bond
-								++num_thb_intrs;
-						}
-	}
-
-
-
-	/* get correct pressure */
-	if( control->ensemble == NPT || control->ensemble == sNPT )
-		P = data->flex_bar.P_scalar;
-	else  if( control->ensemble == iNPT )
-		P = data->iso_bar.P;
-	else P = 0;
-
-
-	/* calculate total frame length*/
-	sprintf( buffer, FRAME_GLOBALS,
-			data->step, data->time, 
-			data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
-			P, system->box.volume,
-			system->box.box_norms[0], 
-			system->box.box_norms[1], 
-			system->box.box_norms[2],
-			90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes!
-			data->E_BE,
-			data->E_Ov,  data->E_Un,  data->E_Lp,
-			data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB,
-			data->E_Tor, data->E_Con, 
-			data->E_vdW, data->E_Ele, data->E_Pol );
-	frame_globals_len = strlen( buffer );
-
-	frame_len = frame_globals_len + 
-		write_atoms  * SIZE_INFO_LEN3 + system->N * atom_line_len +
-		write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
-		write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
-
-
-	/* write size info & frame globals */
-	out_control->write( out_control->trj, SIZE_INFO_LINE2, 
-			frame_len, frame_globals_len );
-	out_control->write( out_control->trj, "%s", buffer );
-
-
-	/* write size info & atom lines */  
-	if( write_atoms ) 
-	{
-		rest_of_frame_len = system->N * atom_line_len +
-			write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
-			write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
-
-		out_control->write( out_control->trj, SIZE_INFO_LINE3, 
-				rest_of_frame_len, system->N * atom_line_len, 
-				system->N );
-	}
-
-	switch( out_control->atom_format )
-	{
-		case 4: 
-			for( i = 0; i < system->N; ++i )
-				out_control->write( out_control->trj, ATOM_BASIC, 
-						workspace->orig_id[i], 
-						system->atoms[i].x[0], 
-						system->atoms[i].x[1], 
-						system->atoms[i].x[2],
-						system->atoms[i].q );
-			break;
-		case 5:
-			for( i = 0; i < system->N; ++i )
-				out_control->write( out_control->trj, ATOM_wF, 
-						workspace->orig_id[i],
-						system->atoms[i].x[0], 
-						system->atoms[i].x[1], 
-						system->atoms[i].x[2],
-						system->atoms[i].f[0], 
-						system->atoms[i].f[1], 
-						system->atoms[i].f[2],
-						system->atoms[i].q );
-			break;
-		case 6: 
-			for( i = 0; i < system->N; ++i )
-				out_control->write( out_control->trj, ATOM_wV, 
-						workspace->orig_id[i], 
-						system->atoms[i].x[0], 
-						system->atoms[i].x[1], 
-						system->atoms[i].x[2],
-						system->atoms[i].v[0], 
-						system->atoms[i].v[1], 
-						system->atoms[i].v[2],
-						system->atoms[i].q );
-			break;
-		case 7: 
-			for( i = 0; i < system->N; ++i )
-				out_control->write( out_control->trj, ATOM_FULL, 
-						workspace->orig_id[i], 
-						system->atoms[i].x[0], 
-						system->atoms[i].x[1], 
-						system->atoms[i].x[2],
-						system->atoms[i].v[0], 
-						system->atoms[i].v[1], 
-						system->atoms[i].v[2],
-						system->atoms[i].f[0], 
-						system->atoms[i].f[1], 
-						system->atoms[i].f[2],
-						system->atoms[i].q );
-			break;
-	}
-	fflush( out_control->trj );
-
-
-	/* write size info & bond lines */
-	if( write_bonds )
-	{
-		rest_of_frame_len = num_bonds * bond_line_len +
-			write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
-
-		out_control->write( out_control->trj, SIZE_INFO_LINE3, 
-				rest_of_frame_len, num_bonds * bond_line_len, 
-				num_bonds );
-	}
-
-	if( out_control->bond_info == 1 ) {
-		for( i = 0; i < system->N; ++i )
-			for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-				if( i < bonds->select.bond_list[j].nbr && 
-						bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
-					bo_ij = &( bonds->select.bond_list[j] );
-					out_control->write( out_control->trj, BOND_BASIC, 
-							workspace->orig_id[i], 
-							workspace->orig_id[bo_ij->nbr], 
-							bo_ij->d, bo_ij->bo_data.BO );
-				}
-	}
-	else if( out_control->bond_info == 2 ) {
-		for( i = 0; i < system->N; ++i )
-			for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
-				if( i < bonds->select.bond_list[j].nbr && 
-						bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
-					bo_ij = &( bonds->select.bond_list[j] );
-					out_control->write( out_control->trj, BOND_FULL, 
-							workspace->orig_id[i], 
-							workspace->orig_id[bo_ij->nbr], 
-							bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s, 
-							bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 );
-				}
-	}
-
-	fflush( out_control->trj );
-
-
-	/* write size info & angle lines */
-	if( out_control->angle_info ) {
-		out_control->write( out_control->trj, SIZE_INFO_LINE3,
-				num_thb_intrs * angle_line_len, 
-				num_thb_intrs * angle_line_len, num_thb_intrs );
-
-		for( j = 0; j < system->N; ++j )
-			for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
-				if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
-					// physical j&i bond
-					for( pk = Start_Index( pi, thb_intrs ); 
-							pk < End_Index( pi, thb_intrs ); ++pk )
-						if( bonds->select.bond_list[pi].nbr < 
-								thb_intrs->select.three_body_list[pk].thb ) {
-							pk_j = thb_intrs->select.three_body_list[pk].pthb; 
-							// get k's pointer on j's bond list
-
-							if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
-								// physical j&k bond
-								out_control->write( out_control->trj, ANGLE_BASIC,
-										workspace->orig_id[bonds->select.bond_list[pi].nbr], 
-										workspace->orig_id[j], 
-										workspace->orig_id[thb_intrs->select.three_body_list[pk].thb], 
-										RAD2DEG(thb_intrs->select.three_body_list[pk].theta) );
-						}
-	}
-
-	fflush( out_control->trj );
-
-	return 0;
+        for( j = 0; j < system->N; ++j )
+            for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
+                if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
+                    // physical j&i bond
+                    for( pk = Start_Index( pi, thb_intrs ); 
+                            pk < End_Index( pi, thb_intrs ); ++pk )
+                        if( bonds->select.bond_list[pi].nbr < 
+                                thb_intrs->select.three_body_list[pk].thb ) {
+                            // get k's pointer on j's bond list
+                            pk_j = thb_intrs->select.three_body_list[pk].pthb;
+
+                            if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
+                                // physical j&k bond
+                                ++num_thb_intrs;
+                        }
+    }
+
+
+
+    /* get correct pressure */
+    if( control->ensemble == NPT || control->ensemble == sNPT )
+        P = data->flex_bar.P_scalar;
+    else  if( control->ensemble == iNPT )
+        P = data->iso_bar.P;
+    else P = 0;
+
+
+    /* calculate total frame length*/
+    sprintf( buffer, FRAME_GLOBALS,
+            data->step, data->time, 
+            data->E_Tot, data->E_Pot, E_CONV * data->E_Kin, data->therm.T,
+            P, system->box.volume,
+            system->box.box_norms[0], 
+            system->box.box_norms[1], 
+            system->box.box_norms[2],
+            90.0, 90.0, 90.0, // IMPORTANT: need to rewrite for flexible boxes!
+            data->E_BE,
+            data->E_Ov,  data->E_Un,  data->E_Lp,
+            data->E_Ang, data->E_Pen, data->E_Coa, data->E_HB,
+            data->E_Tor, data->E_Con, 
+            data->E_vdW, data->E_Ele, data->E_Pol );
+    frame_globals_len = strlen( buffer );
+
+    frame_len = frame_globals_len + 
+        write_atoms  * SIZE_INFO_LEN3 + system->N * atom_line_len +
+        write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
+        write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+
+
+    /* write size info & frame globals */
+    out_control->write( out_control->trj, SIZE_INFO_LINE2, 
+            frame_len, frame_globals_len );
+    out_control->write( out_control->trj, "%s", buffer );
+
+
+    /* write size info & atom lines */  
+    if( write_atoms ) 
+    {
+        rest_of_frame_len = system->N * atom_line_len +
+            write_bonds  * SIZE_INFO_LEN3 + num_bonds * bond_line_len +
+            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+
+        out_control->write( out_control->trj, SIZE_INFO_LINE3, 
+                rest_of_frame_len, system->N * atom_line_len, 
+                system->N );
+    }
+
+    switch( out_control->atom_format )
+    {
+        case 4: 
+            for( i = 0; i < system->N; ++i )
+                out_control->write( out_control->trj, ATOM_BASIC, 
+                        workspace->orig_id[i], 
+                        system->atoms[i].x[0], 
+                        system->atoms[i].x[1], 
+                        system->atoms[i].x[2],
+                        system->atoms[i].q );
+            break;
+        case 5:
+            for( i = 0; i < system->N; ++i )
+                out_control->write( out_control->trj, ATOM_wF, 
+                        workspace->orig_id[i],
+                        system->atoms[i].x[0], 
+                        system->atoms[i].x[1], 
+                        system->atoms[i].x[2],
+                        system->atoms[i].f[0], 
+                        system->atoms[i].f[1], 
+                        system->atoms[i].f[2],
+                        system->atoms[i].q );
+            break;
+        case 6: 
+            for( i = 0; i < system->N; ++i )
+                out_control->write( out_control->trj, ATOM_wV, 
+                        workspace->orig_id[i], 
+                        system->atoms[i].x[0], 
+                        system->atoms[i].x[1], 
+                        system->atoms[i].x[2],
+                        system->atoms[i].v[0], 
+                        system->atoms[i].v[1], 
+                        system->atoms[i].v[2],
+                        system->atoms[i].q );
+            break;
+        case 7: 
+            for( i = 0; i < system->N; ++i )
+                out_control->write( out_control->trj, ATOM_FULL, 
+                        workspace->orig_id[i], 
+                        system->atoms[i].x[0], 
+                        system->atoms[i].x[1], 
+                        system->atoms[i].x[2],
+                        system->atoms[i].v[0], 
+                        system->atoms[i].v[1], 
+                        system->atoms[i].v[2],
+                        system->atoms[i].f[0], 
+                        system->atoms[i].f[1], 
+                        system->atoms[i].f[2],
+                        system->atoms[i].q );
+            break;
+    }
+    fflush( out_control->trj );
+
+
+    /* write size info & bond lines */
+    if( write_bonds )
+    {
+        rest_of_frame_len = num_bonds * bond_line_len +
+            write_angles * SIZE_INFO_LEN3 + num_thb_intrs * angle_line_len;
+
+        out_control->write( out_control->trj, SIZE_INFO_LINE3, 
+                rest_of_frame_len, num_bonds * bond_line_len, 
+                num_bonds );
+    }
+
+    if( out_control->bond_info == 1 ) {
+        for( i = 0; i < system->N; ++i )
+            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+                if( i < bonds->select.bond_list[j].nbr && 
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
+                    bo_ij = &( bonds->select.bond_list[j] );
+                    out_control->write( out_control->trj, BOND_BASIC, 
+                            workspace->orig_id[i], 
+                            workspace->orig_id[bo_ij->nbr], 
+                            bo_ij->d, bo_ij->bo_data.BO );
+                }
+    }
+    else if( out_control->bond_info == 2 ) {
+        for( i = 0; i < system->N; ++i )
+            for( j = Start_Index( i, bonds ); j < End_Index( i, bonds ); ++j )
+                if( i < bonds->select.bond_list[j].nbr && 
+                        bonds->select.bond_list[j].bo_data.BO >= control->bg_cut ) {
+                    bo_ij = &( bonds->select.bond_list[j] );
+                    out_control->write( out_control->trj, BOND_FULL, 
+                            workspace->orig_id[i], 
+                            workspace->orig_id[bo_ij->nbr], 
+                            bo_ij->d, bo_ij->bo_data.BO, bo_ij->bo_data.BO_s, 
+                            bo_ij->bo_data.BO_pi, bo_ij->bo_data.BO_pi2 );
+                }
+    }
+
+    fflush( out_control->trj );
+
+
+    /* write size info & angle lines */
+    if( out_control->angle_info ) {
+        out_control->write( out_control->trj, SIZE_INFO_LINE3,
+                num_thb_intrs * angle_line_len, 
+                num_thb_intrs * angle_line_len, num_thb_intrs );
+
+        for( j = 0; j < system->N; ++j )
+            for( pi = Start_Index(j, bonds); pi < End_Index(j, bonds); ++pi )
+                if( bonds->select.bond_list[pi].bo_data.BO >= control->bg_cut ) 
+                    // physical j&i bond
+                    for( pk = Start_Index( pi, thb_intrs ); 
+                            pk < End_Index( pi, thb_intrs ); ++pk )
+                        if( bonds->select.bond_list[pi].nbr < 
+                                thb_intrs->select.three_body_list[pk].thb ) {
+                            pk_j = thb_intrs->select.three_body_list[pk].pthb; 
+                            // get k's pointer on j's bond list
+
+                            if( bonds->select.bond_list[pk_j].bo_data.BO >= control->bg_cut ) 
+                                // physical j&k bond
+                                out_control->write( out_control->trj, ANGLE_BASIC,
+                                        workspace->orig_id[bonds->select.bond_list[pi].nbr], 
+                                        workspace->orig_id[j], 
+                                        workspace->orig_id[thb_intrs->select.three_body_list[pk].thb], 
+                                        RAD2DEG(thb_intrs->select.three_body_list[pk].theta) );
+                        }
+    }
+
+    fflush( out_control->trj );
+
+    return 0;
 }
 
 /*
@@ -480,35 +480,35 @@ gzclose( out_control->trj );
 /********************************************************/
 
 int Write_xyz_Header( reax_system *system, control_params *control, 
-		static_storage* workspace, output_controls *out_control )
+        static_storage* workspace, output_controls *out_control )
 {
-	fflush( out_control->trj );
+    fflush( out_control->trj );
 
-	return 1;
+    return 1;
 }
 
 
 int Append_xyz_Frame( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i;
+    int i;
 
-	out_control->write( out_control->trj, "%d\n", system->N );
+    out_control->write( out_control->trj, "%d\n", system->N );
 
-	out_control->write( out_control->trj, "%d\t%8.3f\t%8.3f\t%8.3f\t%8.3f\n",
-			data->step,
-			data->E_Tot, data->E_Pot, 
-			E_CONV*data->E_Kin, data->therm.T );
+    out_control->write( out_control->trj, "%d\t%8.3f\t%8.3f\t%8.3f\t%8.3f\n",
+            data->step,
+            data->E_Tot, data->E_Pot, 
+            E_CONV*data->E_Kin, data->therm.T );
 
-	for( i = 0; i < system->N; ++i )
-		out_control->write( out_control->trj, "%3s %10.5f %10.5f %10.5f\n",
-				system->reaxprm.sbp[ system->atoms[i].type ].name,
-				system->atoms[i].x[0], 
-				system->atoms[i].x[1], 
-				system->atoms[i].x[2] );
+    for( i = 0; i < system->N; ++i )
+        out_control->write( out_control->trj, "%3s %10.5f %10.5f %10.5f\n",
+                system->reaxprm.sbp[ system->atoms[i].type ].name,
+                system->atoms[i].x[0], 
+                system->atoms[i].x[1], 
+                system->atoms[i].x[2] );
 
-	fflush( out_control->trj );
+    fflush( out_control->trj );
 
-	return 1;
+    return 1;
 }
diff --git a/PuReMD-GPU/src/two_body_interactions.cu b/PuReMD-GPU/src/two_body_interactions.cu
index f1f5a18c..f53b0cfb 100644
--- a/PuReMD-GPU/src/two_body_interactions.cu
+++ b/PuReMD-GPU/src/two_body_interactions.cu
@@ -29,126 +29,126 @@
 
 
 void Bond_Energy( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int i, j, pj;
-	int start_i, end_i;
-	int type_i, type_j;
-	real ebond, pow_BOs_be2, exp_be12, CEbo;
-	real gp3, gp4, gp7, gp10, gp37;
-	real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
-	real decobdbo, decobdboua, decobdboub;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_order_data *bo_ij;
-	list *bonds;
-
-	bonds = (*lists) + BONDS;
-	gp3 = system->reaxprm.gp.l[3];
-	gp4 = system->reaxprm.gp.l[4];
-	gp7 = system->reaxprm.gp.l[7];
-	gp10 = system->reaxprm.gp.l[10];
-	gp37 = (int) system->reaxprm.gp.l[37];
-
-	for( i=0; i < system->N; ++i ) {
-		start_i = Start_Index(i, bonds);
-		end_i = End_Index(i, bonds);
-		//fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-		for( pj = start_i; pj < end_i; ++pj )
-			if( i < bonds->select.bond_list[pj].nbr ) {
-				/* set the pointers */
-				j = bonds->select.bond_list[pj].nbr;
-				type_i = system->atoms[i].type;
-				type_j = system->atoms[j].type;
-				sbp_i = &( system->reaxprm.sbp[type_i] );
-				sbp_j = &( system->reaxprm.sbp[type_j] );
-				twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] );
-				bo_ij = &( bonds->select.bond_list[pj].bo_data );
-
-				/* calculate the constants */
-				pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
-				exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-				CEbo = -twbp->De_s * exp_be12 * 
-					( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
-
-				/* calculate the Bond Energy */
-				ebond = 
-					-twbp->De_s * bo_ij->BO_s * exp_be12 
-					-twbp->De_p * bo_ij->BO_pi 
-					-twbp->De_pp * bo_ij->BO_pi2;
-
-				data->E_BE += ebond;
-
-				/* calculate derivatives of Bond Orders */
-				bo_ij->Cdbo += CEbo;
-				bo_ij->Cdbopi -= (CEbo + twbp->De_p);
-				bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    real ebond, pow_BOs_be2, exp_be12, CEbo;
+    real gp3, gp4, gp7, gp10, gp37;
+    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+    real decobdbo, decobdboua, decobdboub;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij;
+    list *bonds;
+
+    bonds = (*lists) + BONDS;
+    gp3 = system->reaxprm.gp.l[3];
+    gp4 = system->reaxprm.gp.l[4];
+    gp7 = system->reaxprm.gp.l[7];
+    gp10 = system->reaxprm.gp.l[10];
+    gp37 = (int) system->reaxprm.gp.l[37];
+
+    for( i=0; i < system->N; ++i ) {
+        start_i = Start_Index(i, bonds);
+        end_i = End_Index(i, bonds);
+        //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
+        for( pj = start_i; pj < end_i; ++pj )
+            if( i < bonds->select.bond_list[pj].nbr ) {
+                /* set the pointers */
+                j = bonds->select.bond_list[pj].nbr;
+                type_i = system->atoms[i].type;
+                type_j = system->atoms[j].type;
+                sbp_i = &( system->reaxprm.sbp[type_i] );
+                sbp_j = &( system->reaxprm.sbp[type_j] );
+                twbp = &( system->reaxprm.tbp[ index_tbp (type_i,type_j,&system->reaxprm) ] );
+                bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+                /* calculate the constants */
+                pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
+                exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+                CEbo = -twbp->De_s * exp_be12 * 
+                    ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+
+                /* calculate the Bond Energy */
+                ebond = 
+                    -twbp->De_s * bo_ij->BO_s * exp_be12 
+                    -twbp->De_p * bo_ij->BO_pi 
+                    -twbp->De_pp * bo_ij->BO_pi2;
+
+                data->E_BE += ebond;
+
+                /* calculate derivatives of Bond Orders */
+                bo_ij->Cdbo += CEbo;
+                bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+                bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
 
 #ifdef TEST_ENERGY
-				fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
-						workspace->orig_id[i], workspace->orig_id[j], 
-						// i+1, j+1, 
-						bo_ij->BO, ebond/*, data->E_BE*/ );
-				/* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
-				   workspace->orig_id[i], workspace->orig_id[j], 
-				   CEbo, -twbp->De_p, -twbp->De_pp );*/
+                fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
+                        workspace->orig_id[i], workspace->orig_id[j], 
+                        // i+1, j+1, 
+                        bo_ij->BO, ebond/*, data->E_BE*/ );
+                /* fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
+                   workspace->orig_id[i], workspace->orig_id[j], 
+                   CEbo, -twbp->De_p, -twbp->De_pp );*/
 #endif
 #ifdef TEST_FORCES
-				Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-				Add_dBOpinpi2( system, lists, i, pj, 
-						-(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-						workspace->f_be, workspace->f_be );
+                Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
+                Add_dBOpinpi2( system, lists, i, pj, 
+                        -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
+                        workspace->f_be, workspace->f_be );
 #endif
 
-				/* Stabilisation terminal triple bond */
-				if( bo_ij->BO >= 1.00 ) {
-					if( gp37 == 2 ||
-							(sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-							(sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
-						// ba = SQR(bo_ij->BO - 2.50);
-						exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
-						//oboa=abo(j1)-boa;
-						//obob=abo(j2)-boa;
-						exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
-						exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
-						//ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-						exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
-						hulpov = 1.0 / (1.0 + 25.0 * exphuov);
-
-						estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-						//estrain(j1) = estrain(j1) + 0.50*estriph;
-						//estrain(j2) = estrain(j2) + 0.50*estriph;
-						data->E_BE += estriph;
-
-						decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
-							( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-						decobdboua = -gp10 * exphu * hulpov * 
-							(gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-						decobdboub = -gp10 * exphu * hulpov * 
-							(gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-
-						bo_ij->Cdbo += decobdbo;
-						workspace->CdDelta[i] += decobdboua;
-						workspace->CdDelta[j] += decobdboub;
-						//loop_j ++;
-						//fprintf (stderr, "incrementing loopj %d \n", loop_j);
+                /* Stabilisation terminal triple bond */
+                if( bo_ij->BO >= 1.00 ) {
+                    if( gp37 == 2 ||
+                            (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
+                            (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                        // ba = SQR(bo_ij->BO - 2.50);
+                        exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                        //oboa=abo(j1)-boa;
+                        //obob=abo(j2)-boa;
+                        exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
+                        exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                        //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
+                        exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                        hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+
+                        estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                        //estrain(j1) = estrain(j1) + 0.50*estriph;
+                        //estrain(j2) = estrain(j2) + 0.50*estriph;
+                        data->E_BE += estriph;
+
+                        decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
+                            ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+                        decobdboua = -gp10 * exphu * hulpov * 
+                            (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                        decobdboub = -gp10 * exphu * hulpov * 
+                            (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+
+                        bo_ij->Cdbo += decobdbo;
+                        workspace->CdDelta[i] += decobdboua;
+                        workspace->CdDelta[j] += decobdboub;
+                        //loop_j ++;
+                        //fprintf (stderr, "incrementing loopj %d \n", loop_j);
 #ifdef TEST_ENERGY
-						fprintf( out_control->ebond, 
-								"%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-								workspace->orig_id[i], workspace->orig_id[j],
-								//i+1, j+1, 
-								estriph, decobdbo, decobdboua, decobdboub );
+                        fprintf( out_control->ebond, 
+                                "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                                workspace->orig_id[i], workspace->orig_id[j],
+                                //i+1, j+1, 
+                                estriph, decobdbo, decobdboua, decobdboub );
 #endif
 #ifdef TEST_FORCES
-						Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-						Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-						Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                        Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                        Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                        Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
 #endif
-					}
-				}
-			}
-	}
+                    }
+                }
+            }
+    }
 }
 
 
@@ -158,361 +158,361 @@ void Bond_Energy( reax_system *system, control_params *control,
 
 
 GLOBAL void Cuda_Bond_Energy ( reax_atom *atoms, global_parameters g_params, 
-		single_body_parameters *sbp, two_body_parameters *tbp, 
-		simulation_data *data,
-		static_storage p_workspace, list p_bonds, 
-		int N, int num_atom_types, real *E_BE)
+        single_body_parameters *sbp, two_body_parameters *tbp, 
+        simulation_data *data,
+        static_storage p_workspace, list p_bonds, 
+        int N, int num_atom_types, real *E_BE)
 {
-	int i, j, pj;
-	int start_i, end_i;
-	int type_i, type_j;
-	real ebond, pow_BOs_be2, exp_be12, CEbo;
-	real gp3, gp4, gp7, gp10, gp37;
-	real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
-	real decobdbo, decobdboua, decobdboub;
-	single_body_parameters *sbp_i, *sbp_j;
-	two_body_parameters *twbp;
-	bond_order_data *bo_ij;
-	list *bonds;
-	static_storage *workspace;
-
-	i = blockIdx.x * blockDim.x + threadIdx.x;
-	if ( i >= N ) return;
-
-	bonds = &p_bonds;
-	workspace = &p_workspace;
-
-	gp3 = g_params.l[3];
-	gp4 = g_params.l[4];
-	gp7 = g_params.l[7];
-	gp10 = g_params.l[10];
-	gp37 = (int) g_params.l[37];
-
-	//for( i=0; i < system->N; ++i )
-	start_i = Start_Index(i, bonds);
-	end_i = End_Index(i, bonds);
-	//fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
-	for( pj = start_i; pj < end_i; ++pj )
-	{
-		//TODO
-		//if( i < bonds->select.bond_list[pj].nbr ) 
-		if( i < bonds->select.bond_list[pj].nbr ) 
-		{
-			//TODO
-			/* set the pointers */
-			j = bonds->select.bond_list[pj].nbr;
-			type_i = atoms[i].type;
-			type_j = atoms[j].type;
-			sbp_i = &( sbp[type_i] );
-			sbp_j = &( sbp[type_j] );
-			twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
-			bo_ij = &( bonds->select.bond_list[pj].bo_data );
-
-			/* calculate the constants */
-			pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
-			exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
-			CEbo = -twbp->De_s * exp_be12 * 
-				( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
-
-			/* calculate the Bond Energy */
-			ebond = 
-				-twbp->De_s * bo_ij->BO_s * exp_be12 
-				-twbp->De_p * bo_ij->BO_pi 
-				-twbp->De_pp * bo_ij->BO_pi2;
-
-			//PERFORMANCE IMAPCT
-			//atomicAdd (&data->E_BE, ebond);
-			//TODO
-			//E_BE [ i ] += ebond/2.0;
-			E_BE [ i ] += ebond;
-			//data->E_BE += ebond;
-
-			/* calculate derivatives of Bond Orders */
-			bo_ij->Cdbo += CEbo;
-			bo_ij->Cdbopi -= (CEbo + twbp->De_p);
-			bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
+    int i, j, pj;
+    int start_i, end_i;
+    int type_i, type_j;
+    real ebond, pow_BOs_be2, exp_be12, CEbo;
+    real gp3, gp4, gp7, gp10, gp37;
+    real exphu, exphua1, exphub1, exphuov, hulpov, estriph;
+    real decobdbo, decobdboua, decobdboub;
+    single_body_parameters *sbp_i, *sbp_j;
+    two_body_parameters *twbp;
+    bond_order_data *bo_ij;
+    list *bonds;
+    static_storage *workspace;
+
+    i = blockIdx.x * blockDim.x + threadIdx.x;
+    if ( i >= N ) return;
+
+    bonds = &p_bonds;
+    workspace = &p_workspace;
+
+    gp3 = g_params.l[3];
+    gp4 = g_params.l[4];
+    gp7 = g_params.l[7];
+    gp10 = g_params.l[10];
+    gp37 = (int) g_params.l[37];
+
+    //for( i=0; i < system->N; ++i )
+    start_i = Start_Index(i, bonds);
+    end_i = End_Index(i, bonds);
+    //fprintf( stderr, "i=%d start=%d end=%d\n", i, start_i, end_i );
+    for( pj = start_i; pj < end_i; ++pj )
+    {
+        //TODO
+        //if( i < bonds->select.bond_list[pj].nbr ) 
+        if( i < bonds->select.bond_list[pj].nbr ) 
+        {
+            //TODO
+            /* set the pointers */
+            j = bonds->select.bond_list[pj].nbr;
+            type_i = atoms[i].type;
+            type_j = atoms[j].type;
+            sbp_i = &( sbp[type_i] );
+            sbp_j = &( sbp[type_j] );
+            twbp = &( tbp[ index_tbp (type_i,type_j,num_atom_types) ] );
+            bo_ij = &( bonds->select.bond_list[pj].bo_data );
+
+            /* calculate the constants */
+            pow_BOs_be2 = POW( bo_ij->BO_s, twbp->p_be2 );
+            exp_be12 = EXP( twbp->p_be1 * ( 1.0 - pow_BOs_be2 ) );
+            CEbo = -twbp->De_s * exp_be12 * 
+                ( 1.0 - twbp->p_be1 * twbp->p_be2 * pow_BOs_be2 );
+
+            /* calculate the Bond Energy */
+            ebond = 
+                -twbp->De_s * bo_ij->BO_s * exp_be12 
+                -twbp->De_p * bo_ij->BO_pi 
+                -twbp->De_pp * bo_ij->BO_pi2;
+
+            //PERFORMANCE IMPACT
+            //atomicAdd (&data->E_BE, ebond);
+            //TODO
+            //E_BE [ i ] += ebond/2.0;
+            E_BE [ i ] += ebond;
+            //data->E_BE += ebond;
+
+            /* calculate derivatives of Bond Orders */
+            bo_ij->Cdbo += CEbo;
+            bo_ij->Cdbopi -= (CEbo + twbp->De_p);
+            bo_ij->Cdbopi2 -= (CEbo + twbp->De_pp);
 
 #ifdef TEST_ENERGY
-			//TODO
-			//fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
-			//	 workspace->orig_id[i], workspace->orig_id[j], 
-			// i+1, j+1, 
-			//	 bo_ij->BO, ebond/*, data->E_BE*/ );
-			/*
-			   fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
-			   workspace->orig_id[i], workspace->orig_id[j], 
-			   CEbo, -twbp->De_p, -twbp->De_pp );*/
+            //TODO
+            //fprintf( out_control->ebond, "%6d%6d%24.15e%24.15e\n", 
+            //     workspace->orig_id[i], workspace->orig_id[j], 
+            // i+1, j+1, 
+            //     bo_ij->BO, ebond/*, data->E_BE*/ );
+            /*
+               fprintf( out_control->ebond, "%6d%6d%12.6f%12.6f%12.6f\n", 
+               workspace->orig_id[i], workspace->orig_id[j], 
+               CEbo, -twbp->De_p, -twbp->De_pp );*/
 #endif
 #ifdef TEST_FORCES
-			//TODO
-			/*
-			   Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
-			   Add_dBOpinpi2( system, lists, i, pj, 
-			   -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
-			   workspace->f_be, workspace->f_be );
-			 */
-			//TODO
+            //TODO
+            /*
+               Add_dBO( system, lists, i, pj, CEbo, workspace->f_be );
+               Add_dBOpinpi2( system, lists, i, pj, 
+               -(CEbo + twbp->De_p), -(CEbo + twbp->De_pp), 
+               workspace->f_be, workspace->f_be );
+             */
+            //TODO
 #endif
 
-			/* Stabilisation terminal triple bond */
-			if( bo_ij->BO >= 1.00 ) {
-				if( gp37 == 2 ||
-						(sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
-						(sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
-					// ba = SQR(bo_ij->BO - 2.50);
-					exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
-					//oboa=abo(j1)-boa;
-					//obob=abo(j2)-boa;
-					exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
-					exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
-					//ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
-					exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
-					hulpov = 1.0 / (1.0 + 25.0 * exphuov);
-
-					estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
-					//estrain(j1) = estrain(j1) + 0.50*estriph;
-					//estrain(j2) = estrain(j2) + 0.50*estriph;
-
-					//PERFORMANCE IMPACT
-					//atomicAdd (&data->E_BE, estriph);
-					E_BE [ i] += estriph;
-					//data->E_BE += estriph;
-
-					decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
-						( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
-					decobdboua = -gp10 * exphu * hulpov * 
-						(gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-					decobdboub = -gp10 * exphu * hulpov * 
-						(gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
-
-					bo_ij->Cdbo += decobdbo;
-
-					//PERFORMANCE IMAPCT
-					workspace->CdDelta[i] += decobdboua;
-					//atomicAdd (&workspace->CdDelta[j], decobdboub);
-					//CdDelta [ i * N + i ] += decobdboua;
-					//CdDelta [ i * N + j ] += decobdboua;
-					//workspace->CdDelta [i] += decobdboua;
-					//workspace->CdDelta [j] += decobdboub;
+            /* Stabilisation terminal triple bond */
+            if( bo_ij->BO >= 1.00 ) {
+                if( gp37 == 2 ||
+                        (sbp_i->mass == 12.0000 && sbp_j->mass == 15.9990) || 
+                        (sbp_j->mass == 12.0000 && sbp_i->mass == 15.9990) ) {
+                    // ba = SQR(bo_ij->BO - 2.50);
+                    exphu = EXP( -gp7 * SQR(bo_ij->BO - 2.50) );
+                    //oboa=abo(j1)-boa;
+                    //obob=abo(j2)-boa;
+                    exphua1 = EXP(-gp3*(workspace->total_bond_order[i]-bo_ij->BO));
+                    exphub1 = EXP(-gp3*(workspace->total_bond_order[j]-bo_ij->BO));
+                    //ovoab=abo(j1)-aval(it1)+abo(j2)-aval(it2);
+                    exphuov = EXP(gp4*(workspace->Delta[i] + workspace->Delta[j]));
+                    hulpov = 1.0 / (1.0 + 25.0 * exphuov);
+
+                    estriph = gp10 * exphu * hulpov * (exphua1 + exphub1);
+                    //estrain(j1) = estrain(j1) + 0.50*estriph;
+                    //estrain(j2) = estrain(j2) + 0.50*estriph;
+
+                    //PERFORMANCE IMPACT
+                    //atomicAdd (&data->E_BE, estriph);
+                    E_BE [ i] += estriph;
+                    //data->E_BE += estriph;
+
+                    decobdbo = gp10 * exphu * hulpov * (exphua1 + exphub1) * 
+                        ( gp3 - 2.0 * gp7 * (bo_ij->BO-2.50) );
+                    decobdboua = -gp10 * exphu * hulpov * 
+                        (gp3*exphua1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+                    decobdboub = -gp10 * exphu * hulpov * 
+                        (gp3*exphub1 + 25.0*gp4*exphuov*hulpov*(exphua1+exphub1));
+
+                    bo_ij->Cdbo += decobdbo;
+
+                    //PERFORMANCE IMPACT
+                    workspace->CdDelta[i] += decobdboua;
+                    //atomicAdd (&workspace->CdDelta[j], decobdboub);
+                    //CdDelta [ i * N + i ] += decobdboua;
+                    //CdDelta [ i * N + j ] += decobdboua;
+                    //workspace->CdDelta [i] += decobdboua;
+                    //workspace->CdDelta [j] += decobdboub;
 
 #ifdef TEST_ENERGY
-					/*
-					   fprintf( out_control->ebond, 
-					   "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-					   workspace->orig_id[i], workspace->orig_id[j],
-					//i+1, j+1, 
-					estriph, decobdbo, decobdboua, decobdboub );
-					 */
+                    /*
+                       fprintf( out_control->ebond, 
+                       "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                       workspace->orig_id[i], workspace->orig_id[j],
+                    //i+1, j+1, 
+                    estriph, decobdbo, decobdboua, decobdboub );
+                     */
 #endif
 #ifdef TEST_FORCES
-					/*
-					   Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
-					   Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
-					   Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
-					 */
+                    /*
+                       Add_dBO( system, lists, i, pj, decobdbo, workspace->f_be );
+                       Add_dDelta( system, lists, i, decobdboua, workspace->f_be );
+                       Add_dDelta( system, lists, j, decobdboub, workspace->f_be );
+                     */
 #endif
-				}
-			}
-		}
-	} //TODO commented out the if statement for processing i < j. 
-	// we process all teh bonds and add only half the energy
+                }
+            }
+        }
+    } //TODO commented out the if statement for processing i < j. 
+    // we process all the bonds and add only half the energy
 }
 
 
 void vdW_Coulomb_Energy( reax_system *system, control_params *control, 
-		simulation_data *data, static_storage *workspace, 
-		list **lists, output_controls *out_control )
+        simulation_data *data, static_storage *workspace, 
+        list **lists, output_controls *out_control )
 {
-	int  i, j, pj;
-	int  start_i, end_i;
-	real self_coef;
-	real p_vdW1, p_vdW1i;
-	real powr_vdW1, powgi_vdW1;
-	real tmp, r_ij, fn13, exp1, exp2;
-	real Tap, dTap, dfn13, CEvd, CEclmb;
-	real dr3gamij_1, dr3gamij_3;
-	real e_ele, e_vdW, e_core, de_core;
-	rvec temp, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	list *far_nbrs;
-
-	p_vdW1 = system->reaxprm.gp.l[28];
-	p_vdW1i = 1.0 / p_vdW1;
-	far_nbrs = (*lists) + FAR_NBRS; 
-	e_ele = 0;
-	e_vdW = 0;
-	e_core = 0;
-	de_core = 0;
-
-	for( i = 0; i < system->N; ++i ) {
-		start_i = Start_Index(i, far_nbrs);
-		end_i   = End_Index(i, far_nbrs);
-		// fprintf( stderr, "i: %d, start: %d, end: %d\n",
-		//     i, start_i, end_i );
-
-		for( pj = start_i; pj < end_i; ++pj )
-			if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j = nbr_pj->nbr;
-				r_ij = nbr_pj->d;
-				twbp = &(system->reaxprm.tbp[ index_tbp (system->atoms[i].type, system->atoms[j].type, &system->reaxprm) ]);
-				self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
-
-				/* Calculate Taper and its derivative */
-				// Tap = nbr_pj->Tap;   -- precomputed during compte_H
-				Tap = control->Tap7 * r_ij + control->Tap6;
-				Tap = Tap * r_ij + control->Tap5;
-				Tap = Tap * r_ij + control->Tap4;
-				Tap = Tap * r_ij + control->Tap3;
-				Tap = Tap * r_ij + control->Tap2;
-				Tap = Tap * r_ij + control->Tap1;
-				Tap = Tap * r_ij + control->Tap0;
-
-				dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-				dTap = dTap * r_ij + 5*control->Tap5;
-				dTap = dTap * r_ij + 4*control->Tap4;
-				dTap = dTap * r_ij + 3*control->Tap3;
-				dTap = dTap * r_ij + 2*control->Tap2;
-				dTap += control->Tap1/r_ij;
-
-				/*vdWaals Calculations*/
-				if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
-					// shielding
-					powr_vdW1 = POW(r_ij, p_vdW1);
-					powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-					fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-					exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-					exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-					data->E_vdW += e_vdW = 
-						self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);		
-
-					dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-						POW(r_ij, p_vdW1 - 2.0);
-
-					CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-							Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-							(exp1 - exp2) * dfn13 );
-				}
-				else{ // no shielding
-					exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-					exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-
-					data->E_vdW += e_vdW = 
-						self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);		
-
-					CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-							Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-							(exp1 - exp2) );
-				}
-
-				if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
-					// innner wall
-					e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-					e_vdW += self_coef * Tap * e_core;
-					data->E_vdW += self_coef * Tap * e_core;
-
-					de_core = -(twbp->acore/twbp->rcore) * e_core;
-					CEvd += self_coef * ( dTap * e_core + Tap * de_core );
-				}
-
-				/*Coulomb Calculations*/
-				dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-				dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-
-				tmp = Tap / dr3gamij_3;
-				//tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-				data->E_Ele += e_ele = 
-					self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
-
-
-				CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
-					( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-				/*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
-				  ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
-
-
-				if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-					rvec_ScaledAdd( system->atoms[i].f, 
-							-(CEvd+CEclmb), nbr_pj->dvec );
-					rvec_ScaledAdd( system->atoms[j].f, 
-							+(CEvd+CEclmb), nbr_pj->dvec );
-				}
-				else { // NPT, iNPT or sNPT
-					/* for pressure coupling, terms not related to bond order 
-					   derivatives are added directly into pressure vector/tensor */
-					rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-					rvec_ScaledAdd( system->atoms[i].f, -1., temp );
-					rvec_Add( system->atoms[j].f, temp );
-
-					rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-					rvec_Add( data->ext_press, ext_press );
-
-					/*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", 
-					  i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
-
-					  fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
-
-					  fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",		
-					  data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
-
-					/* This part is intended for a fully-flexible box */	      
-					/* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, 
-					   system->atoms[i].x );
-					   rtensor_Scale( total_rtensor, 
-					   F_C * -(CEvd + CEclmb), temp_rtensor );
-					   rvec_OuterProduct( temp_rtensor, 
-					   nbr_pj->dvec, system->atoms[j].x );
-					   rtensor_ScaledAdd( total_rtensor, 
-					   F_C * +(CEvd + CEclmb), temp_rtensor );
-
-					   if( nbr_pj->imaginary )
-					// This is an external force due to an imaginary nbr
-					rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
-					else
-					// This interaction is completely internal
-					rtensor_Add( data->flex_bar.P, total_rtensor ); */
-				}
+    int  i, j, pj;
+    int  start_i, end_i;
+    real self_coef;
+    real p_vdW1, p_vdW1i;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, r_ij, fn13, exp1, exp2;
+    real Tap, dTap, dfn13, CEvd, CEclmb;
+    real dr3gamij_1, dr3gamij_3;
+    real e_ele, e_vdW, e_core, de_core;
+    rvec temp, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs;
+
+    p_vdW1 = system->reaxprm.gp.l[28];
+    p_vdW1i = 1.0 / p_vdW1;
+    far_nbrs = (*lists) + FAR_NBRS; 
+    e_ele = 0;
+    e_vdW = 0;
+    e_core = 0;
+    de_core = 0;
+
+    for( i = 0; i < system->N; ++i ) {
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        // fprintf( stderr, "i: %d, start: %d, end: %d\n",
+        //     i, start_i, end_i );
+
+        for( pj = start_i; pj < end_i; ++pj )
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
+                r_ij = nbr_pj->d;
+                twbp = &(system->reaxprm.tbp[ index_tbp (system->atoms[i].type, system->atoms[j].type, &system->reaxprm) ]);
+                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+
+                /* Calculate Taper and its derivative */
+                // Tap = nbr_pj->Tap;   -- precomputed during compute_H
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;
+
+                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+                dTap = dTap * r_ij + 5*control->Tap5;
+                dTap = dTap * r_ij + 4*control->Tap4;
+                dTap = dTap * r_ij + 3*control->Tap3;
+                dTap = dTap * r_ij + 2*control->Tap2;
+                dTap += control->Tap1/r_ij;
+
+                /*vdWaals Calculations*/
+                if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3) {
+                    // shielding
+                    powr_vdW1 = POW(r_ij, p_vdW1);
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+                    data->E_vdW += e_vdW = 
+                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+                        POW(r_ij, p_vdW1 - 2.0);
+
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) * dfn13 );
+                }
+                else{ // no shielding
+                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+                    data->E_vdW += e_vdW = 
+                        self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) );
+                }
+
+                if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3) {
+                    // inner wall
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_vdW += self_coef * Tap * e_core;
+                    data->E_vdW += self_coef * Tap * e_core;
+
+                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+                }
+
+                /*Coulomb Calculations*/
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+                tmp = Tap / dr3gamij_3;
+                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compute_H
+                data->E_Ele += e_ele = 
+                    self_coef * C_ele * system->atoms[i].q * system->atoms[j].q * tmp;
+
+
+                CEclmb = self_coef * C_ele * system->atoms[i].q * system->atoms[j].q *
+                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                /*CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
+                  ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;*/
+
+
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    rvec_ScaledAdd( system->atoms[i].f, 
+                            -(CEvd+CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[j].f, 
+                            +(CEvd+CEclmb), nbr_pj->dvec );
+                }
+                else { // NPT, iNPT or sNPT
+                    /* for pressure coupling, terms not related to bond order 
+                       derivatives are added directly into pressure vector/tensor */
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                    rvec_Add( system->atoms[j].f, temp );
+
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_Add( data->ext_press, ext_press );
+
+                    /*fprintf( stderr, "nonbonded(%d,%d): rel_box (%f %f %f)", 
+                      i,j,nbr_pj->rel_box[0],nbr_pj->rel_box[1],nbr_pj->rel_box[2] );
+
+                      fprintf( stderr, "force(%f %f %f)", temp[0], temp[1], temp[2] );
+
+                      fprintf( stderr, "ext_press (%12.6f %12.6f %12.6f)\n",        
+                      data->ext_press[0], data->ext_press[1], data->ext_press[2] );*/
+
+                    /* This part is intended for a fully-flexible box */          
+                    /* rvec_OuterProduct( temp_rtensor, nbr_pj->dvec, 
+                       system->atoms[i].x );
+                       rtensor_Scale( total_rtensor, 
+                       F_C * -(CEvd + CEclmb), temp_rtensor );
+                       rvec_OuterProduct( temp_rtensor, 
+                       nbr_pj->dvec, system->atoms[j].x );
+                       rtensor_ScaledAdd( total_rtensor, 
+                       F_C * +(CEvd + CEclmb), temp_rtensor );
+
+                       if( nbr_pj->imaginary )
+                    // This is an external force due to an imaginary nbr
+                    rtensor_ScaledAdd( data->flex_bar.P, -1.0, total_rtensor );
+                    else
+                    // This interaction is completely internal
+                    rtensor_Add( data->flex_bar.P, total_rtensor ); */
+                }
 
 #ifdef TEST_ENERGY
-				rvec_MakeZero( temp );
-				rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
-				fprintf( out_control->evdw,
-						"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-						//i+1, j+1,
-						MIN( workspace->orig_id[i], workspace->orig_id[j] ), 
-						MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-						r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
-
-				fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
-						MIN( workspace->orig_id[i], workspace->orig_id[j] ),
-						MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
-						r_ij, system->atoms[i].q, system->atoms[j].q, 
-						e_ele/*, data->E_Ele*/ );
+                rvec_MakeZero( temp );
+                rvec_ScaledAdd( temp, +CEvd, nbr_pj->dvec );
+                fprintf( out_control->evdw,
+                        "%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                        //i+1, j+1,
+                        MIN( workspace->orig_id[i], workspace->orig_id[j] ), 
+                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
+                        r_ij, e_vdW, temp[0], temp[1], temp[2]/*, data->E_vdW*/ );
+
+                fprintf( out_control->ecou, "%6d%6d%24.15e%24.15e%24.15e%24.15e\n",
+                        MIN( workspace->orig_id[i], workspace->orig_id[j] ),
+                        MAX( workspace->orig_id[i], workspace->orig_id[j] ), 
+                        r_ij, system->atoms[i].q, system->atoms[j].q, 
+                        e_ele/*, data->E_Ele*/ );
 #endif
 #ifdef TEST_FORCES
-				rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-				rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-				rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-				rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
-			}
-	}
+            }
+    }
 
-	// fclose( fout );
+    // fclose( fout );
 
-	// fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", 
-	// data->ext_press[0], data->ext_press[1], data->ext_press[2] );
+    // fprintf( stderr, "nonbonded: ext_press (%24.15e %24.15e %24.15e)\n", 
+    // data->ext_press[0], data->ext_press[1], data->ext_press[2] );
 }
 
 
 /*
 
-   GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, 	
+   GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
    two_body_parameters *tbp,
    global_parameters g_p,
    control_params *control, 
@@ -583,47 +583,47 @@ dTap += control->Tap1/r_ij;
 
 //vdWaals Calculations
 if(g_p.vdw_type==1 || g_p.vdw_type==3) {
-	// shielding
-	powr_vdW1 = POW(r_ij, p_vdW1);
-	powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+    // shielding
+    powr_vdW1 = POW(r_ij, p_vdW1);
+    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
 
-	fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-	exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-	exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-	e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);		
-	E_vdW [i] += e_vdW / 2.0;
+    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    E_vdW [i] += e_vdW / 2.0;
 
-	dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-		POW(r_ij, p_vdW1 - 2.0);
+    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+        POW(r_ij, p_vdW1 - 2.0);
 
-	CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-			Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-			(exp1 - exp2) * dfn13 );
+    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+            (exp1 - exp2) * dfn13 );
 }
 else{ // no shielding
-	exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-	exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
-	e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);		
-	E_vdW [i] += e_vdW / 2.0;
+    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    E_vdW [i] += e_vdW / 2.0;
 
-	CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-			Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-			(exp1 - exp2) );
+    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+            (exp1 - exp2) );
 }
 
 if(g_p.vdw_type==2 || g_p.vdw_type==3) {
-	// innner wall
-	e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-	e_vdW = self_coef * Tap * e_core;
+    // inner wall
+    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+    e_vdW = self_coef * Tap * e_core;
 
-	//TODO check this
-	E_vdW [i] += e_vdW / 2.0;
-	//TODO check this
+    //TODO check this
+    E_vdW [i] += e_vdW / 2.0;
+    //TODO check this
 
-	de_core = -(twbp->acore/twbp->rcore) * e_core;
-	CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+    de_core = -(twbp->acore/twbp->rcore) * e_core;
+    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
 }
 
 //Coulomb Calculations
@@ -642,27 +642,27 @@ CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
 // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
 
 if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-	if (i >= j)
-		rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
-	else
-		rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
+    if (i >= j)
+        rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
+    else
+        rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
 }
 else { // NPT, iNPT or sNPT
-	// for pressure coupling, terms not related to bond order 
-	//  derivatives are added directly into pressure vector/tensor 
-	rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+    // for pressure coupling, terms not related to bond order 
+    //  derivatives are added directly into pressure vector/tensor 
+    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
 
-	if ( i >= j)
-		rvec_ScaledAdd( atoms[i].f, -1., temp );
-	else
-		rvec_Add( atoms[i].f, temp );
+    if ( i >= j)
+        rvec_ScaledAdd( atoms[i].f, -1., temp );
+    else
+        rvec_Add( atoms[i].f, temp );
 
-	rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
 
-	//rvec_Add( data->ext_press, ext_press );
-	rvec_Copy (aux_ext_press[i], ext_press);
+    //rvec_Add( data->ext_press, ext_press );
+    rvec_Copy (aux_ext_press[i], ext_press);
 
-	//TODO CHECK THIS calculation here, it should be divided by two somehow.
+    //TODO CHECK THIS calculation here, it should be divided by two somehow.
 }
 }
 //}
@@ -673,921 +673,921 @@ else { // NPT, iNPT or sNPT
 
 
 
-GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms, 	
-		two_body_parameters *tbp,
-		global_parameters g_p,
-		control_params *control, 
-		simulation_data *data,  
-		list p_far_nbrs, 
-		real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-		int num_atom_types, int N )
+GLOBAL void Cuda_vdW_Coulomb_Energy( reax_atom *atoms,     
+        two_body_parameters *tbp,
+        global_parameters g_p,
+        control_params *control, 
+        simulation_data *data,  
+        list p_far_nbrs, 
+        real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+        int num_atom_types, int N )
 {
-	extern __shared__ real _vdw[];
-	extern __shared__ real _ele[];
-	extern __shared__ rvec _force [];
-
-	real *sh_vdw;
-	real *sh_ele;
-	rvec *sh_force;
-
-	int  i, j, pj;
-	int  start_i, end_i;
-	real self_coef;
-	real p_vdW1, p_vdW1i;
-	real powr_vdW1, powgi_vdW1;
-	real tmp, r_ij, fn13, exp1, exp2;
-	real Tap, dTap, dfn13, CEvd, CEclmb;
-	real dr3gamij_1, dr3gamij_3;
-	real e_ele, e_vdW, e_core, de_core;
-	rvec temp, ext_press;
-	// rtensor temp_rtensor, total_rtensor;
-	two_body_parameters *twbp;
-	far_neighbor_data *nbr_pj;
-	list *far_nbrs = &p_far_nbrs;
-
-	int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-	int warpid = thread_id / VDW_THREADS_PER_ATOM;
-	int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);
-
-	i = warpid;
-
-	sh_vdw = _vdw;
-	sh_ele = _vdw + blockDim.x;
-	sh_force = (rvec *)( _vdw + 2*blockDim.x);
-
-	sh_vdw[threadIdx.x] = 0.0; 
-	sh_ele[threadIdx.x] = 0.0; 
-	rvec_MakeZero ( sh_force [threadIdx.x] );
-
-	if (i < N)
-	{
-
-		p_vdW1 = g_p.l[28];
-		p_vdW1i = 1.0 / p_vdW1;
-		e_ele = 0;
-		e_vdW = 0;
-		e_core = 0;
-		de_core = 0;
-
-		//for( i = 0; i < system->N; ++i ) {
-		start_i = Start_Index(i, far_nbrs);
-		end_i   = End_Index(i, far_nbrs);
-		// fprintf( stderr, "i: %d, start: %d, end: %d\n",
-		//     i, start_i, end_i );
-
-		pj = start_i + laneid;
-		//for( pj = start_i; pj < end_i; ++pj )
-		while (pj < end_i)
-		{
-			if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j = nbr_pj->nbr;
-				r_ij = nbr_pj->d;
-				twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]);
-				self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
-
-				//CHANGE ORIGINAL
-				//if (i <= j) continue;
-				//CHANGE ORIGINAL
-
-				// Calculate Taper and its derivative 
-				// Tap = nbr_pj->Tap;   -- precomputed during compte_H
-				Tap = control->Tap7 * r_ij + control->Tap6;
-				Tap = Tap * r_ij + control->Tap5;
-				Tap = Tap * r_ij + control->Tap4;
-				Tap = Tap * r_ij + control->Tap3;
-				Tap = Tap * r_ij + control->Tap2;
-				Tap = Tap * r_ij + control->Tap1;
-				Tap = Tap * r_ij + control->Tap0;
-
-				dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-				dTap = dTap * r_ij + 5*control->Tap5;
-				dTap = dTap * r_ij + 4*control->Tap4;
-				dTap = dTap * r_ij + 3*control->Tap3;
-				dTap = dTap * r_ij + 2*control->Tap2;
-				dTap += control->Tap1/r_ij;
-
-				//vdWaals Calculations
-				if(g_p.vdw_type==1 || g_p.vdw_type==3) {
-					// shielding
-					powr_vdW1 = POW(r_ij, p_vdW1);
-					powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-					fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-					exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-					exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-					e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);		
-
-
-					//E_vdW [i] += e_vdW / 2.0;
-					sh_vdw [threadIdx.x] += e_vdW/2.0;
-
-					dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-						POW(r_ij, p_vdW1 - 2.0);
-
-					CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
-							Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-							(exp1 - exp2) * dfn13 );
-				}
-				else{ // no shielding
-					exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-					exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-
-					e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);		
-
-
-					//E_vdW [i] += e_vdW / 2.0;
-					sh_vdw [threadIdx.x] += e_vdW/2.0;
-
-					CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-							Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
-							(exp1 - exp2) );
-				}
-
-				if(g_p.vdw_type==2 || g_p.vdw_type==3) {
-					// innner wall
-					e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-					e_vdW = self_coef * Tap * e_core;
-
-					//TODO check this
-					//E_vdW [i] += e_vdW / 2.0;
-					sh_vdw [threadIdx.x] += e_vdW / 2.0;
-					//TODO check this
-
-					de_core = -(twbp->acore/twbp->rcore) * e_core;
-					CEvd += self_coef * ( dTap * e_core + Tap * de_core );
-				}
-
-				//Coulomb Calculations
-				dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-				dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
-
-				tmp = Tap / dr3gamij_3;
-				//tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
-				e_ele = 
-					self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
-
-				//E_Ele [i] += e_ele / 2.0;
-				sh_ele [threadIdx.x] += e_ele / 2.0;
-
-				CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
-					( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-				//CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
-				// ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
-
-				if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-					if (i >= j){
-						//rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
-						rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec );
-					}
-					else
-					{
-						//rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
-						rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec );
-					}
-				}
-				else { // NPT, iNPT or sNPT
-					// for pressure coupling, terms not related to bond order 
-					//  derivatives are added directly into pressure vector/tensor 
-					rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-
-					if ( i >= j)
-					{
-						//rvec_ScaledAdd( atoms[i].f, -1., temp );
-						rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp );
-					}
-					else
-					{
-						//rvec_Add( atoms[i].f, temp );
-						rvec_Add( sh_force[threadIdx.x], temp );
-					}
-
-					rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-
-					//rvec_Add( data->ext_press, ext_press );
-					rvec_Copy (aux_ext_press[i], ext_press);
-
-					//TODO CHECK THIS calculation here, it should be divided by two somehow.
-				}
-			} // if condition for far neighbors
-
-
-			pj += VDW_THREADS_PER_ATOM;
-
-		} // end of while loop for pj < end_i condition
-	} // if (i < N ) condition
-	//}
-
-	__syncthreads ();
-
-	if (laneid < 16) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-	}
-	__syncthreads ();
-	if (laneid < 8) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-	}
-	__syncthreads ();
-	if (laneid < 4) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-	}
-	__syncthreads ();
-	if (laneid < 2) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-	}
-	__syncthreads ();
-	if (laneid < 1) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-	}
-	__syncthreads ();
-	if (laneid == 0) {
-		E_vdW [i] += sh_vdw[threadIdx.x];
-		E_Ele [i] += sh_ele[threadIdx.x];
-		rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
-	}
+    extern __shared__ real _vdw[];
+    extern __shared__ real _ele[];
+    extern __shared__ rvec _force [];
+
+    real *sh_vdw;
+    real *sh_ele;
+    rvec *sh_force;
+
+    int  i, j, pj;
+    int  start_i, end_i;
+    real self_coef;
+    real p_vdW1, p_vdW1i;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, r_ij, fn13, exp1, exp2;
+    real Tap, dTap, dfn13, CEvd, CEclmb;
+    real dr3gamij_1, dr3gamij_3;
+    real e_ele, e_vdW, e_core, de_core;
+    rvec temp, ext_press;
+    // rtensor temp_rtensor, total_rtensor;
+    two_body_parameters *twbp;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs = &p_far_nbrs;
+
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warpid = thread_id / VDW_THREADS_PER_ATOM;
+    int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);
+
+    i = warpid;
+
+    sh_vdw = _vdw;
+    sh_ele = _vdw + blockDim.x;
+    sh_force = (rvec *)( _vdw + 2*blockDim.x);
+
+    sh_vdw[threadIdx.x] = 0.0; 
+    sh_ele[threadIdx.x] = 0.0; 
+    rvec_MakeZero ( sh_force [threadIdx.x] );
+
+    if (i < N)
+    {
+
+        p_vdW1 = g_p.l[28];
+        p_vdW1i = 1.0 / p_vdW1;
+        e_ele = 0;
+        e_vdW = 0;
+        e_core = 0;
+        de_core = 0;
+
+        //for( i = 0; i < system->N; ++i ) {
+        start_i = Start_Index(i, far_nbrs);
+        end_i   = End_Index(i, far_nbrs);
+        // fprintf( stderr, "i: %d, start: %d, end: %d\n",
+        //     i, start_i, end_i );
+
+        pj = start_i + laneid;
+        //for( pj = start_i; pj < end_i; ++pj )
+        while (pj < end_i)
+        {
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j = nbr_pj->nbr;
+                r_ij = nbr_pj->d;
+                twbp = &(tbp[ index_tbp (atoms[i].type, atoms[j].type, num_atom_types) ]);
+                self_coef = (i == j) ? 0.5 : 1.0; // for supporting small boxes!
+
+                //CHANGE ORIGINAL
+                //if (i <= j) continue;
+                //CHANGE ORIGINAL
+
+                // Calculate Taper and its derivative 
+                // Tap = nbr_pj->Tap;   -- precomputed during compte_H
+                Tap = control->Tap7 * r_ij + control->Tap6;
+                Tap = Tap * r_ij + control->Tap5;
+                Tap = Tap * r_ij + control->Tap4;
+                Tap = Tap * r_ij + control->Tap3;
+                Tap = Tap * r_ij + control->Tap2;
+                Tap = Tap * r_ij + control->Tap1;
+                Tap = Tap * r_ij + control->Tap0;
+
+                dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+                dTap = dTap * r_ij + 5*control->Tap5;
+                dTap = dTap * r_ij + 4*control->Tap4;
+                dTap = dTap * r_ij + 3*control->Tap3;
+                dTap = dTap * r_ij + 2*control->Tap2;
+                dTap += control->Tap1/r_ij;
+
+                //vdWaals Calculations
+                if(g_p.vdw_type==1 || g_p.vdw_type==3) {
+                    // shielding
+                    powr_vdW1 = POW(r_ij, p_vdW1);
+                    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+                    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+                    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+
+
+                    //E_vdW [i] += e_vdW / 2.0;
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+
+                    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+                        POW(r_ij, p_vdW1 - 2.0);
+
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) * dfn13 );
+                }
+                else{ // no shielding
+                    exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+                    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+
+                    e_vdW = self_coef * Tap * twbp->D * (exp1 - 2.0 * exp2);        
+
+
+                    //E_vdW [i] += e_vdW / 2.0;
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+
+                    CEvd = self_coef * ( dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+                            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * 
+                            (exp1 - exp2) );
+                }
+
+                if(g_p.vdw_type==2 || g_p.vdw_type==3) {
+                    // innner wall
+                    e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+                    e_vdW = self_coef * Tap * e_core;
+
+                    //TODO check this
+                    //E_vdW [i] += e_vdW / 2.0;
+                    sh_vdw [threadIdx.x] += e_vdW / 2.0;
+                    //TODO check this
+
+                    de_core = -(twbp->acore/twbp->rcore) * e_core;
+                    CEvd += self_coef * ( dTap * e_core + Tap * de_core );
+                }
+
+                //Coulomb Calculations
+                dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+                dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+
+                tmp = Tap / dr3gamij_3;
+                //tmp = Tap * nbr_pj->inv_dr3gamij_3; -- precomputed during compte_H
+                e_ele = 
+                    self_coef * C_ele * atoms[i].q * atoms[j].q * tmp;
+
+                //E_Ele [i] += e_ele / 2.0;
+                sh_ele [threadIdx.x] += e_ele / 2.0;
+
+                CEclmb = self_coef * C_ele * atoms[i].q * atoms[j].q *
+                    ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+                //CEclmb = self_coef*C_ele*system->atoms[i].q*system->atoms[j].q* 
+                // ( dTap- Tap*r_ij*nbr_pj->inv_dr3gamij_1 )*nbr_pj->inv_dr3gamij_3;
+
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    if (i >= j){
+                        //rvec_ScaledAdd( atoms[i].f, -(CEvd+CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force[threadIdx.x], -(CEvd+CEclmb), nbr_pj->dvec );
+                    }
+                    else
+                    {
+                        //rvec_ScaledAdd( atoms[i].f, +(CEvd+CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force[threadIdx.x], +(CEvd+CEclmb), nbr_pj->dvec );
+                    }
+                }
+                else { // NPT, iNPT or sNPT
+                    // for pressure coupling, terms not related to bond order 
+                    //  derivatives are added directly into pressure vector/tensor 
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+
+                    if ( i >= j)
+                    {
+                        //rvec_ScaledAdd( atoms[i].f, -1., temp );
+                        rvec_ScaledAdd( sh_force[threadIdx.x], -1., temp );
+                    }
+                    else
+                    {
+                        //rvec_Add( atoms[i].f, temp );
+                        rvec_Add( sh_force[threadIdx.x], temp );
+                    }
+
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+
+                    //rvec_Add( data->ext_press, ext_press );
+                    rvec_Copy (aux_ext_press[i], ext_press);
+
+                    //TODO CHECK THIS calculation here, it should be divided by two somehow.
+                }
+            } // if condition for far neighbors
+
+
+            pj += VDW_THREADS_PER_ATOM;
+
+        } // end of while loop for pj < end_i condition
+    } // if (i < N ) condition
+    //}
+
+    __syncthreads ();
+
+    if (laneid < 16) {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
+        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
+    }
+    __syncthreads ();
+    if (laneid < 8) {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
+        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
+    }
+    __syncthreads ();
+    if (laneid < 4) {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
+        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
+    }
+    __syncthreads ();
+    if (laneid < 2) {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
+        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
+    }
+    __syncthreads ();
+    if (laneid < 1) {
+        sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
+        sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
+        rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
+    }
+    __syncthreads ();
+    if (laneid == 0) {
+        E_vdW [i] += sh_vdw[threadIdx.x];
+        E_Ele [i] += sh_ele[threadIdx.x];
+        rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
+    }
 
 
 }
 
 void LR_vdW_Coulomb( reax_system *system, control_params *control, 
-		int i, int j, real r_ij, LR_data *lr )
+        int i, int j, real r_ij, LR_data *lr )
 {
-	real p_vdW1 = system->reaxprm.gp.l[28];
-	real p_vdW1i = 1.0 / p_vdW1;
-	real powr_vdW1, powgi_vdW1;
-	real tmp, fn13, exp1, exp2;
-	real Tap, dTap, dfn13;
-	real dr3gamij_1, dr3gamij_3;
-	real e_core, de_core;
-	two_body_parameters *twbp;
-
-	twbp = &(system->reaxprm.tbp[ index_tbp (i,j,&system->reaxprm) ]);
-	e_core = 0;
-	de_core = 0;
-
-	/* calculate taper and its derivative */
-	Tap = control->Tap7 * r_ij + control->Tap6;
-	Tap = Tap * r_ij + control->Tap5;
-	Tap = Tap * r_ij + control->Tap4;
-	Tap = Tap * r_ij + control->Tap3;
-	Tap = Tap * r_ij + control->Tap2;
-	Tap = Tap * r_ij + control->Tap1;
-	Tap = Tap * r_ij + control->Tap0;
-
-	dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
-	dTap = dTap * r_ij + 5*control->Tap5;
-	dTap = dTap * r_ij + 4*control->Tap4;
-	dTap = dTap * r_ij + 3*control->Tap3;
-	dTap = dTap * r_ij + 2*control->Tap2;
-	dTap += control->Tap1/r_ij;
-
-
-	/* vdWaals calculations */
-	powr_vdW1 = POW(r_ij, p_vdW1);
-	powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
-
-	fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-	exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-	exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-
-	lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);		
-	/* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
+    real p_vdW1 = system->reaxprm.gp.l[28];
+    real p_vdW1i = 1.0 / p_vdW1;
+    real powr_vdW1, powgi_vdW1;
+    real tmp, fn13, exp1, exp2;
+    real Tap, dTap, dfn13;
+    real dr3gamij_1, dr3gamij_3;
+    real e_core, de_core;
+    two_body_parameters *twbp;
+
+    twbp = &(system->reaxprm.tbp[ index_tbp (i,j,&system->reaxprm) ]);
+    e_core = 0;
+    de_core = 0;
+
+    /* calculate taper and its derivative */
+    Tap = control->Tap7 * r_ij + control->Tap6;
+    Tap = Tap * r_ij + control->Tap5;
+    Tap = Tap * r_ij + control->Tap4;
+    Tap = Tap * r_ij + control->Tap3;
+    Tap = Tap * r_ij + control->Tap2;
+    Tap = Tap * r_ij + control->Tap1;
+    Tap = Tap * r_ij + control->Tap0;
+
+    dTap = 7*control->Tap7 * r_ij + 6*control->Tap6;
+    dTap = dTap * r_ij + 5*control->Tap5;
+    dTap = dTap * r_ij + 4*control->Tap4;
+    dTap = dTap * r_ij + 3*control->Tap3;
+    dTap = dTap * r_ij + 2*control->Tap2;
+    dTap += control->Tap1/r_ij;
+
+
+    /* vdWaals calculations */
+    powr_vdW1 = POW(r_ij, p_vdW1);
+    powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+
+    fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+    exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+    exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+
+    lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
+    /* fprintf(stderr,"vdW: Tap:%f, r: %f, f13:%f, D:%f, Energy:%f,\
 Gamma_w:%f, p_vdw: %f, alpha: %f, r_vdw: %f, %lf %lf\n",
 Tap, r_ij, fn13, twbp->D, Tap * twbp->D * (exp1 - 2.0 * exp2), 
 powgi_vdW1, p_vdW1, twbp->alpha, twbp->r_vdW, exp1, exp2); */
 
-	dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
+    dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * POW(r_ij, p_vdW1 - 2.0);
 
-	lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - 
-		Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    lr->CEvd = dTap * twbp->D * (exp1 - 2 * exp2) - 
+        Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
 
-	/*vdWaals Calculations*/
-	if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
-	{ // shielding
-		powr_vdW1 = POW(r_ij, p_vdW1);
-		powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
+    /*vdWaals Calculations*/
+    if(system->reaxprm.gp.vdw_type==1 || system->reaxprm.gp.vdw_type==3)
+    { // shielding
+        powr_vdW1 = POW(r_ij, p_vdW1);
+        powgi_vdW1 = POW( 1.0 / twbp->gamma_w, p_vdW1);
 
-		fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
-		exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
-		exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        fn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i );
+        exp1 = EXP( twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - fn13 / twbp->r_vdW) );
 
-		lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);		
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);        
 
-		dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
-			POW(r_ij, p_vdW1 - 2.0);
+        dfn13 = POW( powr_vdW1 + powgi_vdW1, p_vdW1i - 1.0) * 
+            POW(r_ij, p_vdW1 - 2.0);
 
-		lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-			Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
-	}
-	else{ // no shielding
-		exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
-		exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
+    }
+    else{ // no shielding
+        exp1 = EXP( twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
+        exp2 = EXP( 0.5 * twbp->alpha * (1.0 - r_ij / twbp->r_vdW) );
 
-		lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
+        lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
 
-		lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
-			Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
-	}
+        lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) - 
+            Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2);
+    }
 
-	if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
-	{ // innner wall
-		e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
-		lr->e_vdW += Tap * e_core;
+    if(system->reaxprm.gp.vdw_type==2 || system->reaxprm.gp.vdw_type==3)
+    { // innner wall
+        e_core = twbp->ecore * EXP(twbp->acore * (1.0-(r_ij/twbp->rcore)));
+        lr->e_vdW += Tap * e_core;
 
-		de_core = -(twbp->acore/twbp->rcore) * e_core;
-		lr->CEvd += dTap * e_core + Tap * de_core;
-	}
+        de_core = -(twbp->acore/twbp->rcore) * e_core;
+        lr->CEvd += dTap * e_core + Tap * de_core;
+    }
 
-	/* Coulomb calculations */
-	dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
-	dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
+    /* Coulomb calculations */
+    dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
+    dr3gamij_3 = POW( dr3gamij_1 , 0.33333333333333 );
 
-	tmp = Tap / dr3gamij_3;
-	lr->H = EV_to_KCALpMOL * tmp;
-	lr->e_ele = C_ele * tmp;
-	/* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
+    tmp = Tap / dr3gamij_3;
+    lr->H = EV_to_KCALpMOL * tmp;
+    lr->e_ele = C_ele * tmp;
+    /* fprintf( stderr,"i:%d(%d), j:%d(%d), gamma:%f,\
 Tap:%f, dr3gamij_3:%f, qi: %f, qj: %f\n",
 i, system->atoms[i].type, j, system->atoms[j].type, 
 twbp->gamma, Tap, dr3gamij_3, 
 system->atoms[i].q, system->atoms[j].q ); */
 
-	lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
-	/* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
-	   i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
-	   system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
+    lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
+    /* fprintf( stdout, "%d %d\t%g\t%g  %g\t%g  %g\t%g  %g\n",
+       i+1, j+1, r_ij, e_vdW, CEvd * r_ij,
+       system->atoms[i].q, system->atoms[j].q, e_ele, CEclmb * r_ij ); */
 
-	/* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
-	   i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
+    /* fprintf( stderr,"LR_Lookup:%3d%3d%5.3f-%8.5f,%8.5f%8.5f,%8.5f%8.5f\n",
+       i, j, r_ij, lr->H, lr->e_vdW, lr->CEvd, lr->e_ele, lr->CEclmb ); */
 }
 
 
 void Tabulated_vdW_Coulomb_Energy( reax_system *system, control_params *control,
-		simulation_data *data, 
-		static_storage *workspace, list **lists, 
-		output_controls *out_control )
+        simulation_data *data, 
+        static_storage *workspace, list **lists, 
+        output_controls *out_control )
 {
-	int i, j, pj, r, steps, update_freq, update_energies;
-	int type_i, type_j, tmin, tmax;
-	int start_i, end_i;
-	real r_ij, self_coef, base, dif;
-	real e_vdW, e_ele;
-	real CEvd, CEclmb;
-	rvec temp, ext_press;
-	far_neighbor_data *nbr_pj;
-	list *far_nbrs = (*lists) + FAR_NBRS;
-	LR_lookup_table *t;
-
-	steps = data->step - data->prev_steps;
-	update_freq = out_control->energy_update_freq;
-	update_energies = update_freq > 0 && steps % update_freq == 0;
-
-	for( i = 0; i < system->N; ++i ) {
-		type_i  = system->atoms[i].type;
-		start_i = Start_Index(i,far_nbrs);
-		end_i   = End_Index(i,far_nbrs);
-
-		for( pj = start_i; pj < end_i; ++pj ) 
-			if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j      = nbr_pj->nbr;
-				type_j = system->atoms[j].type;
-				r_ij   = nbr_pj->d;
-				self_coef = (i == j) ? 0.5 : 1.0;
-				tmin  = MIN( type_i, type_j );
-				tmax  = MAX( type_i, type_j );
-				t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); 
-
-				/* Cubic Spline Interpolation */
-				r = (int)(r_ij * t->inv_dx);
-				if( r == 0 )  ++r;
-				base = (real)(r+1) * t->dx;
-				dif = r_ij - base;
-				//fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
-
-				if( update_energies ) {
-					e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-						t->vdW[r].a;
-					e_vdW *= self_coef;
-
-					e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-						t->ele[r].a;
-					e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
-
-					data->E_vdW += e_vdW;
-					data->E_Ele += e_ele;
-				}	
-
-				CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-					t->CEvd[r].a;
-				CEvd *= self_coef;
-				//CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
-
-				CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-					t->CEclmb[r].a;
-				CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
-
-				if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-					rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
-					rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
-				}
-				else { // NPT, iNPT or sNPT
-					/* for pressure coupling, terms not related to bond order 
-					   derivatives are added directly into pressure vector/tensor */
-					rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-					rvec_ScaledAdd( system->atoms[i].f, -1., temp );
-					rvec_Add( system->atoms[j].f, temp );
-					rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-					rvec_Add( data->ext_press, ext_press );
-				}
+    int i, j, pj, r, steps, update_freq, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real e_vdW, e_ele;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    list *far_nbrs = (*lists) + FAR_NBRS;
+    LR_lookup_table *t;
+
+    steps = data->step - data->prev_steps;
+    update_freq = out_control->energy_update_freq;
+    update_energies = update_freq > 0 && steps % update_freq == 0;
+
+    for( i = 0; i < system->N; ++i ) {
+        type_i  = system->atoms[i].type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i   = End_Index(i,far_nbrs);
+
+        for( pj = start_i; pj < end_i; ++pj ) 
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j      = nbr_pj->nbr;
+                type_j = system->atoms[j].type;
+                r_ij   = nbr_pj->d;
+                self_coef = (i == j) ? 0.5 : 1.0;
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( LR[ index_lr (tmin,tmax,system->reaxprm.num_atom_types) ] ); 
+
+                /* Cubic Spline Interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r;
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+                //fprintf(stderr, "r: %f, i: %d, base: %f, dif: %f\n", r, i, base, dif);
+
+                if( update_energies ) {
+                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
+                        t->vdW[r].a;
+                    e_vdW *= self_coef;
+
+                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                        t->ele[r].a;
+                    e_ele *= self_coef * system->atoms[i].q * system->atoms[j].q;
+
+                    data->E_vdW += e_vdW;
+                    data->E_Ele += e_ele;
+                }    
+
+                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
+                    t->CEvd[r].a;
+                CEvd *= self_coef;
+                //CEvd = (3*t->vdW[r].d*dif + 2*t->vdW[r].c)*dif + t->vdW[r].b;
+
+                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
+                    t->CEclmb[r].a;
+                CEclmb *= self_coef * system->atoms[i].q * system->atoms[j].q;
+
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    rvec_ScaledAdd( system->atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[j].f, +(CEvd + CEclmb), nbr_pj->dvec );
+                }
+                else { // NPT, iNPT or sNPT
+                    /* for pressure coupling, terms not related to bond order 
+                       derivatives are added directly into pressure vector/tensor */
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    rvec_ScaledAdd( system->atoms[i].f, -1., temp );
+                    rvec_Add( system->atoms[j].f, temp );
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                    rvec_Add( data->ext_press, ext_press );
+                }
 
 #ifdef TEST_ENERGY
-				fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
-						workspace->orig_id[i], workspace->orig_id[j], 
-						r_ij, e_vdW, data->E_vdW );
-				fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
-						workspace->orig_id[i], workspace->orig_id[j],
-						r_ij, system->atoms[i].q, system->atoms[j].q, 
-						e_ele, data->E_Ele );
+                fprintf(out_control->evdw, "%6d%6d%24.15e%24.15e%24.15e\n",
+                        workspace->orig_id[i], workspace->orig_id[j], 
+                        r_ij, e_vdW, data->E_vdW );
+                fprintf(out_control->ecou,"%6d%6d%24.15e%24.15e%24.15e%24.15e%24.15e\n",
+                        workspace->orig_id[i], workspace->orig_id[j],
+                        r_ij, system->atoms[i].q, system->atoms[j].q, 
+                        e_ele, data->E_Ele );
 #endif
 #ifdef TEST_FORCES
-				rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
-				rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
-				rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
-				rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[i], -CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_vdw[j], +CEvd, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[i], -CEclmb, nbr_pj->dvec );
+                rvec_ScaledAdd( workspace->f_ele[j], +CEclmb, nbr_pj->dvec );
 #endif
-			}
-	}
+            }
+    }
 }
 
-GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( 	reax_atom *atoms, 
-		control_params *control,
-		simulation_data *data, 
-		list p_far_nbrs, 
-		real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-		LR_lookup_table *d_LR,
-		int num_atom_types,
-		int energy_update_freq,
-		int N  )
+GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy(     reax_atom *atoms, 
+        control_params *control,
+        simulation_data *data, 
+        list p_far_nbrs, 
+        real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+        LR_lookup_table *d_LR,
+        int num_atom_types,
+        int energy_update_freq,
+        int N  )
 {
 
-	extern __shared__ real _vdw[];
-	extern __shared__ real _ele[];
-	extern __shared__ rvec _force [];
-
-	real *sh_vdw;
-	real *sh_ele;
-	rvec *sh_force;
-
-	int i, j, pj, r, steps, update_freq, update_energies;
-	int type_i, type_j, tmin, tmax;
-	int start_i, end_i;
-	real r_ij, self_coef, base, dif;
-	real e_vdW, e_ele;
-	real CEvd, CEclmb;
-	rvec temp, ext_press;
-	far_neighbor_data *nbr_pj;
-	LR_lookup_table *t;
-	list *far_nbrs = &p_far_nbrs;
-
-	int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-	int warpid = thread_id / VDW_THREADS_PER_ATOM;
-	int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);
-
-	i = warpid;
-
-	sh_vdw = _vdw;
-	sh_ele = _vdw + blockDim.x;
-	sh_force = (rvec *)( _vdw + 2*blockDim.x);
-
-	sh_vdw[threadIdx.x] = 0.0; 
-	sh_ele[threadIdx.x] = 0.0; 
-	rvec_MakeZero ( sh_force [threadIdx.x] );
-
-	if ( i < N ) 
-	{
-
-		reax_atom local_atom ;
-		local_atom.q =  atoms[i].q;
-		//local_atom.q =  d_far_data.q[i];
-		local_atom.type = atoms[i].type;
-		//local_atom.type = d_far_data.type[i];
-
-		/*
-		   sh_vdw = _vdw;
-		   sh_ele = _vdw + warpid;
-		   sh_force = (rvec *)( _vdw + 2*warpid);
-
-		   sh_vdw[threadIdx.x] = 0.0; 
-		   sh_ele[threadIdx.x] = 0.0; 
-		   rvec_MakeZero ( sh_force [threadIdx.x] );
-		 */
-
-
-		steps = data->step - data->prev_steps;
-		update_freq = energy_update_freq;
-		update_energies = update_freq > 0 && steps % update_freq == 0;
-
-		//for( i = 0; i < system->N; ++i ) {
-		type_i  = local_atom.type;
-		start_i = Start_Index(i,far_nbrs);
-		end_i   = End_Index(i,far_nbrs);
-
-		pj = start_i + laneid;
-
-		//for( pj = start_i; pj < end_i; ++pj ) 
-		while (pj < end_i)
-		{
-			if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
-				//if( d_far_data.d[pj] <= control->r_cut ) 
-			{
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j      = nbr_pj->nbr;
-				//j      = d_far_data.nbrs[pj];
-				type_j = atoms[j].type;
-				//type_j = d_far_data.type[j];
-				r_ij   = nbr_pj->d;
-				//r_ij   = d_far_data.d[pj];
-				self_coef = (i == j) ? 0.5 : 1.0;
-				tmin  = MIN( type_i, type_j );
-				tmax  = MAX( type_i, type_j );
-				t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
-
-				//TODO
-				//CHANGE ORIGINAL
-				//if (i <= j) { pj += blockDim.x; continue; }
-				//CHANGE ORIGINAL
-
-				/* Cubic Spline Interpolation */
-				r = (int)(r_ij * t->inv_dx);
-				if( r == 0 )  ++r;
-				base = (real)(r+1) * t->dx;
-				dif = r_ij - base;
-
-				if(( update_energies )) 
-				{
-					e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-						t->vdW[r].a;
-					e_vdW *= self_coef;
-
-					e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a;
-					e_ele *= self_coef * local_atom.q * atoms[j].q;
-
-
-					//data->E_vdW += e_vdW;
-					//TODO
-					//E_vdW [i] += e_vdW / 2.0;
-					//E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0);
-					sh_vdw [threadIdx.x] += e_vdW/2.0;
-					//E_vdW [i] += e_vdW;
-
-					//TODO
-					//data->E_Ele += e_ele;
-					//E_Ele [i] += e_ele / 2.0;
-					//E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0);
-					sh_ele [threadIdx.x] += e_ele/2.0;
-					//E_Ele [i] += e_ele;
-				}	
-
-				CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-					t->CEvd[r].a;
-				CEvd *= self_coef;
-
-				CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-					t->CEclmb[r].a;
-				CEclmb *= self_coef * local_atom.q * atoms[j].q;
-				//CEclmb *= self_coef * local_atom.q * d_far_data.q[j];
-
-				if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
-					if ( i >= j)
-						//rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
-						rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
-					//rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] );
-					else 
-						//rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec );
-						rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
-					//rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] );
-				}
-				else { // NPT, iNPT or sNPT
-					// for pressure coupling, terms not related to bond order 
-					//  derivatives are added directly into pressure vector/tensor /
-					rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-					if (i >= j)
-						rvec_ScaledAdd( atoms[i].f, -1., temp );
-					else
-						rvec_Add( atoms[i].f, temp );
-					rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
-
-					//rvec_Add( data->ext_press, ext_press );
-					rvec_Copy (aux_ext_press [i], ext_press );
-
-					//TODO CHECK THIS
-				}
-
-
-
-			}
-
-			pj += VDW_THREADS_PER_ATOM;
-		}
-
-	}// if i < n condition
-
-	__syncthreads ();
-
-	if (laneid < 16) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-	}
-	__syncthreads ();
-	if (laneid < 8) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-	}
-	__syncthreads ();
-	if (laneid < 4) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-	}
-	__syncthreads ();
-	if (laneid < 2) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-	}
-	__syncthreads ();
-	if (laneid < 1) {
-		sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-		sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-		rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-	}
-	__syncthreads ();
-	if (laneid == 0) {
-		E_vdW [i] += sh_vdw[threadIdx.x];
-		E_Ele [i] += sh_ele[threadIdx.x];
-		rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
-	}
-
-
-	}
-
-
-
-
-	GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1( 	reax_atom *atoms, 
-			control_params *control,
-			simulation_data *data, 
-			list p_far_nbrs, 
-			real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-			LR_lookup_table *d_LR,
-			int num_atom_types,
-			int energy_update_freq,
-			int N )
-	{
-
-		extern __shared__ real _vdw[];
-		extern __shared__ real _ele[];
-
-		real *sh_vdw;
-		real *sh_ele;
-
-		int i, j, pj, r, steps, update_freq, update_energies;
-		int type_i, type_j, tmin, tmax;
-		int start_i, end_i;
-		real r_ij, self_coef, base, dif;
-		real e_vdW, e_ele;
-		real CEvd, CEclmb;
-		rvec temp, ext_press;
-		far_neighbor_data *nbr_pj;
-		LR_lookup_table *t;
-		list *far_nbrs = &p_far_nbrs;
-
-		i = blockIdx.x;
-
-		reax_atom local_atom;
-		local_atom.q =  atoms[i].q;
-		local_atom.type = atoms[i].type;
-
-		sh_vdw = _vdw;
-		sh_ele = _vdw + blockDim.x;
-
-		sh_vdw[threadIdx.x] = 0.0; 
-		sh_ele[threadIdx.x] = 0.0; 
-
-
-		steps = data->step - data->prev_steps;
-		update_freq = energy_update_freq;
-		update_energies = update_freq > 0 && steps % update_freq == 0;
-
-		type_i  = local_atom.type;
-		start_i = Start_Index(i,far_nbrs);
-		end_i   = End_Index(i,far_nbrs);
-
-		pj = start_i + threadIdx.x;
-
-		while (pj < end_i)
-		{
-			if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
-			{
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j      = nbr_pj->nbr;
-				type_j = atoms[j].type;
-				r_ij   = nbr_pj->d;
-				self_coef = (i == j) ? 0.5 : 1.0;
-				tmin  = MIN( type_i, type_j );
-				tmax  = MAX( type_i, type_j );
-				t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
-
-				/* Cubic Spline Interpolation */
-				r = (int)(r_ij * t->inv_dx);
-				if( r == 0 )  ++r;
-				base = (real)(r+1) * t->dx;
-				dif = r_ij - base;
-
-				if(( update_energies )) 
-				{
-					e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
-						t->vdW[r].a;
-					e_vdW *= self_coef;
-
-					e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
-						t->ele[r].a;
-					e_ele *= self_coef * local_atom.q * atoms[j].q;
-
-					sh_vdw [threadIdx.x] += e_vdW/2.0;
-					sh_ele [threadIdx.x] += e_ele/2.0;
-				}	
-			}
-
-			pj += blockDim.x;
-		}
-
-		// now do a reduce inside the warp for E_vdW, E_Ele and force.
-		if (threadIdx.x < 16) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 16];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 16];
-		}
-		if (threadIdx.x < 8) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 8];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 8];
-		}
-		if (threadIdx.x < 4) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 4];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 4];
-		}
-		if (threadIdx.x < 2) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 2];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 2];
-		}
-		if (threadIdx.x < 1) {
-			sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + 1];
-			sh_ele[threadIdx.x] += sh_ele[threadIdx.x + 1];
-		}
-		if (threadIdx.x == 0) {
-			E_vdW [i] += sh_vdw[0];
-			E_Ele [i] += sh_ele[0];
-		}
-
-	}
-
-
-
-
-
-
-	GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2( 	reax_atom *atoms, 
-			control_params *control,
-			simulation_data *data, 
-			list p_far_nbrs, 
-			real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
-			LR_lookup_table *d_LR,
-			int num_atom_types,
-			int energy_update_freq,
-			int N )
-	{
-
-		extern __shared__ rvec _force [];
-
-		rvec *sh_force;
-
-		int i, j, pj, r, steps, update_freq, update_energies;
-		int type_i, type_j, tmin, tmax;
-		int start_i, end_i;
-		real r_ij, self_coef, base, dif;
-		real e_vdW, e_ele;
-		real CEvd, CEclmb;
-		rvec temp, ext_press;
-		far_neighbor_data *nbr_pj;
-		LR_lookup_table *t;
-		list *far_nbrs = &p_far_nbrs;
-
-		i = blockIdx.x;
-
-		reax_atom local_atom;
-		local_atom.q =  atoms[i].q;
-		local_atom.type = atoms[i].type;
-
-		sh_force = _force;
-		rvec_MakeZero ( sh_force [threadIdx.x] );
-
+    /*
+     * Tabulated van der Waals + Coulomb interactions.  VDW_THREADS_PER_ATOM
+     * consecutive threads cooperate on one atom i: the lanes stride over i's
+     * far-neighbor list, keep energy and force partials in shared memory, and
+     * lane 0 adds the reduced totals to E_vdW[i], E_Ele[i] and atoms[i].f.
+     *
+     * What the body below requires from the launch:
+     *  - dynamic shared memory for 2*blockDim.x reals plus blockDim.x rvecs;
+     *  - VDW_THREADS_PER_ATOM a power of two (laneid is obtained by masking)
+     *    and blockDim.x a multiple of it, so that the threads of one atom sit
+     *    in the same block and the reduction stays inside that block's slots;
+     *  - atoms with index >= (total threads / VDW_THREADS_PER_ATOM) are skipped.
+     *
+     * Energies are accumulated only on steps selected by energy_update_freq.
+     */
+    extern __shared__ real _vdw[];
+
+    real *sh_vdw;
+    real *sh_ele;
+    rvec *sh_force;
+
+    int i, j, pj, r, steps, update_freq, update_energies;
+    int type_i, type_j, tmin, tmax;
+    int start_i, end_i;
+    real r_ij, self_coef, base, dif;
+    real e_vdW, e_ele;
+    real CEvd, CEclmb;
+    rvec temp, ext_press;
+    far_neighbor_data *nbr_pj;
+    LR_lookup_table *t;
+    list *far_nbrs = &p_far_nbrs;
+
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int warpid = thread_id / VDW_THREADS_PER_ATOM;
+    int laneid = thread_id & (VDW_THREADS_PER_ATOM -1);
+
+    i = warpid;
+    /* carve the single shared buffer: vdW partials, Coulomb partials, forces */
+    sh_vdw = _vdw;
+    sh_ele = _vdw + blockDim.x;
+    sh_force = (rvec *)( _vdw + 2*blockDim.x);
+    /* every thread clears its own slots, including threads with i >= N */
+    sh_vdw[threadIdx.x] = 0.0;
+    sh_ele[threadIdx.x] = 0.0;
+    rvec_MakeZero ( sh_force [threadIdx.x] );
+
+    if ( i < N )
+    {
+
+        reax_atom local_atom ;
+        local_atom.q =  atoms[i].q;
+        //local_atom.q =  d_far_data.q[i];
+        local_atom.type = atoms[i].type;
+        //local_atom.type = d_far_data.type[i];
+
+        /*
+           sh_vdw = _vdw;
+           sh_ele = _vdw + warpid;
+           sh_force = (rvec *)( _vdw + 2*warpid);
+
+           sh_vdw[threadIdx.x] = 0.0;
+           sh_ele[threadIdx.x] = 0.0;
+           rvec_MakeZero ( sh_force [threadIdx.x] );
+         */
+
+
+        steps = data->step - data->prev_steps;
+        update_freq = energy_update_freq;
+        update_energies = update_freq > 0 && steps % update_freq == 0;
+
+        //for( i = 0; i < system->N; ++i ) {
+        type_i  = local_atom.type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i   = End_Index(i,far_nbrs);
+
+        pj = start_i + laneid;
+
+        //for( pj = start_i; pj < end_i; ++pj )
+        while (pj < end_i)
+        {
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut )
+                //if( d_far_data.d[pj] <= control->r_cut )
+            {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j      = nbr_pj->nbr;
+                //j      = d_far_data.nbrs[pj];
+                type_j = atoms[j].type;
+                //type_j = d_far_data.type[j];
+                r_ij   = nbr_pj->d;
+                //r_ij   = d_far_data.d[pj];
+                self_coef = (i == j) ? 0.5 : 1.0;
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] );
+
+                //TODO
+                //CHANGE ORIGINAL
+                //if (i <= j) { pj += blockDim.x; continue; }
+                //CHANGE ORIGINAL
+
+                /* Cubic Spline Interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r;
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+
+                if(( update_energies ))
+                {
+                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif +
+                        t->vdW[r].a;
+                    e_vdW *= self_coef;
+
+                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + t->ele[r].a;
+                    e_ele *= self_coef * local_atom.q * atoms[j].q;
+
+
+                    //data->E_vdW += e_vdW;
+                    //TODO
+                    //E_vdW [i] += e_vdW / 2.0;
+                    //E_vdW [i] = __dadd_rd (E_vdW [i], e_vdW/2.0);
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+                    //E_vdW [i] += e_vdW;
+
+                    //TODO
+                    //data->E_Ele += e_ele;
+                    //E_Ele [i] += e_ele / 2.0;
+                    //E_Ele [i] = __dadd_rd ( E_Ele [i], e_ele / 2.0);
+                    sh_ele [threadIdx.x] += e_ele/2.0;
+                    //E_Ele [i] += e_ele;
+                }
+
+                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif +
+                    t->CEvd[r].a;
+                CEvd *= self_coef;
+
+                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif +
+                    t->CEclmb[r].a;
+                CEclmb *= self_coef * local_atom.q * atoms[j].q;
+                //CEclmb *= self_coef * local_atom.q * d_far_data.q[j];
+
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT) {
+                    if ( i >= j)
+                        //rvec_ScaledAdd( atoms[i].f, -(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
+                    //rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), d_far_data.dvec[pj] );
+                    else
+                        //rvec_ScaledAdd( atoms[i].f, +(CEvd + CEclmb), nbr_pj->dvec );
+                        rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
+                    //rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), d_far_data.dvec[pj] );
+                }
+                else { // NPT, iNPT or sNPT
+                    // for pressure coupling, terms not related to bond order
+                    //  derivatives are added directly into pressure vector/tensor /
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    /* Accumulate into this thread's shared slot: all lanes of the
+                     * group work on the same atom i, so updating atoms[i].f here
+                     * without atomics would be a data race.  The reduced sum is
+                     * added to atoms[i].f once, after the loop. */
+                    if (i >= j)
+                        rvec_ScaledAdd( sh_force [threadIdx.x], -1., temp );
+                    else
+                        rvec_Add( sh_force [threadIdx.x], temp );
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+
+                    /* NOTE(review): this overwrites aux_ext_press[i] for every
+                     * neighbor and from every lane, so only one contribution
+                     * survives.  Left unchanged here -- confirm whether an
+                     * accumulation over all neighbors was intended. */
+                    rvec_Copy (aux_ext_press [i], ext_press );
+                }
+
+            }
+
+            pj += VDW_THREADS_PER_ATOM;
+        }
+
+    }// if i < N condition
+
+    __syncthreads (); /* all partials are written before any cross-thread read */
+
+    /* Tree reduction inside each atom group.  Starting at half the group width
+     * keeps every read within the group's own slots, whatever power of two
+     * VDW_THREADS_PER_ATOM is. */
+    for (int stride = VDW_THREADS_PER_ATOM >> 1; stride > 0; stride >>= 1) {
+        if (laneid < stride) {
+            sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + stride];
+            sh_ele[threadIdx.x] += sh_ele[threadIdx.x + stride];
+            rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + stride] );
+        }
+        /* outside the lane test so that every thread of the block reaches it */
+        __syncthreads ();
+    }
+
+    /* Threads past the last atom also get here (they have to take part in the
+     * barriers above), so the global writes are guarded: without the i < N
+     * test their lane 0 would update E_vdW, E_Ele and atoms beyond index N-1. */
+    if (laneid == 0 && i < N) {
+        E_vdW [i] += sh_vdw[threadIdx.x];
+        E_Ele [i] += sh_ele[threadIdx.x];
+        rvec_Add (atoms[i].f, sh_force [ threadIdx.x ]);
+    }
+}
+
+
+
+
+    /*
+     * Energy-only variant of the tabulated vdW/Coulomb kernel: block i handles
+     * atom i and its threads stride over the atom's far-neighbor list.  Forces
+     * and pressure terms are not touched here (aux_ext_press is unused).
+     *
+     * Needs dynamic shared memory for 2*blockDim.x reals; the block-wide tree
+     * reduction below assumes blockDim.x is a power of two.
+     */
+    GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_1(     reax_atom *atoms, 
+            control_params *control,
+            simulation_data *data, 
+            list p_far_nbrs, 
+            real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+            LR_lookup_table *d_LR,
+            int num_atom_types,
+            int energy_update_freq,
+            int N )
+    {
+        extern __shared__ real _vdw[];
+
+        real *sh_vdw;
+        real *sh_ele;
+
+        int i, j, pj, r, steps, update_freq, update_energies;
+        int type_i, type_j, tmin, tmax;
+        int start_i, end_i;
+        real r_ij, self_coef, base, dif;
+        real e_vdW, e_ele;
+        far_neighbor_data *nbr_pj;
+        LR_lookup_table *t;
+        list *far_nbrs = &p_far_nbrs;
+
+        i = blockIdx.x;
+        /* whole blocks past the last atom leave together, before any barrier */
+        if (i >= N) return;
+
+        reax_atom local_atom;
+        local_atom.q =  atoms[i].q;
+        local_atom.type = atoms[i].type;
+
+        /* first blockDim.x reals hold vdW partials, the next blockDim.x Coulomb */
+        sh_vdw = _vdw;
+        sh_ele = _vdw + blockDim.x;
+
+        sh_vdw[threadIdx.x] = 0.0;
+        sh_ele[threadIdx.x] = 0.0;
+
+        /* energies are evaluated only when the number of steps since
+         * data->prev_steps is a multiple of a positive energy_update_freq */
+        steps = data->step - data->prev_steps;
+        update_freq = energy_update_freq;
+        update_energies = update_freq > 0 && steps % update_freq == 0;
+
+        type_i  = local_atom.type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i   = End_Index(i,far_nbrs);
+
+        /* thread k visits neighbors start_i + k, start_i + k + blockDim.x, ... */
+        pj = start_i + threadIdx.x;
+
+        while (pj < end_i)
+        {
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
+            {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j      = nbr_pj->nbr;
+                type_j = atoms[j].type;
+                r_ij   = nbr_pj->d;
+                self_coef = (i == j) ? 0.5 : 1.0;
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
+
+                /* Cubic Spline Interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r;
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
+
+                if(( update_energies )) 
+                {
+                    e_vdW = ((t->vdW[r].d*dif + t->vdW[r].c)*dif + t->vdW[r].b)*dif + 
+                        t->vdW[r].a;
+                    e_vdW *= self_coef;
+
+                    e_ele = ((t->ele[r].d*dif + t->ele[r].c)*dif + t->ele[r].b)*dif + 
+                        t->ele[r].a;
+                    e_ele *= self_coef * local_atom.q * atoms[j].q;
+
+                    sh_vdw [threadIdx.x] += e_vdW/2.0;
+                    sh_ele [threadIdx.x] += e_ele/2.0;
+                }    
+            }
+
+            pj += blockDim.x;
+        }
+
+        /* Block-wide tree reduction of the energy partials.  Each step reads a
+         * slot written by another thread, so a barrier is needed before the
+         * first step and after every step. */
+        __syncthreads ();
+        for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+            if (threadIdx.x < stride) {
+                sh_vdw[threadIdx.x] += sh_vdw[threadIdx.x + stride];
+                sh_ele[threadIdx.x] += sh_ele[threadIdx.x + stride];
+            }
+            __syncthreads ();
+        }
+
+        /* slot 0 now holds the block totals for atom i */
+        if (threadIdx.x == 0) {
+            E_vdW [i] += sh_vdw[0];
+            E_Ele [i] += sh_ele[0];
+        }
+    }
+
+
+
+
+
+
+    /*
+     * Force-only variant of the tabulated vdW/Coulomb kernel: block i handles
+     * atom i, its threads stride over the atom's far-neighbor list, and the
+     * per-thread force partials are reduced in shared memory (dynamic, one rvec
+     * per thread; the tree reduction assumes blockDim.x is a power of two).
+     */
+    GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy_2(     reax_atom *atoms, 
+            control_params *control,
+            simulation_data *data, 
+            list p_far_nbrs, 
+            real *E_vdW, real *E_Ele, rvec *aux_ext_press, 
+            LR_lookup_table *d_LR,
+            int num_atom_types,
+            int energy_update_freq,
+            int N )
+    {
+        extern __shared__ rvec _force [];
+
+        rvec *sh_force;
+
+        int i, j, pj, r;
+        int type_i, type_j, tmin, tmax;
+        int start_i, end_i;
+        real r_ij, self_coef, base, dif;
+        real CEvd, CEclmb;
+        rvec temp, ext_press;
+        far_neighbor_data *nbr_pj;
+        LR_lookup_table *t;
+        list *far_nbrs = &p_far_nbrs;
+
+        i = blockIdx.x;
+        /* whole blocks past the last atom leave together, before any barrier */
+        if (i >= N) return;
+
+        reax_atom local_atom;
+        local_atom.q =  atoms[i].q;
+        local_atom.type = atoms[i].type;
+
+        sh_force = _force;
+        rvec_MakeZero ( sh_force [threadIdx.x] );
 
-		steps = data->step - data->prev_steps;
-		update_freq = energy_update_freq;
-		update_energies = update_freq > 0 && steps % update_freq == 0;
 
-		//for( i = 0; i < system->N; ++i ) {
-		type_i  = local_atom.type;
-		start_i = Start_Index(i,far_nbrs);
-		end_i   = End_Index(i,far_nbrs);
+        /* this block's atom: its type and the bounds of its far-neighbor list */
+        type_i  = local_atom.type;
+        start_i = Start_Index(i,far_nbrs);
+        end_i   = End_Index(i,far_nbrs);
 
-		pj = start_i + threadIdx.x;
+        pj = start_i + threadIdx.x;
 
-		while (pj < end_i)
-		{
-			if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
-			{
-				nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-				j      = nbr_pj->nbr;
-				type_j = atoms[j].type;
-				r_ij   = nbr_pj->d;
-				self_coef = (i == j) ? 0.5 : 1.0;
-				tmin  = MIN( type_i, type_j );
-				tmax  = MAX( type_i, type_j );
-				t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
+        while (pj < end_i)
+        {
+            if( far_nbrs->select.far_nbr_list[pj].d <= control->r_cut ) 
+            {
+                nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+                j      = nbr_pj->nbr;
+                type_j = atoms[j].type;
+                r_ij   = nbr_pj->d;
+                self_coef = (i == j) ? 0.5 : 1.0;
+                tmin  = MIN( type_i, type_j );
+                tmax  = MAX( type_i, type_j );
+                t = &( d_LR[ index_lr (tmin,tmax,num_atom_types) ] ); 
 
-				/* Cubic Spline Interpolation */
-				r = (int)(r_ij * t->inv_dx);
-				if( r == 0 )  ++r;
-				base = (real)(r+1) * t->dx;
-				dif = r_ij - base;
+                /* Cubic Spline Interpolation */
+                r = (int)(r_ij * t->inv_dx);
+                if( r == 0 )  ++r;
+                base = (real)(r+1) * t->dx;
+                dif = r_ij - base;
 
-				CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
-					t->CEvd[r].a;
-				CEvd *= self_coef;
+                CEvd = ((t->CEvd[r].d*dif + t->CEvd[r].c)*dif + t->CEvd[r].b)*dif + 
+                    t->CEvd[r].a;
+                CEvd *= self_coef;
 
-				CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
-					t->CEclmb[r].a;
-				CEclmb *= self_coef * local_atom.q * atoms[j].q;
+                CEclmb = ((t->CEclmb[r].d*dif+t->CEclmb[r].c)*dif+t->CEclmb[r].b)*dif + 
+                    t->CEclmb[r].a;
+                CEclmb *= self_coef * local_atom.q * atoms[j].q;
 
-				if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
-					if ( i >= j)
-						rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
-					else 
-						rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
-				}
-				else { // NPT, iNPT or sNPT
-					// for pressure coupling, terms not related to bond order 
-					//  derivatives are added directly into pressure vector/tensor /
-					rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
-					if (i >= j)
-						rvec_ScaledAdd( atoms[i].f, -1., temp );
-					else
-						rvec_Add( atoms[i].f, temp );
-					rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
+                if( control->ensemble == NVE || control->ensemble == NVT || control->ensemble == bNVT ) {
+                    if ( i >= j)
+                        rvec_ScaledAdd( sh_force [threadIdx.x], -(CEvd + CEclmb), nbr_pj->dvec );
+                    else 
+                        rvec_ScaledAdd( sh_force [threadIdx.x], +(CEvd + CEclmb), nbr_pj->dvec );
+                }
+                else { // NPT, iNPT or sNPT
+                    // for pressure coupling, terms not related to bond order 
+                    //  derivatives are added directly into pressure vector/tensor /
+                    rvec_Scale( temp, CEvd + CEclmb, nbr_pj->dvec );
+                    /* accumulate in this thread's shared slot: every thread of the block
+                     * works on atom i, so atoms[i].f must not be updated concurrently */
+                    if (i >= j)
+                        rvec_ScaledAdd( sh_force [threadIdx.x], -1., temp );
+                    else
+                        rvec_Add( sh_force [threadIdx.x], temp );
+                    rvec_iMultiply( ext_press, nbr_pj->rel_box, temp );
 
-					rvec_Copy (aux_ext_press [i], ext_press );
-				}
-			}
+                    /* NOTE(review): overwritten for every neighbor and by every
+                     * thread, so only one contribution survives -- confirm intent */
+                    rvec_Copy (aux_ext_press [i], ext_press );
+                }
+            }
 
-			pj += blockDim.x;
-		}
+            pj += blockDim.x;
+        }
 
 
-		// now do a reduce inside the warp for E_vdW, E_Ele and force.
-		if (threadIdx.x < 16) {
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 16] );
-		}
-		if (threadIdx.x < 8) {
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 8] );
-		}
-		if (threadIdx.x < 4) {
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 4] );
-		}
-		if (threadIdx.x < 2) {
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 2] );
-		}
-		if (threadIdx.x < 1) {
-			rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + 1] );
-		}
-		if (threadIdx.x == 0) {
-			rvec_Add (atoms[i].f, sh_force [ 0 ]);
-		}
+        /* Block-wide tree reduction of the force partials.  Each step reads a
+         * slot written by another thread, so a barrier is needed before the
+         * first step and after every step. */
+        __syncthreads ();
+        for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+            if (threadIdx.x < stride)
+                rvec_Add (sh_force [threadIdx.x], sh_force [threadIdx.x + stride] );
+            __syncthreads ();
+        }
+
+        if (threadIdx.x == 0) {
+            rvec_Add (atoms[i].f, sh_force [ 0 ]);
+        }
 
 
-	}
+    }
 
 
 
@@ -1613,18 +1613,18 @@ GLOBAL void Cuda_Tabulated_vdW_Coulomb_Energy( 	reax_atom *atoms,
 
 
 #if defined(OLD)
-	/* Linear extrapolation */
-	/*p     = (r_ij * t->inv_dx;
-	  r     = (int) p;
-	  prev  = &( t->y[r] );
-	  next  = &( t->y[r+1] );
-
-	  tmp    = p - r;
-	  e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
-	  CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
-
-	  e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
-	  e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
-	  CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
-	  CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
+    /* Linear interpolation between neighbouring table entries */
+    /*p     = (r_ij * t->inv_dx;
+      r     = (int) p;
+      prev  = &( t->y[r] );
+      next  = &( t->y[r+1] );
+
+      tmp    = p - r;
+      e_vdW  = self_coef * (prev->e_vdW + tmp*(next->e_vdW - prev->e_vdW ));
+      CEvd   = self_coef * (prev->CEvd  + tmp*(next->CEvd  - prev->CEvd  ));
+
+      e_ele  = self_coef * (prev->e_ele + tmp*(next->e_ele - prev->e_ele ));
+      e_ele  = e_ele  * system->atoms[i].q * system->atoms[j].q;
+      CEclmb = self_coef * (prev->CEclmb+tmp*(next->CEclmb - prev->CEclmb));
+      CEclmb = CEclmb * system->atoms[i].q * system->atoms[j].q;*/
 #endif
diff --git a/PuReMD-GPU/src/validation.cu b/PuReMD-GPU/src/validation.cu
index c5497977..f8261555 100644
--- a/PuReMD-GPU/src/validation.cu
+++ b/PuReMD-GPU/src/validation.cu
@@ -29,1931 +29,1931 @@
 
 bool check_zero (real p1, real p2)
 {
-	if (abs (p1 - p2) >= GPU_TOLERANCE)
-		return true;
-	else 
-		return false;
+    if (abs (p1 - p2) >= GPU_TOLERANCE)
+        return true;
+    else 
+        return false;
 }
 
 bool check_zero (rvec p1, rvec p2)
 {
 
-	if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
-			((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) ||
-			((abs (p1[2] - p2[2])) >= GPU_TOLERANCE ))
-		return true;
-	else return false;
+    if (((abs (p1[0] - p2[0])) >= GPU_TOLERANCE) ||
+            ((abs (p1[1] - p2[1])) >= GPU_TOLERANCE) ||
+            ((abs (p1[2] - p2[2])) >= GPU_TOLERANCE ))
+        return true;
+    else return false;
 }
 
 bool check_same (ivec p1, ivec p2)
 {
-	if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) )
-		return true;
-	else 
-		return false;
+    if ( (p1[0] == p2[0]) || (p1[1] == p2[1]) || (p1[2] == p2[2]) )
+        return true;
+    else 
+        return false;
 }
 
 bool validate_box (simulation_box *host, simulation_box *dev)
 {
 
-	simulation_box test;
+    simulation_box test;
 
-	copy_host_device (&test, dev, SIMULATION_BOX_SIZE, cudaMemcpyDeviceToHost, RES_SYSTEM_SIMULATION_BOX );
+    copy_host_device (&test, dev, SIMULATION_BOX_SIZE, cudaMemcpyDeviceToHost, RES_SYSTEM_SIMULATION_BOX );
 
-	if (memcmp (&test, host, SIMULATION_BOX_SIZE)) {
-		fprintf (stderr, " Simulation box is not in synch between host and device \n");
-		return false;
-	}
+    if (memcmp (&test, host, SIMULATION_BOX_SIZE)) {
+        fprintf (stderr, " Simulation box is not in synch between host and device \n");
+        return false;
+    }
 
-	fprintf (stderr, " Simulation box is in **synch** between host and device \n");
-	return true;
+    fprintf (stderr, " Simulation box is in **synch** between host and device \n");
+    return true;
 }
 
 bool validate_atoms (reax_system *system, list **lists)
 {
 
-	int start, end, index, count, miscount;
-	reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
-	copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
-
-	/*
-	   int *d_start, *d_end;
-	   bond_data *d_bond_data;
-	   list *d_bonds = dev_lists + BONDS;
-	   list *bonds = *lists + BONDS;
-
-	   d_end = (int *)malloc (sizeof (int) * system->N);
-	   d_start = (int *) malloc (sizeof (int) * system->N );
-	   d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-	   copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-	   copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-	   copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-
-	   count = 0;
-	   miscount = 0;
-	   for (int i = 0; i < 1; i++) {
-
-	   for (int j = d_start[i]; j < d_end[i]; j++) {
-	   bond_data *src, *tgt;
-	   src = &d_bond_data[j];
-	   tgt = &d_bond_data[ src->dbond_index ];
-
-	   fprintf (stderr, "Atom %d f neighbor %d vector (%e %e %e) thbh count %d \n", i, src->nbr, tgt->f[0], tgt->f[1], tgt->f[2], src->scratch );
-	   }
-	   }
-	   exit (-1);
-	 */
-
-	//if (memcmp (test, system->atoms, REAX_ATOM_SIZE * system->N)) {
-	count = miscount = 0;
-	for (int i = 0; i < system->N; i++) 
-	{
-		if (test[i].type != system->atoms[i].type) {
-			fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->atoms[i].type, test[i].type, i);
-			exit (-1);
-		}
-
-		if ( 	check_zero (test[i].x, system->atoms[i].x) )
-		{
-			fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i,
-					system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], 
-					test[i].x[0], test[i].x[1], test[i].x[2] );
-			miscount ++;
-			exit (-1);
-		}
-		if (		check_zero (test[i].v, system->atoms[i].v) )
-		{
-			fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
-					system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], 
-					test[i].v[0], test[i].v[1], test[i].v[2] );
-			miscount ++;
-			exit (-1);
-		}
-		if (		check_zero (test[i].f, system->atoms[i].f) )
-		{
-			fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
-					system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2], 
-					test[i].f[0], test[i].f[1], test[i].f[2] );
-			miscount ++;
-			exit (-1);
-		}
-
-		if (		check_zero (test[i].q, system->atoms[i].q) )
-		{
-			fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i,
-					system->atoms[i].q, test[i].q );
-			miscount ++;
-			exit (-1);
-		}
-
-		count ++;
-	}
-
-	//fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount);
-
-	free (test);
-	return true;
+    int start, end, index, count, miscount;
+    reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
+    copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+
+    /*
+       int *d_start, *d_end;
+       bond_data *d_bond_data;
+       list *d_bonds = dev_lists + BONDS;
+       list *bonds = *lists + BONDS;
+
+       d_end = (int *)malloc (sizeof (int) * system->N);
+       d_start = (int *) malloc (sizeof (int) * system->N );
+       d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+
+       copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+       copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+       copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+
+       count = 0;
+       miscount = 0;
+       for (int i = 0; i < 1; i++) {
+
+       for (int j = d_start[i]; j < d_end[i]; j++) {
+       bond_data *src, *tgt;
+       src = &d_bond_data[j];
+       tgt = &d_bond_data[ src->dbond_index ];
+
+       fprintf (stderr, "Atom %d f neighbor %d vector (%e %e %e) thbh count %d \n", i, src->nbr, tgt->f[0], tgt->f[1], tgt->f[2], src->scratch );
+       }
+       }
+       exit (-1);
+     */
+
+    //if (memcmp (test, system->atoms, REAX_ATOM_SIZE * system->N)) {
+    count = miscount = 0;
+    for (int i = 0; i < system->N; i++) 
+    {
+        if (test[i].type != system->atoms[i].type) {
+            fprintf (stderr, " Type does not match (%d %d) @ index %d \n", system->atoms[i].type, test[i].type, i);
+            exit (-1);
+        }
+
+        if (     check_zero (test[i].x, system->atoms[i].x) )
+        {
+            fprintf (stderr, "Atom :%d x --> host (%f %f %f) device (%f %f %f) \n", i,
+                    system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2], 
+                    test[i].x[0], test[i].x[1], test[i].x[2] );
+            miscount ++;
+            exit (-1);
+        }
+        if (        check_zero (test[i].v, system->atoms[i].v) )
+        {
+            fprintf (stderr, "Atom :%d v --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
+                    system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2], 
+                    test[i].v[0], test[i].v[1], test[i].v[2] );
+            miscount ++;
+            exit (-1);
+        }
+        if (        check_zero (test[i].f, system->atoms[i].f) )
+        {
+            fprintf (stderr, "Atom :%d f --> host (%6.10f %6.10f %6.10f) device (%6.10f %6.10f %6.10f) \n", i,
+                    system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2], 
+                    test[i].f[0], test[i].f[1], test[i].f[2] );
+            miscount ++;
+            exit (-1);
+        }
+
+        if (        check_zero (test[i].q, system->atoms[i].q) )
+        {
+            fprintf (stderr, "Atom :%d q --> host (%f) device (%f) \n", i,
+                    system->atoms[i].q, test[i].q );
+            miscount ++;
+            exit (-1);
+        }
+
+        count ++;
+    }
+
+    //fprintf (stderr, "Reax Atoms DOES **match** between host and device --> %d miscount --> %d \n", count, miscount);
+
+    free (test);
+    return true;
 }
 
 void Print_Matrix( sparse_matrix *A )
 {
-	int i, j;
-	for( i = 0; i < 10; ++i ) { 
-		fprintf( stderr, "i:%d  j(val):", i );
+    int i, j;
+    for( i = 0; i < 10; ++i ) { 
+        fprintf( stderr, "i:%d  j(val):", i );
 
-		for( j = A->start[i]; j < A->end[i]; ++j )
-			fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val );
+        for( j = A->start[i]; j < A->end[i]; ++j )
+            fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val );
 
-		fprintf( stderr, "\n" );
-	}
+        fprintf( stderr, "\n" );
+    }
 }
 
 void Print_Matrix_L( sparse_matrix *A )
 {
-	int i, j;
-	for( i = 0; i < 10; ++i ) { 
-		fprintf( stderr, "i:%d  j(val):", i );
+    int i, j;
+    for( i = 0; i < 10; ++i ) { 
+        fprintf( stderr, "i:%d  j(val):", i );
 
-		for( j = A->start[i]; j < A->start[i+1]; ++j )
-			fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val );
+        for( j = A->start[i]; j < A->start[i+1]; ++j )
+            fprintf( stderr, "%d(%.4f) ", A->entries[j].j, A->entries[j].val );
 
-		fprintf( stderr, "\n" );
-	}
+        fprintf( stderr, "\n" );
+    }
 }
 
 
 bool validate_sort_matrix (reax_system *system, static_storage *workspace)
 {
-	sparse_matrix test;
-	int index, count;
-	test.start = (int *) malloc (INT_SIZE * (system->N + 1));
-	test.end = (int *) malloc (INT_SIZE * (system->N + 1));
-
-	test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries));
-	memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries);
-
-	copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, 
-			cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
-
-	//Print_Matrix ( &test );
-
-	for (int i = 0; i < system->N; i++)
-	{
-		int start = test.start[i];
-		int end = test.end [i];
-
-		//d_quick_sort ( & (test.entries[start]), 0, end - start - 1 );
-		for (int x = start; x < end-1; x++)
-			if (test.entries[x].j > test.entries[x+1].j) {
-				fprintf (stderr, "Matrix is not sorted for the entri %d \n", i );
-				exit (-1);
-			}
-	}
-	fprintf (stderr, " Done sorting with all the entries in the sparse matrix \n");
-
-	free (test.start);
-	free (test.end);
-	free (test.entries);
+    sparse_matrix test;
+    int index, count;
+    test.start = (int *) malloc (INT_SIZE * (system->N + 1));
+    test.end = (int *) malloc (INT_SIZE * (system->N + 1));
+
+    test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries));
+    memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries);
+
+    copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, 
+            cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
+
+    //Print_Matrix ( &test );
+
+    for (int i = 0; i < system->N; i++)
+    {
+        int start = test.start[i];
+        int end = test.end [i];
+
+        //d_quick_sort ( & (test.entries[start]), 0, end - start - 1 );
+        for (int x = start; x < end-1; x++)
+            if (test.entries[x].j > test.entries[x+1].j) {
+                fprintf (stderr, "Matrix is not sorted for the entri %d \n", i );
+                exit (-1);
+            }
+    }
+    fprintf (stderr, " Done sorting with all the entries in the sparse matrix \n");
+
+    free (test.start);
+    free (test.end);
+    free (test.entries);
 }
 
 
 bool validate_sparse_matrix( reax_system *system, static_storage *workspace )
 {
-	sparse_matrix test;
-	int index, count;
-	test.start = (int *) malloc (INT_SIZE * (system->N + 1));
-	test.end = (int *) malloc (INT_SIZE * (system->N + 1));
-
-	test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries));
-
-	memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries);
-	copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, 
-			cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
-
-	/*
-	   for (int i = 0 ; i < system->N; i++) {
-	   if ((test.end[i] - test.start[i]) != (workspace->H.start[i+1] - workspace->H.start[i])){
-	//if ((test.end[i] - test.start[i]) < 32 ){
-	fprintf (stderr, "Sparse Matrix gpu (%d %d) cpu (%d %d)\n", 
-	test.start[i], test.end[i], 
-	workspace->H.start[i], workspace->H.start[i+1]);
-	exit (-1);
-	}
-	}
-	 */
-	//fprintf (stderr, "Sparse Matrix COUNT matches between HOST and DEVICE \n");
-
-	count = 0;
-	for (int i = 0; i < system->N; i++) {
-		for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) {
-			sparse_matrix_entry *src = &workspace->H.entries[j];
-
-			for (int k = test.start[i]; k < test.end[i]; k++) {
-				sparse_matrix_entry *tgt = &test.entries [k];
-				if (src->j == tgt->j){
-					if ( check_zero (src->val, tgt->val)) {
-						index = test.start [i];
-						/*
-						   fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", 
-						   test.start[i-1], test.end[i-1], 
-						   workspace->H.start[i-1], workspace->H.start[i]);
-						   fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", 
-						   i, k, test.start[i], test.end[i], 
-						   workspace->H.start[i], workspace->H.start[i+1]);
-						   for (int x = workspace->H.start[i]; x < workspace->H.start[i+1]; x ++)
-						   {
-						   src = &workspace->H.entries[x];
-						   tgt = &test.entries [index];
-						   fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index);
-						   index ++;
-						   }
-						 */
-						fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n");
-						exit (-1);
-						count++;
-					} else break;
-				}
-			}
-		}
-	}
-
-	//fprintf (stderr, "Sparse Matrix mismatch count %d  \n", count);
-	free (test.start);
-	free (test.end);
-	free (test.entries);
-	return true;
+    sparse_matrix test;
+    int index, count;
+    test.start = (int *) malloc (INT_SIZE * (system->N + 1));
+    test.end = (int *) malloc (INT_SIZE * (system->N + 1));
+
+    test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries));
+
+    memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries);
+    copy_host_device ( test.entries, dev_workspace->H.entries, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, 
+            cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
+
+    /*
+       for (int i = 0 ; i < system->N; i++) {
+       if ((test.end[i] - test.start[i]) != (workspace->H.start[i+1] - workspace->H.start[i])){
+    //if ((test.end[i] - test.start[i]) < 32 ){
+    fprintf (stderr, "Sparse Matrix gpu (%d %d) cpu (%d %d)\n", 
+    test.start[i], test.end[i], 
+    workspace->H.start[i], workspace->H.start[i+1]);
+    exit (-1);
+    }
+    }
+     */
+    //fprintf (stderr, "Sparse Matrix COUNT matches between HOST and DEVICE \n");
+
+    count = 0;
+    for (int i = 0; i < system->N; i++) {
+        for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) {
+            sparse_matrix_entry *src = &workspace->H.entries[j];
+
+            for (int k = test.start[i]; k < test.end[i]; k++) {
+                sparse_matrix_entry *tgt = &test.entries [k];
+                if (src->j == tgt->j){
+                    if ( check_zero (src->val, tgt->val)) {
+                        index = test.start [i];
+                        /*
+                           fprintf (stderr, " i-1 (%d %d ) (%d %d) \n", 
+                           test.start[i-1], test.end[i-1], 
+                           workspace->H.start[i-1], workspace->H.start[i]);
+                           fprintf (stderr, " Sparse matrix entry does not match for atom %d at index %d (%d %d) (%d %d) \n", 
+                           i, k, test.start[i], test.end[i], 
+                           workspace->H.start[i], workspace->H.start[i+1]);
+                           for (int x = workspace->H.start[i]; x < workspace->H.start[i+1]; x ++)
+                           {
+                           src = &workspace->H.entries[x];
+                           tgt = &test.entries [index];
+                           fprintf (stderr, " cpu (%d %f)**** <--> gpu (%d %f) index %d \n", src->j, src->val, tgt->j, tgt->val, index);
+                           index ++;
+                           }
+                         */
+                        fprintf (stderr, "Sparse Matrix DOES NOT match between device and host \n");
+                        exit (-1);
+                        count++;
+                    } else break;
+                }
+            }
+        }
+    }
+
+    //fprintf (stderr, "Sparse Matrix mismatch count %d  \n", count);
+    free (test.start);
+    free (test.end);
+    free (test.entries);
+    return true;
 }
 
 bool validate_lu (static_storage *workspace)
 {
-	sparse_matrix test;
-	int index, count;
-
-	test.start = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1));
-	test.end = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1));
-	test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->L.m));
-
-	memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m);
-	copy_host_device ( test.entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m, cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.start, dev_workspace->L.start, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.end , dev_workspace->L.end, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
-
-	count = 0;
-	for (int i = 0; i < workspace->L.n; i ++)
-	{
-		if (workspace->L.start[i] != test.start[i]){
-			fprintf (stderr, "L -- Count does not match for index %d \n", i);
-			exit (-1);
-		}
-
-		for (int j = workspace->L.start[i]; j < workspace->L.start[i+1]; j++) 
-		{
-			if (check_zero (workspace->L.entries [j].val, test.entries[j].val) || 
-					workspace->L.entries[j].j != test.entries [j].j)
-			{
-				fprintf (stderr, "L -- J or value does not match for the index %d \n", i);
-				count ++;
-				exit (-1);
-			}
-		}
-	}
-
-	test.start = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1));
-	test.end = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1));
-	test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->U.m));
-
-	memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m);
-	copy_host_device ( test.entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m, cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.start, dev_workspace->U.start, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.end , dev_workspace->U.end, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
-
-	count = 0;
-	for (int i = 0; i < workspace->U.n; i ++)
-	{
-		if (workspace->U.start[i] != test.start[i]){
-			fprintf (stderr, "U -- Count does not match for index %d \n", i);
-			exit (-1);
-		}
-
-		for (int j = workspace->U.start[i]; j < workspace->U.start[i+1]; j++) 
-		{
-			if (check_zero (workspace->U.entries [j].val, test.entries[j].val) || 
-					workspace->U.entries[j].j != test.entries [j].j)
-			{
-				fprintf (stderr, "U -- J or value does not match for the index %d \n", i);
-				count ++;
-				exit (-1);
-			}
-		}
-	}
-
-	//fprintf (stderr, "L and U match on device and host \n");
-	return true;
+    sparse_matrix test;
+    int index, count;
+
+    test.start = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1));
+    test.end = (int *) malloc (INT_SIZE * (dev_workspace->L.n + 1));
+    test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->L.m));
+
+    memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m);
+    copy_host_device ( test.entries, dev_workspace->L.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->L.m, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.start, dev_workspace->L.start, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.end , dev_workspace->L.end, INT_SIZE * (dev_workspace->L.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
+
+    count = 0;
+    for (int i = 0; i < workspace->L.n; i ++)
+    {
+        if (workspace->L.start[i] != test.start[i]){
+            fprintf (stderr, "L -- Count does not match for index %d \n", i);
+            exit (-1);
+        }
+
+        for (int j = workspace->L.start[i]; j < workspace->L.start[i+1]; j++) 
+        {
+            if (check_zero (workspace->L.entries [j].val, test.entries[j].val) || 
+                    workspace->L.entries[j].j != test.entries [j].j)
+            {
+                fprintf (stderr, "L -- J or value does not match for the index %d \n", i);
+                count ++;
+                exit (-1);
+            }
+        }
+    }
+
+    test.start = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1));
+    test.end = (int *) malloc (INT_SIZE * (dev_workspace->U.n + 1));
+    test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (dev_workspace->U.m));
+
+    memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m);
+    copy_host_device ( test.entries, dev_workspace->U.entries, SPARSE_MATRIX_ENTRY_SIZE * dev_workspace->U.m, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.start, dev_workspace->U.start, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.end , dev_workspace->U.end, INT_SIZE * (dev_workspace->U.n + 1), cudaMemcpyDeviceToHost, __LINE__ );
+
+    count = 0;
+    for (int i = 0; i < workspace->U.n; i ++)
+    {
+        if (workspace->U.start[i] != test.start[i]){
+            fprintf (stderr, "U -- Count does not match for index %d \n", i);
+            exit (-1);
+        }
+
+        for (int j = workspace->U.start[i]; j < workspace->U.start[i+1]; j++) 
+        {
+            if (check_zero (workspace->U.entries [j].val, test.entries[j].val) || 
+                    workspace->U.entries[j].j != test.entries [j].j)
+            {
+                fprintf (stderr, "U -- J or value does not match for the index %d \n", i);
+                count ++;
+                exit (-1);
+            }
+        }
+    }
+
+    //fprintf (stderr, "L and U match on device and host \n");
+    return true;
 }
 
 void print_sparse_matrix (reax_system *system, static_storage *workspace)
 {
-	sparse_matrix test;
-	int index, count;
-
-	test.start = (int *) malloc (INT_SIZE * (system->N + 1));
-	test.end = (int *) malloc (INT_SIZE * (system->N + 1));
-
-	test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries));
-	memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries);
-
-	test.j = (int *)  malloc (INT_SIZE * (system->N * system->max_sparse_matrix_entries));
-	test.val = (real *)  malloc (REAL_SIZE * (system->N * system->max_sparse_matrix_entries));
-
-	copy_host_device ( test.entries, dev_workspace->H.entries, 
-			SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
-
-	copy_host_device ( test.j , dev_workspace->H.j, INT_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ );
-	copy_host_device ( test.val , dev_workspace->H.val, REAL_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ );
-
-	count = 0;
-	for (int i = 0; i < 1; i++) {
-		//for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) {
-		//	sparse_matrix_entry *src = &workspace->H.entries[j];
-		//	fprintf (stderr, " cpu (%d %f) \n", src->j, src->val);
-		//}
-		//fprintf (stderr, " start: %d -- end: %d  ------- count %d\n", test.start[i], test.end[i], test.end[i] - test.start[i]);
-		for (int j = test.start[i]; j < test.end[i]; j++) {
-			//sparse_matrix_entry *src = &test.entries[j];
-			//fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val);
-			fprintf (stderr, "Row:%d:%d:%f\n", i, test.j[j], test.val[j]);
-		}
-
-		//if (test.end[i] - test.start[i] > 500 )
-		//	fprintf (stderr, " Row -- %d,  count %d \n", i, test.end[i] - test.start[i] );
-	}
-	fprintf (stderr, "--------------- ");
-
-	free (test.start);
-	free (test.end);
-	free (test.entries);
-	free (test.j);
-	free (test.val);
+    sparse_matrix test;
+    int index, count;
+
+    test.start = (int *) malloc (INT_SIZE * (system->N + 1));
+    test.end = (int *) malloc (INT_SIZE * (system->N + 1));
+
+    test.entries = (sparse_matrix_entry *) malloc (SPARSE_MATRIX_ENTRY_SIZE * (system->N * system->max_sparse_matrix_entries));
+    memset (test.entries, 0xFF, SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries);
+
+    test.j = (int *)  malloc (INT_SIZE * (system->N * system->max_sparse_matrix_entries));
+    test.val = (real *)  malloc (REAL_SIZE * (system->N * system->max_sparse_matrix_entries));
+
+    copy_host_device ( test.entries, dev_workspace->H.entries, 
+            SPARSE_MATRIX_ENTRY_SIZE * system->N * system->max_sparse_matrix_entries, cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.start, dev_workspace->H.start, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.end , dev_workspace->H.end, INT_SIZE * (system->N + 1), cudaMemcpyDeviceToHost, __LINE__ );
+
+    copy_host_device ( test.j , dev_workspace->H.j, INT_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ );
+    copy_host_device ( test.val , dev_workspace->H.val, REAL_SIZE * (system->N * system->max_sparse_matrix_entries), cudaMemcpyDeviceToHost, __LINE__ );
+
+    count = 0;
+    for (int i = 0; i < 1; i++) {
+        //for (int j = workspace->H.start[i]; j < workspace->H.start[i+1]; j++) {
+        //    sparse_matrix_entry *src = &workspace->H.entries[j];
+        //    fprintf (stderr, " cpu (%d %f) \n", src->j, src->val);
+        //}
+        //fprintf (stderr, " start: %d -- end: %d  ------- count %d\n", test.start[i], test.end[i], test.end[i] - test.start[i]);
+        for (int j = test.start[i]; j < test.end[i]; j++) {
+            //sparse_matrix_entry *src = &test.entries[j];
+            //fprintf (stderr, "Row:%d:%d:%f\n", i, src->j, src->val);
+            fprintf (stderr, "Row:%d:%d:%f\n", i, test.j[j], test.val[j]);
+        }
+
+        //if (test.end[i] - test.start[i] > 500 )
+        //    fprintf (stderr, " Row -- %d,  count %d \n", i, test.end[i] - test.start[i] );
+    }
+    fprintf (stderr, "--------------- ");
+
+    free (test.start);
+    free (test.end);
+    free (test.entries);
+    free (test.j);
+    free (test.val);
 }
 
 
 bool validate_bonds (reax_system *system, static_storage *workspace, list **lists)
 {
-	int start, end, index, count, miscount;
-	int *d_start, *d_end;
-	bond_data *d_bond_data;
-	list *d_bonds = dev_lists + BONDS;
-	list *bonds = *lists + BONDS;
-
-	d_end = (int *)malloc (sizeof (int) * system->N);
-	d_start = (int *) malloc (sizeof (int) * system->N );
-	d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-	//fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
-
-	copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-	count = 0;
-	for (int i = 0; i < system->N; i++) {
-		start = Start_Index (i, bonds);
-		end = End_Index (i, bonds);
-
-		count += end - start;
-		if ((end-start) != (d_end[i]-d_start[i])){
-			fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n", 
-					i, start, end, d_start[i], d_end[i]);
-			exit (-1);
-		}
-
-	}
-	fprintf (stderr, "BOND LIST COUNT match on device and host  count %d \n", count);
-
-	for (int i = 0; i < system->N-1; i++) {
-		if ( d_end[i] >= d_start[i+1] ){
-			fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i);
-			exit (-1);
-		}
-	}
-	//fprintf (stderr, " BOND LIST Overwrite *PASSED* \n");
-
-	count = 0;
-	miscount = 0;
-	for (int i = 0; i < system->N; i++) {
-
-		for (int j = d_start[i]; j < d_end[i]; j++) {
-			bond_data *src, *tgt;
-			src = &d_bond_data[j];
-			bond_data *src_sym = & d_bond_data[ src->sym_index ];
-
-			//Previously this was commented out. Thats why it was working.
-			//if (i >= src->nbr) continue;
-
-			int k = 0;
-			for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) {
-				tgt = & (bonds->select.bond_list[k]);
-				bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]);
-
-				if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) && 
-						!check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) {
-
-					bond_order_data *s, *t;
-					s = &(src->bo_data);
-					t = &(tgt->bo_data);
-
-					/*
-					   if (i == 45){
-					   fprintf (stderr, " Host %e for %d\n", t->BO, tgt->nbr);
-					   fprintf (stderr, " Device %e for %d\n", s->BO, src->nbr);
-					   }
-					 */
-
-					if (	!check_zero (s->BO,t->BO) && 
-							!check_zero (s->BO_s,t->BO_s) && 
-							!check_zero(s->BO_pi,t->BO_pi)  && 
-							!check_zero (s->BO_pi2,t->BO_pi2) &&
-							!check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) &&
-							!check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) &&
-							!check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) &&
-							!check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) &&
-							!check_zero (s->dln_BOp_s, t->dln_BOp_s ) && 
-							!check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) && 
-							!check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) && 
-							!check_zero (s->dBOp, t->dBOp )) {
-						count ++;
-
-						//Check the sym index and dbond index here for double checking
-						// bond_ij on both device and hosts are matched now. 
-						bond_order_data *ss, *ts;
-						ss = & (src_sym->bo_data );
-						ts = & (tgt_sym->bo_data );
-
-						if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) || 
-								check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box)
-								|| check_zero (ss->Cdbo, ts->Cdbo)){
-
-							fprintf (stderr, " Sym Index information does not match for atom %d \n", i);
-							fprintf (stderr, " atom --> %d \n", i);
-							fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr );
-							fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d );
-							fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr );
-							fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", 
-									src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2], 
-									tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] );
-							fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", 
-									src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2], 
-									tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] );
-
-							fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo );
-							exit (-1);
-						}
-
-						break;
-					}
-					fprintf (stderr, " d --> %f %f \n", src->d, tgt->d );
-					fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", 
-							src->dvec[0], src->dvec[1], src->dvec[2], 
-							tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] );
-					fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", 
-							src->rel_box[0], src->rel_box[1], src->rel_box[2], 
-							tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] );
-
-					fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i, 
-							src->nbr, tgt->nbr, 
-							s->BO, t->BO, 
-							s->BO_s, t->BO_s, 
-							s->BO_pi, t->BO_pi, 
-							s->BO_pi2, t->BO_pi2
-						);
-					fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2], 
-							t->dBOp[0], t->dBOp[1], t->dBOp[2] );
-
-					fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo );
-					fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi );
-					fprintf (stderr, " Cdbopi2 (%e %e) \n", s->Cdbopi2,t->Cdbopi2 );
-					fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo );
-					fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi);
-					fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2);
-					fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n", 
-							s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2],
-							t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] );
-					fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n", 
-							s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2],
-							t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] );
-					fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n", 
-							s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2],
-							t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] );
-
-					//exit (-1);
-				} 
-			}
-
-			if (k >= End_Index (i, bonds)) {
-				miscount ++;
-				fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j);
-				exit (-1);
-			}
-		}
-	}
-
-	fprintf (stderr, " Total bond order matched count %d miscount %d (%d) \n", count, miscount, (count+miscount));
-
-	/*
-	   for (int i = 5423; i < 5424; i++) {
-	   start = Start_Index (i, bonds);
-	   end = End_Index (i, bonds);
-
-	   index = d_start[i];
-
-	   fprintf (stderr, "Bond Count %d \n", end-start);
-	   for (int j = start; j < end; j++)
-	   {
-	   bond_data src, tgt;
-	   src = bonds->select.bond_list[j];
-	   tgt = d_bond_data[index];
-	   index ++;
-
-	//compare here
-	if ((src.nbr != tgt.nbr) || (src.d != tgt.d) ||
-	memcmp (src.rel_box, tgt.rel_box, IVEC_SIZE) || 
-	memcmp (src.dvec, tgt.dvec, RVEC_SIZE) ) {
-	fprintf (stderr, "Entries does not MATCH with bond data at atom %d index %d \r\n src ( %d %f (%d %d %d) (%f %f %f) )  tgt (%d %f (%d %d %d) (%f %f %f))\n",
-	i, j, 
-	src.nbr, src.d, src.rel_box[0], src.rel_box[1], src.rel_box[2], 
-	src.dvec[0], src.dvec[1], src.dvec[2],
-	tgt.nbr, tgt.d, tgt.rel_box[0], tgt.rel_box[1], tgt.rel_box[2], 
-	tgt.dvec[0], tgt.dvec[1], tgt.dvec[2] );
-	}
-	}
-	}
-	 */
-
-	//fprintf (stderr, "BOND LIST match on device and host \n");
-
-	free (d_start);
-	free (d_end);
-	free (d_bond_data);
-	return true;
+    int start, end, index, count, miscount;
+    int *d_start, *d_end;
+    bond_data *d_bond_data;
+    list *d_bonds = dev_lists + BONDS;
+    list *bonds = *lists + BONDS;
+
+    d_end = (int *)malloc (sizeof (int) * system->N);
+    d_start = (int *) malloc (sizeof (int) * system->N );
+    d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+    //fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
+
+    copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+    count = 0;
+    for (int i = 0; i < system->N; i++) {
+        start = Start_Index (i, bonds);
+        end = End_Index (i, bonds);
+
+        count += end - start;
+        if ((end-start) != (d_end[i]-d_start[i])){
+            fprintf (stderr, "Entries does NOT match --> atom %d: cpu (%d %d) gpu (%d %d) \n", 
+                    i, start, end, d_start[i], d_end[i]);
+            exit (-1);
+        }
+
+    }
+    fprintf (stderr, "BOND LIST COUNT match on device and host  count %d \n", count);
+
+    for (int i = 0; i < system->N-1; i++) {
+        if ( d_end[i] >= d_start[i+1] ){
+            fprintf (stderr, "Bonds list check Overwrite @ index --> %d \n", i);
+            exit (-1);
+        }
+    }
+    //fprintf (stderr, " BOND LIST Overwrite *PASSED* \n");
+
+    count = 0;
+    miscount = 0;
+    for (int i = 0; i < system->N; i++) {
+
+        for (int j = d_start[i]; j < d_end[i]; j++) {
+            bond_data *src, *tgt;
+            src = &d_bond_data[j];
+            bond_data *src_sym = & d_bond_data[ src->sym_index ];
+
+            //Previously this was commented out. Thats why it was working.
+            //if (i >= src->nbr) continue;
+
+            int k = 0;
+            for (k = Start_Index (i, bonds); k < End_Index (i, bonds); k++) {
+                tgt = & (bonds->select.bond_list[k]);
+                bond_data *tgt_sym = &(bonds->select.bond_list [tgt->sym_index]);
+
+                if ((src->nbr == tgt->nbr) && !check_zero (src->d,tgt->d) && 
+                        !check_zero (src->dvec,tgt->dvec) && check_same (src->rel_box, tgt->rel_box)) {
+
+                    bond_order_data *s, *t;
+                    s = &(src->bo_data);
+                    t = &(tgt->bo_data);
+
+                    /*
+                       if (i == 45){
+                       fprintf (stderr, " Host %e for %d\n", t->BO, tgt->nbr);
+                       fprintf (stderr, " Device %e for %d\n", s->BO, src->nbr);
+                       }
+                     */
+
+                    if (    !check_zero (s->BO,t->BO) && 
+                            !check_zero (s->BO_s,t->BO_s) && 
+                            !check_zero(s->BO_pi,t->BO_pi)  && 
+                            !check_zero (s->BO_pi2,t->BO_pi2) &&
+                            !check_zero (s->Cdbo,t->Cdbo) && !check_zero (s->Cdbopi,t->Cdbopi) && !check_zero (s->Cdbopi2,t->Cdbopi2) &&
+                            !check_zero (s->C1dbo,t->C1dbo) && !check_zero (s->C2dbo,t->C2dbo) && !check_zero (s->C3dbo,t->C3dbo) &&
+                            !check_zero(s->C1dbopi,t->C1dbopi) && !check_zero(s->C2dbopi,t->C2dbopi) && !check_zero(s->C3dbopi,t->C3dbopi) && !check_zero(s->C4dbopi,t->C4dbopi) &&
+                            !check_zero(s->C1dbopi2,t->C1dbopi2) && !check_zero(s->C2dbopi2,t->C2dbopi2) &&!check_zero(s->C3dbopi2,t->C3dbopi2) &&!check_zero(s->C4dbopi2,t->C4dbopi2) &&
+                            !check_zero (s->dln_BOp_s, t->dln_BOp_s ) && 
+                            !check_zero (s->dln_BOp_pi, t->dln_BOp_pi ) && 
+                            !check_zero (s->dln_BOp_pi2, t->dln_BOp_pi2 ) && 
+                            !check_zero (s->dBOp, t->dBOp )) {
+                        count ++;
+
+                        //Check the sym index and dbond index here for double checking
+                        // bond_ij on both device and hosts are matched now. 
+                        bond_order_data *ss, *ts;
+                        ss = & (src_sym->bo_data );
+                        ts = & (tgt_sym->bo_data );
+
+                        if ((src_sym->nbr != tgt_sym->nbr) || check_zero (src_sym->d,tgt_sym->d) || 
+                                check_zero (src_sym->dvec,tgt_sym->dvec) || !check_same (src_sym->rel_box, tgt_sym->rel_box)
+                                || check_zero (ss->Cdbo, ts->Cdbo)){
+
+                            fprintf (stderr, " Sym Index information does not match for atom %d \n", i);
+                            fprintf (stderr, " atom --> %d \n", i);
+                            fprintf (stderr, " nbr --> %d %d\n", src->nbr, tgt->nbr );
+                            fprintf (stderr, " d --> %f %f \n", src_sym->d, tgt_sym->d );
+                            fprintf (stderr, " sym Index nbr --> %d %d \n", src_sym->nbr, tgt_sym->nbr );
+                            fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", 
+                                    src_sym->dvec[0], src_sym->dvec[1], src_sym->dvec[2], 
+                                    tgt_sym->dvec[0], tgt_sym->dvec[1], tgt_sym->dvec[2] );
+                            fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", 
+                                    src_sym->rel_box[0], src_sym->rel_box[1], src_sym->rel_box[2], 
+                                    tgt_sym->rel_box[0], tgt_sym->rel_box[1], tgt_sym->rel_box[2] );
+
+                            fprintf (stderr, " sym index Cdbo (%4.10e %4.10e) \n", ss->Cdbo,ts->Cdbo );
+                            exit (-1);
+                        }
+
+                        break;
+                    }
+                    fprintf (stderr, " d --> %f %f \n", src->d, tgt->d );
+                    fprintf (stderr, " dvec (%f %f %f) (%f %f %f) \n", 
+                            src->dvec[0], src->dvec[1], src->dvec[2], 
+                            tgt->dvec[0], tgt->dvec[1], tgt->dvec[2] );
+                    fprintf (stderr, " ivec (%d %d %d) (%d %d %d) \n", 
+                            src->rel_box[0], src->rel_box[1], src->rel_box[2], 
+                            tgt->rel_box[0], tgt->rel_box[1], tgt->rel_box[2] );
+
+                    fprintf (stderr, "Bond_Order_Data does not match for atom %d neighbor (%d %d) BO (%e %e) BO_s (%e %e) BO_pi (%e %e) BO_pi2 (%e %e) \n", i, 
+                            src->nbr, tgt->nbr, 
+                            s->BO, t->BO, 
+                            s->BO_s, t->BO_s, 
+                            s->BO_pi, t->BO_pi, 
+                            s->BO_pi2, t->BO_pi2
+                        );
+                    fprintf (stderr, " dBOp (%e %e %e) (%e %e %e) \n", s->dBOp[0], s->dBOp[1], s->dBOp[2], 
+                            t->dBOp[0], t->dBOp[1], t->dBOp[2] );
+
+                    fprintf (stderr, " Cdbo (%4.10e %4.10e) \n", s->Cdbo,t->Cdbo );
+                    fprintf (stderr, " Cdbopi (%e %e) \n", s->Cdbopi,t->Cdbopi );
+                    fprintf (stderr, " Cdbopi2 (%e %e) \n", s->Cdbopi2,t->Cdbopi2 );
+                    fprintf (stderr, " C1dbo (%e %e %e)(%e %e %e) \n", s->C1dbo,s->C2dbo,s->C3dbo, t->C1dbo,t->C2dbo,t->C3dbo );
+                    fprintf (stderr, " C1dbopi (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi,s->C2dbopi,s->C3dbopi,s->C4dbopi, t->C1dbopi,t->C2dbopi,t->C3dbopi,t->C4dbopi);
+                    fprintf (stderr, " C1dbopi2 (%e %e %e %e) (%e %e %e %e)\n", s->C1dbopi2,s->C2dbopi2,s->C3dbopi2,s->C4dbopi2, t->C1dbopi2,t->C2dbopi2,t->C3dbopi2,t->C4dbopi2);
+                    fprintf (stderr, " dln_BOp_s (%e %e %e ) (%e %e %e) \n", 
+                            s->dln_BOp_s[0], s->dln_BOp_s[1], s->dln_BOp_s[2],
+                            t->dln_BOp_s[0], t->dln_BOp_s[1], t->dln_BOp_s[2] );
+                    fprintf (stderr, " dln_BOp_pi (%e %e %e ) (%e %e %e) \n", 
+                            s->dln_BOp_pi[0], s->dln_BOp_pi[1], s->dln_BOp_pi[2],
+                            t->dln_BOp_pi[0], t->dln_BOp_pi[1], t->dln_BOp_pi[2] );
+                    fprintf (stderr, " dln_BOp_pi2 (%e %e %e ) (%e %e %e) \n", 
+                            s->dln_BOp_pi2[0], s->dln_BOp_pi2[1], s->dln_BOp_pi2[2],
+                            t->dln_BOp_pi2[0], t->dln_BOp_pi2[1], t->dln_BOp_pi2[2] );
+
+                    //exit (-1);
+                } 
+            }
+
+            if (k >= End_Index (i, bonds)) {
+                miscount ++;
+                fprintf (stderr, " We have a problem with the atom %d and bond entry %d \n", i, j);
+                exit (-1);
+            }
+        }
+    }
+
+    fprintf (stderr, " Total bond order matched count %d miscount %d (%d) \n", count, miscount, (count+miscount));
+
+    /*
+       for (int i = 5423; i < 5424; i++) {
+       start = Start_Index (i, bonds);
+       end = End_Index (i, bonds);
+
+       index = d_start[i];
+
+       fprintf (stderr, "Bond Count %d \n", end-start);
+       for (int j = start; j < end; j++)
+       {
+       bond_data src, tgt;
+       src = bonds->select.bond_list[j];
+       tgt = d_bond_data[index];
+       index ++;
+
+    //compare here
+    if ((src.nbr != tgt.nbr) || (src.d != tgt.d) ||
+    memcmp (src.rel_box, tgt.rel_box, IVEC_SIZE) || 
+    memcmp (src.dvec, tgt.dvec, RVEC_SIZE) ) {
+    fprintf (stderr, "Entries does not MATCH with bond data at atom %d index %d \r\n src ( %d %f (%d %d %d) (%f %f %f) )  tgt (%d %f (%d %d %d) (%f %f %f))\n",
+    i, j, 
+    src.nbr, src.d, src.rel_box[0], src.rel_box[1], src.rel_box[2], 
+    src.dvec[0], src.dvec[1], src.dvec[2],
+    tgt.nbr, tgt.d, tgt.rel_box[0], tgt.rel_box[1], tgt.rel_box[2], 
+    tgt.dvec[0], tgt.dvec[1], tgt.dvec[2] );
+    }
+    }
+    }
+     */
+
+    //fprintf (stderr, "BOND LIST match on device and host \n");
+
+    free (d_start);
+    free (d_end);
+    free (d_bond_data);
+    return true;
 }
 
 bool validate_sym_dbond_indices (reax_system *system, static_storage *workspace, list **lists)
 {
-	int start, end, index, count, miscount;
-	int *d_start, *d_end;
-	bond_data *d_bond_data;
-	list *d_bonds = dev_lists + BONDS;
-	list *bonds = *lists + BONDS;
-
-	d_end = (int *)malloc (sizeof (int) * system->N);
-	d_start = (int *) malloc (sizeof (int) * system->N );
-	d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-	//fprintf (stderr, "Num bonds copied from device to host is --> %d \n", system->num_bonds );
-
-	copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-	count = 0;
-	miscount = 0;
-	for (int i = 0; i < system->N; i++) {
-
-		for (int j = d_start[i]; j < d_end[i]; j++) {
-			bond_data *src, *tgt;
-			src = &d_bond_data[j];
-
-			tgt = &d_bond_data[ src->sym_index ];	
-
-			if ((src->dbond_index == tgt->dbond_index) )
-				count ++;
-			else 
-				miscount ++;
-		}
-	}
-	fprintf (stderr, "Sym and dbond indexes done count(device) --> %d  (%d)\n", count, miscount);
-
-	count = 0;
-	miscount = 0;
-	for (int i = 0; i < system->N; i++) {
-
-		for (int j = Start_Index (i, bonds); j < End_Index(i, bonds); j++) {
-			bond_data *src, *tgt;
-			src = &bonds->select.bond_list [j];
-
-			tgt = &bonds->select.bond_list [ src->sym_index ];	
-
-			if ((src->dbond_index == tgt->dbond_index) )
-				count ++;
-			else 
-				miscount ++;
-		}
-	}
-	fprintf (stderr, "Sym and dbond indexes done count (host) --> %d  (%d)\n", count, miscount);
-
-	free (d_start);
-	free (d_end);
-	free (d_bond_data);
-	return true;
+    // Counts, first on a host copy of the device bond list and then on the host
+    // list, the bonds whose dbond_index equals that of their sym_index partner.
+    list *d_bonds = dev_lists + BONDS;
+    list *bonds = *lists + BONDS;
+    int matched, mismatched;
+
+    int *d_end = (int *) malloc (sizeof (int) * system->N);
+    int *d_start = (int *) malloc (sizeof (int) * system->N);
+    bond_data *d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds);
+
+    // bring the device index arrays and bond entries over to the host
+    copy_host_device (d_start, d_bonds->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_end, d_bonds->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+    // pass 1: device-side bond list
+    matched = mismatched = 0;
+    for (int atom = 0; atom < system->N; ++atom)
+    {
+        for (int b = d_start[atom]; b < d_end[atom]; ++b)
+        {
+            const bond_data *bond = &d_bond_data[b];
+            const bond_data *partner = &d_bond_data[bond->sym_index];
+
+            if (bond->dbond_index != partner->dbond_index)
+                ++mismatched;
+            else
+                ++matched;
+        }
+    }
+    fprintf (stderr, "Sym and dbond indexes done count(device) --> %d  (%d)\n", matched, mismatched);
+
+    // pass 2: host-side bond list
+    matched = mismatched = 0;
+    for (int atom = 0; atom < system->N; ++atom)
+    {
+        for (int b = Start_Index (atom, bonds); b < End_Index (atom, bonds); ++b)
+        {
+            const bond_data *bond = &bonds->select.bond_list[b];
+            const bond_data *partner = &bonds->select.bond_list[bond->sym_index];
+
+            if (bond->dbond_index != partner->dbond_index)
+                ++mismatched;
+            else
+                ++matched;
+        }
+    }
+    fprintf (stderr, "Sym and dbond indexes done count (host) --> %d  (%d)\n", matched, mismatched);
+
+    // release the host staging buffers
+    free (d_start);
+    free (d_end);
+    free (d_bond_data);
+
+    return true;
 }
 
 bool analyze_hbonds (reax_system *system, static_storage *workspace, list **lists)
 {
-	int hindex, nbr_hindex;
-	int pj, hj, hb_start_j, hb_end_j, j, nbr;
-	far_neighbor_data *nbr_pj;
-
-	list *far_nbrs = *lists + FAR_NBRS;	
-	list *hbonds = *lists + HBONDS;
-	hbond_data *src, *tgt, *h_bond_data;
-	int i, k, l;
-
-	for (i = 0; i < system->N; i ++)
-		for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++)
-		{
-			// check if the neighbor is of h_type
-			nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
-			j = nbr_pj->nbr;
-
-			if (workspace->hbond_index [j] != -1)
-			{    
-				hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
-				hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
-
-				if (hb_start_j == hb_end_j) fprintf (stderr, "start == end \n");
-
-				for ( hj = hb_start_j; hj < hb_end_j; hj ++ ) 
-				{    
-					h_bond_data = &( hbonds->select.hbond_list [hj] );
-					nbr = h_bond_data->nbr;
-
-					if (nbr == i) 
-						fprintf (stderr, "found it for atom %d and neighbor %d neighbor %d \n", i, j , nbr);
-					if (Start_Index (workspace->hbond_index [nbr], hbonds) == End_Index (workspace->hbond_index [nbr], hbonds))
-						fprintf (stderr, " neighbor start == end \n");
-
-					for ( k = Start_Index (workspace->hbond_index [nbr], hbonds);
-							k < End_Index (workspace->hbond_index [nbr], hbonds);
-							k ++)  
-					{    
-						if (hbonds->select.hbond_list [k].nbr == i) { 
-							fprintf (stderr, "found it for atom %d and neighbor %d \n", i, j);
-						}    
-					}    
-				}    
-			}    
-			else fprintf (stderr, "hbond index in workspace is -1\n");
-		}
-
-
-	for (i = 0; i < system->N; i++) 
-	{
-		hindex = workspace->hbond_index [i];
-		if (hindex != -1) 
-		{
-			for (j = Start_Index ( hindex, hbonds ); j < End_Index ( hindex, hbonds ); j ++)
-			{
-				src = &hbonds->select.hbond_list [j];
-
-				nbr_hindex = workspace->hbond_index [src->nbr];
-				if (nbr_hindex == -1) {
-					fprintf (stderr, " HBonds are NOT symmetric atom %d, neighbor %d\n", i, src->nbr);
-					exit (-1);
-				}
-
-				for (k = Start_Index ( nbr_hindex, hbonds ); k < End_Index ( nbr_hindex, hbonds ); k++)
-				{
-					tgt = &hbonds->select.hbond_list [k];
-					if ((tgt->nbr == i) && (src->scl == tgt->scl)) 
-					{
-						break;
-					}
-				}
-
-				if ( k >= End_Index (nbr_hindex, hbonds)) {
-					fprintf (stderr, " Could not find the other half of the hbonds \n");
-					exit (-1);
-				}
-			}
-		}
-	}
-
-	fprintf (stderr, "HBONDS list is symmetric \n");
+    // Debug check of the host hbond list: logs far-neighbor pairs found in the
+    // list, then exits (-1) unless every entry has a mirror with the same scl.
+    int hindex, nbr_hindex, i, k;
+    int pj, hj, hb_start_j, hb_end_j, j, nbr;
+    far_neighbor_data *nbr_pj;
+    hbond_data *src, *tgt, *h_bond_data;
+    list *far_nbrs = *lists + FAR_NBRS;
+    list *hbonds = *lists + HBONDS;
+
+    for (i = 0; i < system->N; i ++)
+        for (pj = Start_Index (i, far_nbrs); pj < End_Index (i, far_nbrs); pj ++)
+        {
+            // check if the neighbor is of h_type
+            nbr_pj = &( far_nbrs->select.far_nbr_list[pj] );
+            j = nbr_pj->nbr;
+
+            if (workspace->hbond_index [j] != -1)
+            {
+                hb_start_j = Start_Index( workspace->hbond_index[j], hbonds );
+                hb_end_j   = End_Index  ( workspace->hbond_index[j], hbonds );
+
+                if (hb_start_j == hb_end_j) fprintf (stderr, "start == end \n");
+
+                for ( hj = hb_start_j; hj < hb_end_j; hj ++ )
+                {
+                    h_bond_data = &( hbonds->select.hbond_list [hj] );
+                    nbr = h_bond_data->nbr;
+                    nbr_hindex = workspace->hbond_index [nbr];
+
+                    if (nbr == i)
+                        fprintf (stderr, "found it for atom %d and neighbor %d neighbor %d \n", i, j , nbr);
+                    // -1 is not a usable list index; never hand it to Start_Index/End_Index
+                    if (nbr_hindex == -1)
+                    {
+                        fprintf (stderr, " neighbor hbond index in workspace is -1\n");
+                        continue;
+                    }
+                    if (Start_Index (nbr_hindex, hbonds) == End_Index (nbr_hindex, hbonds))
+                        fprintf (stderr, " neighbor start == end \n");
+
+                    for (k = Start_Index (nbr_hindex, hbonds); k < End_Index (nbr_hindex, hbonds); k ++)
+                    {
+                        if (hbonds->select.hbond_list [k].nbr == i)
+                            fprintf (stderr, "found it for atom %d and neighbor %d \n", i, j);
+                    }
+                }
+            }
+            else fprintf (stderr, "hbond index in workspace is -1\n");
+        }
+
+    for (i = 0; i < system->N; i++)
+    {
+        hindex = workspace->hbond_index [i];
+        if (hindex == -1) continue;
+
+        for (j = Start_Index ( hindex, hbonds ); j < End_Index ( hindex, hbonds ); j ++)
+        {
+            src = &hbonds->select.hbond_list [j];
+
+            nbr_hindex = workspace->hbond_index [src->nbr];
+            if (nbr_hindex == -1) {
+                fprintf (stderr, " HBonds are NOT symmetric atom %d, neighbor %d\n", i, src->nbr);
+                exit (-1);
+            }
+
+            for (k = Start_Index ( nbr_hindex, hbonds ); k < End_Index ( nbr_hindex, hbonds ); k++)
+            {
+                tgt = &hbonds->select.hbond_list [k];
+                if ((tgt->nbr == i) && (src->scl == tgt->scl)) break;
+            }
+
+            if ( k >= End_Index (nbr_hindex, hbonds)) {
+                fprintf (stderr, " Could not find the other half of the hbonds \n");
+                exit (-1);
+            }
+        }
+    }
+
+    fprintf (stderr, "HBONDS list is symmetric \n");
+    return true;
 }
 
 
 bool validate_hbonds (reax_system *system, static_storage *workspace, list **lists)
 {
-	int *hbond_index, count;
-	int *d_start, *d_end, index, d_index;
-	hbond_data *data, src, tgt;
-	list *d_hbonds = dev_lists + HBONDS;
-	list *hbonds = *lists + HBONDS;
-
-	hbond_index = (int *) malloc (INT_SIZE * system->N);
-	copy_host_device (hbond_index, dev_workspace->hbond_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-
-	d_end = (int *)malloc (INT_SIZE * system->N);
-	d_start = (int *) malloc (INT_SIZE * system->N );
-
-	copy_host_device (d_start, d_hbonds->index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__);
-	copy_host_device (d_end, d_hbonds->end_index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__);
-
-	//fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds);
-	data = (hbond_data *) malloc (HBOND_DATA_SIZE * system->num_hbonds);
-	copy_host_device (data, d_hbonds->select.hbond_list, HBOND_DATA_SIZE * system->num_hbonds, cudaMemcpyDeviceToHost, __LINE__);
-
-	/*
-	   Now the hbonds list is symmetric. will not work any longer
-
-	   for (int i = 0; i < system->N; i++)
-	   if (hbond_index[i] != workspace->hbond_index[i]) {
-	   fprintf (stderr, "hbond index does not match for atom %d (%d %d)\n", 
-	   i, workspace->hbond_index[i], hbond_index[i]);
-	   exit (-1);
-	   }
-
-	 */
-
-	//fprintf (stderr, "hbond_index match between host and device \n");
-
-	for (int i = 0; i < system->N; i++) {
-
-		if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 )
-		{
-			if (hbond_index[i] >= 0) {
-				if ((d_end[ hbond_index[i]] - d_start[hbond_index[i]])	 != 
-						(End_Index (workspace->hbond_index[i], hbonds) - Start_Index (workspace->hbond_index[i], hbonds))) {
-					fprintf (stderr, "%d %d - d(%d  %d) c(%d %d) \n",hbond_index[i], workspace->hbond_index[i],
-							d_start[hbond_index[i]], d_end[ hbond_index[i]], 
-							Start_Index (workspace->hbond_index[i], hbonds), 
-							End_Index (workspace->hbond_index[i], hbonds) );
-					exit (-1);
-				}
-			}
-		}
-	}
-	//fprintf (stderr, "hbonds count match between host and device \n");
-
-	count = 0;
-	for (int i = 0; i < system->N; i++) {
-
-		int d = workspace->hbond_index[i];
-		if (d == -1) continue;
-
-		d_index = hbond_index[i];
-		/*
-		   fprintf (stderr, " Count cpu %d gpu %d \n", 
-		   End_Index (workspace->hbond_index[i], hbonds) - index, 
-		   d_end[d_index] - d_start[d_index]);
-		 */
-		for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
-		{
-			tgt = data[j];
-
-			int k = 0;
-			for (k = Start_Index (workspace->hbond_index[i], hbonds); 
-					k < End_Index (workspace->hbond_index[i], hbonds); k++) {
-				src = hbonds->select.hbond_list[k];
-
-				if ((src.nbr == tgt.nbr) || (src.scl == tgt.scl)) {
-					/*
-					   fprintf (stderr, "Mismatch  at atom %d index %d (%d %d) -- (%d %d) \n", i, k,
-					   src.nbr, src.scl, 
-					   tgt.nbr, tgt.scl);
-					 */
-					count ++;
-					break;
-				}
-			}
-
-			/*
-			   if ( 	((End_Index (workspace->hbond_index[i], hbonds) - index) != index ) && 
-			   (k >= End_Index (workspace->hbond_index[i], hbonds))) {
-			   fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, d_index );
-			   exit (-1);
-			   }
-			 */
-
-			if ( k >= (End_Index (workspace->hbond_index[i], hbonds) )){
-				fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j);
-				exit (-1);
-			}
-		}
-
-		if ((End_Index (workspace->hbond_index[i], hbonds)- Start_Index(workspace->hbond_index[i], hbonds)) != (d_end[d_index] - d_start[d_index])){
-			fprintf (stderr, "End index does not match between device and host \n");
-			exit (-1);
-		}
-	}
-
-	//fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count);
-
-	free (d_start);
-	free (d_end);
-	free (data);
-	return true;
+    // Checks the device hbond list (copied to the host) against the host list:
+    // per-atom counts must agree and each device entry needs a host counterpart.
+    int *hbond_index, *d_start, *d_end, count, d_index;
+    hbond_data *data, src, tgt;
+    list *d_hbonds = dev_lists + HBONDS;
+    list *hbonds = *lists + HBONDS;
+    hbond_index = (int *) malloc (INT_SIZE * system->N);
+    copy_host_device (hbond_index, dev_workspace->hbond_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+
+    d_end = (int *)malloc (INT_SIZE * system->N);
+    d_start = (int *) malloc (INT_SIZE * system->N );
+
+    copy_host_device (d_start, d_hbonds->index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__);
+    copy_host_device (d_end, d_hbonds->end_index, INT_SIZE * dev_workspace->num_H, cudaMemcpyDeviceToHost, __LINE__);
+
+    //fprintf (stderr, "Copying hbonds to host %d \n", system->num_hbonds);
+    data = (hbond_data *) malloc (HBOND_DATA_SIZE * system->num_hbonds);
+    copy_host_device (data, d_hbonds->select.hbond_list, HBOND_DATA_SIZE * system->num_hbonds, cudaMemcpyDeviceToHost, __LINE__);
+
+    /*
+       Now the hbonds list is symmetric. will not work any longer
+
+       for (int i = 0; i < system->N; i++)
+       if (hbond_index[i] != workspace->hbond_index[i]) {
+       fprintf (stderr, "hbond index does not match for atom %d (%d %d)\n", 
+       i, workspace->hbond_index[i], hbond_index[i]);
+       exit (-1);
+       }
+
+     */
+
+    // Pass 1: per-atom hbond counts must agree between device and host.
+    for (int i = 0; i < system->N; i++) {
+
+        // only atoms with p_hbond == 1 and a non-negative device index are compared
+        if ( system->reaxprm.sbp[ system->atoms[i].type ].p_hbond == 1 )
+        {
+            if (hbond_index[i] >= 0) {
+                if ((d_end[ hbond_index[i]] - d_start[hbond_index[i]]) !=
+                        (End_Index (workspace->hbond_index[i], hbonds) - Start_Index (workspace->hbond_index[i], hbonds))) {
+                    fprintf (stderr, "%d %d - d(%d  %d) c(%d %d) \n",hbond_index[i], workspace->hbond_index[i],
+                            d_start[hbond_index[i]], d_end[ hbond_index[i]],
+                            Start_Index (workspace->hbond_index[i], hbonds),
+                            End_Index (workspace->hbond_index[i], hbonds) );
+                    exit (-1);
+                }
+            }
+        }
+    }
+    //fprintf (stderr, "hbonds count match between host and device \n");
+
+    // Pass 2: every device entry must have a counterpart in the host list.
+    count = 0;
+    for (int i = 0; i < system->N; i++) {
+
+        int d = workspace->hbond_index[i];
+        if (d == -1) continue;
+
+        d_index = hbond_index[i];
+        if (d_index < 0) {
+            // the host has an hbond list for this atom but the device does not;
+            // d_start[d_index] below would be read out of bounds
+            fprintf (stderr, "hbond index does not match for atom %d (%d %d)\n",
+                    i, d, d_index);
+            exit (-1);
+        }
+
+        // look up every device entry of this atom in the host list
+        for (int j = d_start[d_index]; j < d_end[d_index]; j++ )
+        {
+            tgt = data[j];
+
+            int k = 0;
+            for (k = Start_Index (d, hbonds); k < End_Index (d, hbonds); k++) {
+                src = hbonds->select.hbond_list[k];
+
+                // NOTE(review): a match on nbr OR scl alone is accepted here;
+                // pairing entries would presumably need both (&&) -- confirm
+                // against the host/device scl conventions before tightening.
+                if ((src.nbr == tgt.nbr) || (src.scl == tgt.scl)) {
+                    count ++;
+                    break;
+                }
+            }
+
+            // the inner loop ran to completion: no host entry matched
+            if ( k >= End_Index (d, hbonds) ){
+                fprintf (stderr, "Hbonds does not match for atom %d hbond_Index %d \n", i, j);
+                exit (-1);
+            }
+        }
+
+        // the per-atom totals must agree as well
+        if ((End_Index (d, hbonds) - Start_Index (d, hbonds)) !=
+                (d_end[d_index] - d_start[d_index])) {
+            fprintf (stderr, "End index does not match between device and host \n");
+            exit (-1);
+        }
+    }
+
+    //fprintf (stderr, "HBONDs match on device and Host count --> %d\n", count);
+
+    // release the host staging buffers, including hbond_index
+    free (hbond_index);
+    free (d_start);
+    free (d_end);
+    free (data);
+
+    return true;
 }
 
 bool validate_neighbors (reax_system *system, list **lists)
 {
-	list *far_nbrs = *lists + FAR_NBRS;
-	list *d_nbrs = dev_lists + FAR_NBRS;
-	far_neighbor_data gpu, cpu;
-	int index, count, jicount;
-
-	int *end = (int *)malloc (sizeof (int) * system->N);
-	int *start = (int *) malloc (sizeof (int) * system->N );
-
-	//fprintf (stderr, "numnbrs %d \n", system->num_nbrs);
-
-	copy_host_device (start, d_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 1);
-	copy_host_device (end, d_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 2);
-
-	far_neighbor_data *data = (far_neighbor_data *) malloc (FAR_NEIGHBOR_SIZE * system->num_nbrs);
-	copy_host_device (data, d_nbrs->select.far_nbr_list, FAR_NEIGHBOR_SIZE * system->num_nbrs, cudaMemcpyDeviceToHost, 3);
-
-	int cpu_count = 0;
-	int gpu_count = 0;
-
-	for (int i = 0; i < system->N; i++){
-		cpu_count += Num_Entries (i, far_nbrs);
-		gpu_count += end[i] - start[i];
-	}
-
-	//fprintf (stderr, " Nbrs count cpu: %d -- gpu: %d \n", cpu_count, gpu_count );
-	for (int i = 0; i < system->N-1; i++){
-		if (end [i] > start [i+1])
-		{
-			fprintf (stderr, " Far Neighbors index over write  @ index %d\n", i);
-			exit (-1);
-		}
-	}
-
-
-
-	for (int i = 0; i < system->N; i++){
-		index = Start_Index (i, far_nbrs);
-
-		for (int j = start[i]; j < end[i]; j++){
-			gpu = data[j];
-
-			if (i < data[j].nbr) {
-				int src = data[j].nbr;
-				int dest = i;
-				int x;
-
-
-				for (x = start[src]; x < end[src]; x++) {
-					if (data[x].nbr != dest) continue;
-
-					gpu = data[x];
-					cpu = data[j];
-
-					if (  (gpu.d != cpu.d) ||
-							(cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-							(cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-						fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, 
-								data[j].d, 
-								data[j].rel_box[0],
-								data[j].rel_box[1],
-								data[j].rel_box[2],
-								data[j].dvec[0], 
-								data[j].dvec[1], 
-								data[j].dvec[2] 
-							);
-						fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr,
-								data[x].d,
-								data[x].rel_box[0],
-								data[x].rel_box[1],
-								data[x].rel_box[2],
-								data[x].dvec[0],
-								data[x].dvec[1],
-								data[x].dvec[2]
-							);
-						jicount++;
-
-						fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n");
-						exit (-1);
-					}
-					break;
-				}
-
-				if (x >= end[src]) {
-					fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
-					exit (-1);
-				}
-
-				continue;
-			}
-
-			cpu = far_nbrs->select.far_nbr_list[index];
-			//if (  (gpu.d != cpu.d) || (gpu.nbr != cpu.nbr) ||
-			//     (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
-			//    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
-			//if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE)) {
-			if (  check_zero (gpu.d, cpu.d) || 
-					(gpu.nbr != cpu.nbr) ||
-					check_zero (cpu.dvec, gpu.dvec) || 
-					!check_same (cpu.rel_box, gpu.rel_box)) {
-
-				fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j );
-				fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index);
-				fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
-				fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
-				fprintf (stdout, "d %f ,  %f \n", cpu.d, data[j].d);
-				fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", 
-						cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
-						gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
-
-				fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n", 
-						cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
-						gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
-
-				fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host  **** \n");
-				exit (-1);
-				count ++;
-			}
-			index ++;
-		}    
-
-		if (index != End_Index (i, far_nbrs))
-		{    
-			fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs),
-					start[i], end[i]);
-			exit (10);
-		}    
-		}
-
-		//fprintf (stderr, "FAR Neighbors match between device and host \n");
-		free (start);
-		free (end);
-		free (data);
-		return true;
-		}
-
-		bool validate_workspace (reax_system *system, static_storage *workspace, list **lists) 
-		{
-			real *total_bond_order;
-			int count, tcount;
-
-			total_bond_order = (real *) malloc ( system->N * REAL_SIZE );
-			copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < system->N; i++) {
-
-				//if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){
-				if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){
-					fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n",
-							i, workspace->total_bond_order[i], total_bond_order[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			free (total_bond_order);
-			//fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count);
-
-
-			rvec *dDeltap_self;
-			dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE);
-			copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < system->N; i++ )
-			{
-				if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i]))
-				{
-					fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, 
-							workspace->dDeltap_self[i][0],
-							workspace->dDeltap_self[i][1],
-							workspace->dDeltap_self[i][2],
-							dDeltap_self[3*i+0],
-							dDeltap_self[3*i+1],
-							dDeltap_self[3*i+2] );
-					exit (-1);
-					count ++;
-				}
-			}
-			free (dDeltap_self);
-			//fprintf (stderr, "dDeltap_self mismatch count %d\n", count);
-
-			//exit for init_forces
-
-			real *test;
-			test = (real *) malloc (system->N * REAL_SIZE);
-
-			copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ )
-			{
-				if (check_zero (workspace->Deltap[i], test[i]))
-				{
-					fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "Deltap mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ )
-			{
-				if (check_zero (workspace->Deltap_boc[i], test[i]))
-				{
-					fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "dDeltap_boc mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->Delta[i], test[i])) {
-					fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "Delta mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->Delta_e[i], test[i])) {
-					fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "Delta_e mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->vlpex[i], test[i])) {
-					fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "vlpex mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->nlp[i], test[i])) {
-					fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "nlp mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->Delta_lp[i], test[i])) {
-					fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "Delta_lp mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->Clp[i], test[i])) {
-					fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "Clp mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->dDelta_lp[i], test[i])) {
-					fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "dDelta_lp mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->nlp_temp[i], test[i])) {
-					fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "nlp_temp mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->Delta_lp_temp[i], test[i])) {
-					fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count);
-
-			copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->dDelta_lp_temp[i], test[i])) {
-					fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count);
-
-			//exit for Bond order calculations
-
-
-			copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->CdDelta[i], test[i])) {
-					fprintf (stderr, " CdDelta does NOT match (%f %f) for atom  %d \n", workspace->CdDelta[i], test[i], i);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "CdDelta mismatch count %d\n", count);
-			//exit for Bond Energy calculations
-
-			/*
-			   copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-			   count = 0;
-			   for (int i = 0; i < system->N; i++ ) {
-			   if (check_zero (workspace->droptol[i], test[i])) {
-			   fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]);
-			   exit (-1);
-			   count ++;
-			   }
-			   }
-			//fprintf (stderr, "droptol mismatch count %d\n", count);
-			 */
-
-
-			//exit for  QEa calculations
-			/*
-			   real *t_s;
-
-			   t_s = (real *) malloc (REAL_SIZE * (system->N * 2) );
-			   copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__);
-
-			   count = 0;
-			   for (int i = 0; i < (system->N * 2); i++ ) {
-			   if (check_zero (workspace->b_prm[i], t_s[i])) {
-			   fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]);
-			   exit (-1);
-			   count ++;
-			   }
-			   }
-			//fprintf (stderr, "b_prm mismatch count %d\n", count);
-
-			t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
-			copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < 5*system->N; i++ ) {
-			if (check_zero (workspace->s[i], t_s[i])) {
-			//fprintf (stderr, " (%f %f)  @ index %d \n", workspace->s[i], t_s[i], i);
-			count ++;
-			}
-			}
-			fprintf (stderr, "s mismatch count %d\n", count);
-
-
-			t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
-			copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < 5*system->N; i++ ) {
-			if (check_zero (workspace->t[i], t_s[i])) {
-			//fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i);
-			count ++;
-			}
-			}
-			fprintf (stderr, "t mismatch count %d\n", count);
-
-
-			t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N);
-			copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < (RESTART + 1)*system->N; i++ ) {
-			if (check_zero (workspace->v[i], t_s[i])) {
-			//fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i);
-			count ++;
-			}
-			}
-			fprintf (stderr, "v mismatch count %d\n", count);
-
-			t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-			copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < (RESTART + 1); i++ ) {
-			if (check_zero (workspace->y[i], t_s[i])) {
-			//fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]);
-			count ++;
-			}
-			}
-			fprintf (stderr, "y mismatch count %d\n", count);
-
-			t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-			copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < (RESTART + 1); i++ ) {
-			if (check_zero (workspace->hc[i], t_s[i])) {
-				//fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]);
-				count ++;
-			}
-			}
-			fprintf (stderr, "hc mismatch count %d\n", count);
-
-			t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-			copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < (RESTART + 1); i++ ) {
-				if (check_zero (workspace->hs[i], t_s[i])) {
-					//fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]);
-					count ++;
-				}
-			}
-			fprintf (stderr, "hs mismatch count %d\n", count);
-
-			t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) );
-			copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) {
-				if (check_zero (workspace->h[i], t_s[i])) {
-					//fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]);
-					count ++;
-				}
-			}
-			fprintf (stderr, "h mismatch count %d\n", count);
-
-			t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
-			copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < (RESTART + 1); i++ ) {
-				if (check_zero (workspace->g[i], t_s[i])) {
-					//fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i);
-					count ++;
-				}
-			}
-			fprintf (stderr, "g mismatch count %d\n", count);
-			*/
-
-				rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N );
-			copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N,  cudaMemcpyDeviceToHost, __LINE__);
-
-			count = 0;
-			for (int i = 0; i < system->N; i++ ) {
-				if (check_zero (workspace->v_const[i], r_s[i])) {
-					fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", 
-							workspace->v_const[i][0], 
-							workspace->v_const[i][1], 
-							workspace->v_const[i][2], 
-							r_s[i][0], 
-							r_s[i][1], 
-							r_s[i][2], 
-							i);
-					exit (-1);
-					count ++;
-				}
-			}
-			//fprintf (stderr, "v_const mismatch count %d\n", count);
-
-			free (test);
-			free (r_s);
-			return true;
-			}
-
-			bool validate_data (reax_system *system, simulation_data *host)
-			{
-				simulation_data device;
-
-				copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__);
-
-				if (check_zero (host->E_BE, device.E_BE)){
-					fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Lp, device.E_Lp)){
-					fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Ov, device.E_Ov)){
-					fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Un, device.E_Un)){
-					fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Tor, device.E_Tor)) {
-					fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Con, device.E_Con)) {
-					fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con);
-					exit (-1);
-				}
-
-				if (check_zero (host->ext_press, device.ext_press)) {
-					fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_HB, device.E_HB)) {
-					fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Ang, device.E_Ang)) {
-					fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Pen, device.E_Pen)) {
-					fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Coa, device.E_Coa)) {
-					fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_vdW, device.E_vdW)) {
-					fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Ele, device.E_Ele)) {
-					fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele);
-					exit (-1);
-				}
-
-				if (check_zero (host->E_Pol, device.E_Pol)) {
-					fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol);
-					exit (-1);
-				}
-
-
-				//fprintf (stderr, "Simulation Data match between host and device \n");
-				return true;
-			}
-
-			void print_bond_data (bond_order_data *s)
-			{
-				/*
-				   fprintf (stderr, "Bond_Order_Data BO (%f ) BO_s (%f ) BO_pi (%f ) BO_pi2 (%f ) ", 
-				   s->BO, 
-				   s->BO_s, 
-				   s->BO_pi,
-				   s->BO_pi2 );
-				 */
-				fprintf (stderr, " Cdbo (%e) ", s->Cdbo );
-				fprintf (stderr, " Cdbopi (%e) ", s->Cdbopi );
-				fprintf (stderr, " Cdbopi2 (%e) ", s->Cdbopi2 );
-			}
-
-			void print_bond_list (reax_system *system, static_storage *workspace, list **lists)
-			{
-				list *bonds = *lists + BONDS;
-
-				for (int i = 1; i < 2; i++)
-				{
-					fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
-					for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) 
-					{
-						bond_data *data = &bonds->select.bond_list [j];
-						fprintf (stderr, "  %d, ", data->nbr );
-						print_bond_data (&data->bo_data);
-						fprintf (stderr, ")\n");
-					}
-				}
-
-				int *b_start = (int *) malloc (INT_SIZE * system->N);
-				int *b_end = (int *) malloc (INT_SIZE * system->N);
-				list *d_bonds = dev_lists + BONDS;
-				bond_data *d_bond_data;
-
-				d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-				copy_host_device ( b_start, d_bonds->index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( b_end, d_bonds->end_index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				for (int i = 0; i < 2; i++)
-				{
-					fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
-					for (int j = b_start[i]; j < b_end[i]; j ++) {
-						bond_data *src = &d_bond_data[j];
-						fprintf (stderr, "  %d, ", src->nbr );
-						print_bond_data (&src->bo_data);
-						fprintf (stderr, ")\n");
-					}
-				}
-			}
-
-
-
-			void count_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-			{
-				list *three = *lists + THREE_BODIES;
-				list *bonds = *lists + BONDS;
-
-				list *d_three = dev_lists + THREE_BODIES;
-				list *d_bonds = dev_lists + BONDS;
-				bond_data *d_bond_data;
-				real *test;
-
-				three_body_interaction_data *data = (three_body_interaction_data *) 
-					malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-				int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-				int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-
-				int *b_start = (int *) malloc (INT_SIZE * system->N);
-				int *b_end = (int *) malloc (INT_SIZE * system->N);
-				int count;
-				int hcount, dcount;
-
-				copy_host_device ( start, d_three->index, 
-						INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( end, d_three->end_index, 
-						INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( data, d_three->select.three_body_list, 
-						sizeof (three_body_interaction_data) * system->num_thbodies, 
-						cudaMemcpyDeviceToHost, __LINE__);
-
-				d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-				copy_host_device ( b_start, d_bonds->index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( b_end, d_bonds->end_index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-				count = 0;
-				hcount = dcount = 0;
-				for (int i = 0; i < system->N; i++)
-				{
-					for (int j = b_start[i]; j < b_end[i]; j ++) {
-						dcount += end[j] - start[j];
-					}
-				}
-
-				fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount);
-
-				free (data);
-				free (start);
-				free (end);
-				free (b_start);
-				free (b_end);
-				free (d_bond_data);
-			}
-
-
-
-			bool validate_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-			{
-				list *three = *lists + THREE_BODIES;
-				list *bonds = *lists + BONDS;
-
-				list *d_three = dev_lists + THREE_BODIES;
-				list *d_bonds = dev_lists + BONDS;
-				bond_data *d_bond_data;
-				real *test;
-
-				three_body_interaction_data *data = (three_body_interaction_data *) 
-					malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-				int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-				int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-
-				int *b_start = (int *) malloc (INT_SIZE * system->N);
-				int *b_end = (int *) malloc (INT_SIZE * system->N);
-				int count;
-				int hcount, dcount;
-
-
-
-				copy_host_device ( start, d_three->index, 
-						INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( end, d_three->end_index, 
-						INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( data, d_three->select.three_body_list, 
-						sizeof (three_body_interaction_data) * system->num_thbodies, 
-						cudaMemcpyDeviceToHost, __LINE__);
-
-				d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-				copy_host_device ( b_start, d_bonds->index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( b_end, d_bonds->end_index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-				//test = (real *) malloc (REAL_SIZE * system->num_bonds);
-				//memset (test, 0, REAL_SIZE * system->num_bonds);
-				//copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-				count = 0;
-				for (int i = 0; i < system->N; i++)
-				{
-					//for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++)
-
-					hcount = dcount = 0;
-					for (int j = b_start[i]; j < b_end[i]; j ++) {
-						dcount += end[j] - start[j];
-						hcount += Num_Entries (j, three);
-
-						/*
-						   if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three)))
-						   {
-						   fprintf (stderr, " Three body count does not match between host and device\n");
-						   fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three));
-						   fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]);
-						   }
-						 */
-					}
-
-
-					if ((dcount != hcount)) {
-
-						fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); 
-
-						for (int j = b_start[i]; j < b_end[i]; j ++) {
-							bond_order_data *src = &d_bond_data[j].bo_data;
-							dcount = end[j] - start[j];
-							hcount = Num_Entries (j, three);
-							fprintf (stderr, "device \n");
-							print_bond_data (src);
-
-							fprintf (stderr, "\n");
-							src = &bonds->select.bond_list[j].bo_data;
-							fprintf (stderr, "host \n");
-							print_bond_data (src);
-							fprintf (stderr, "\n");
-
-							//fprintf (stderr, "--- Device bo is %f \n", test[j]);
-							fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],  
-									Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
-							fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
-							fprintf (stderr, "------\n");
-						}
-						fprintf (stderr, " Three Bodies count does not match between host and device \n");
-						exit (-1);
-					}
-				}
-
-				//fprintf (stderr, "Three body count on DEVICE %d  HOST %d \n", dcount, hcount);
-
-				count = 0;
-				for (int i = 0; i < system->N; i++)
-				{
-					int x, y, z;
-					for (x = b_start[i]; x < b_end[i]; x++)
-					{
-						int t_start = start[x];
-						int t_end = end[x];
-
-						bond_data *dev_bond = &d_bond_data [x];
-						bond_data *host_bond;
-						for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
-						{
-							host_bond = &bonds->select.bond_list [z];
-							if ((dev_bond->nbr == host_bond->nbr) &&
-									check_same (dev_bond->rel_box, host_bond->rel_box) && 
-									!check_zero (dev_bond->dvec, host_bond->dvec) &&
-									!check_zero (dev_bond->d, host_bond->d) )
-							{
-								break;
-							}
-						}
-						if (z >= End_Index (i, bonds)){
-							fprintf (stderr, "Could not find the matching bond on host and device \n");
-							exit (-1);
-						}
-
-						//find this bond in the bonds on the host side.
-
-						for (y = t_start; y < t_end; y++)
-						{
-
-							three_body_interaction_data *device = data + y;
-							three_body_interaction_data *host;
-
-							//fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
-
-							int xx;	
-							for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
-							{
-								host = &three->select.three_body_list [xx];
-								//fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
-								//if ((host->thb == device->thb) && (host->pthb == device->pthb))
-								if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
-								{
-									count ++;
-									break;
-								}
-							}
-
-							if ( xx >= End_Index (z, three) ) {
-								fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, 
-										Start_Index (z, three), End_Index (z, three), start[x], end[x] );
-								exit (-1);
-							}// else fprintf (stderr, "----------------- \n");
-						}
-					}
-				}
-				free (data);
-				free (start);
-				free (end);
-				free (b_start);
-				free (b_end);
-				free (d_bond_data);
-
-				//fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
-				return true;
-			}
-
-			bool bin_three_bodies (reax_system *system, static_storage *workspace, list **lists)
-			{
-				list *d_three = dev_lists + THREE_BODIES;
-				list *d_bonds = dev_lists + BONDS;
-				list *three = *lists + THREE_BODIES;
-				list *bonds = *lists + BONDS;
-				bond_data *d_bond_data;
-
-				three_body_interaction_data *data = (three_body_interaction_data *) 
-					malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
-				int *start = (int *) malloc (INT_SIZE * system->num_bonds);
-				int *end = (int *) malloc (INT_SIZE * system->num_bonds);
-
-				int *b_start = (int *) malloc (INT_SIZE * system->N);
-				int *b_end = (int *) malloc (INT_SIZE * system->N);
-
-				int *a = (int *) malloc (2 * INT_SIZE * system->N );
-				int *b = (int *) malloc (2 * INT_SIZE * system->N );
-				int *c = (int *) malloc (2 * INT_SIZE * system->N );
-				int *d = (int *) malloc (2 * INT_SIZE * system->N );
-
-				for (int i = 0; i < 2 * system->N; i++)
-					a[i] = b[i] = c[i] = d[i] = -1;
-
-				int count;
-				int hcount, dcount;
-				int index_a, index_b, index_c, index_d;
-				index_a = index_b = index_c = index_d = 0;
-
-				copy_host_device ( start, d_three->index, 
-						INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( end, d_three->end_index, 
-						INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( data, d_three->select.three_body_list, 
-						sizeof (three_body_interaction_data) * system->num_thbodies, 
-						cudaMemcpyDeviceToHost, __LINE__);
-
-				d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
-
-				copy_host_device ( b_start, d_bonds->index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device ( b_end, d_bonds->end_index, 
-						INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
-				copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
-
-				count = 0;
-				hcount = dcount = 0;
-
-				/*
-				   for (int i = 0; i < 20; i++)
-				   {
-				   for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
-				   {
-				   for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++)
-				   {
-				   three_body_interaction_data *host = &three->select.three_body_list [k];
-				   fprintf (stderr, " atom %d bond (%d %d) -- %d,  (%d %d)\n", 
-				   i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb );
-
-				   }
-				   }
-				   }
-				   exit (-1);
-				 */
-
-				count = 0;
-				for (int i = 0; i < system->N; i++)
-				{
-					for (int j = b_start[i]; j < b_end[i]; j ++) {
-
-						/*
-						   bond_data *src;
-						   src = &d_bond_data[j];
-						   fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr );
-						 */
-
-						for (int x = start[j]; x < end[j]; x ++)
-						{
-							three_body_interaction_data *device = data + x;
-
-							int center = device->j;
-							int d_i = device->i;
-							int d_k = device->k;
-
-
-							//fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n", 
-							//i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, device->pthb);
-
-							if ((a[system->N + center] != -1)) {
-								a[d_i] = a[d_k] = 1;
-								continue;
-							} else if ((b[system->N + center] != -1)) {
-								b[d_i] = b[d_k] = 1;
-								continue;
-							} else if ((c[system->N + center] != -1)) {
-								c[d_i] = c[d_k] = 1;
-								continue;
-							} else if ((d[system->N + center] != -1)) {
-								d[d_i] = d[d_k] = 1;
-								continue;
-							}
-
-							if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) {
-								a[center] = a[d_i] = a[d_k] = 1;
-								a[system->N + center] = 1;
-							} else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) {
-								b[center] =  b[d_i] = b[d_k] = 1;
-								b[system->N + center] = 1;
-							} else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) {
-								c[center] =  c[d_i] = c[d_k] = 1;
-								c[system->N + center] = 1;
-							} else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) {
-								d[center] =  d[d_i] = d[d_k] = 1;
-								d[system->N + center]= 1;
-							}
-							else {
-								count ++;
-								break;
-								fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n", 
-										i, b_start[i], b_end[i], j, center, d_i, d_k);
-								fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n", 
-										a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]);
-								fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n", 
-										b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]);
-								fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n", 
-										c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]);
-								fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n", 
-										d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]);
-
-							}
-						}
-					}
-				}
-				fprintf (stderr, "Miscount is %d \n", count);
-				exit (-1);
-
-				count = 0;
-				for (int i = 0; i < system->N; i++)
-				{
-					if (a[system->N + i] != -1) count ++;
-					if (b[system->N + i] != -1) count ++;
-					if (c[system->N + i] != -1) count ++;
-					if (d[system->N + i] != -1) count ++;
-				}
-
-				fprintf (stderr, "binned so many atoms --> %d \n", count );
-			}
-
-			bool validate_grid (reax_system *system)
-			{
-				int total = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2];
-				int count = 0;
-
-				int *dtop = (int *) malloc (INT_SIZE * total );
-				copy_host_device (dtop, system->d_g.top, INT_SIZE * total, cudaMemcpyDeviceToHost, __LINE__);
-
-				for (int i = 0; i < total; i++){
-					if (system->g.top[i] != dtop[i]){
-						fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[i], dtop[i], i );
-						exit (-1);
-					}
-				}
-				free (dtop);
-
-				int *datoms = (int *) malloc (INT_SIZE * total * system->d_g.max_atoms);
-				copy_host_device (datoms, system->d_g.atoms, INT_SIZE * total * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__);
-				for (int i = 0; i < total*system->d_g.max_atoms; i++){
-					if (system->g.atoms[i] != datoms[i]){
-						fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[i], datoms[i], i );
-						exit (-1);
-					}
-				}
-				free (datoms);
-
-				ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * total * system->d_g.max_nbrs);
-				copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
-				for (int i = 0; i < total*system->d_g.max_nbrs; i++){
-					if (!check_same (system->g.nbrs[i], dnbrs[i])){
-						fprintf (stderr, " nbrs count does not match @ index %d \n", i );
-						exit (-1);
-					}
-				}
-				free (dnbrs);
-
-				rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * total * system->d_g.max_nbrs);
-				copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * total * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
-				for (int i = 0; i < total*system->d_g.max_nbrs; i++){
-					if (check_zero (system->g.nbrs_cp[i], dnbrs_cp[i])){
-						fprintf (stderr, " nbrs_cp count does not match @ index %d \n", i );
-						exit (-1);
-					}
-				}
-				free (dnbrs_cp);
-
-				//fprintf (stderr, " Grid match between device and host \n");
-				return true;
-			}
-
-			void print_atoms (reax_system *system)
-			{
-				int start, end, index;
-
-				reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
-				copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
-
-				//for (int i = 0; i < system->N; i++) 
-				for (int i = 0; i < 10; i++) 
-				{
-					fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type);
-					fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] );
-					fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] );
-					fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] );
-					fprintf (stderr, " q(%6.10f) \n", test[i].q );
-				}
-			}
-
-			void print_sys_atoms (reax_system *system)
-			{
-				for (int i = 0; i < 10; i++) 
-				{
-					fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type);
-					fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );
-					fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] );
-					fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
-					fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q );
-				}
-			}
-
-
-			void print_grid (reax_system *system)
-			{
-				int i, j, k, x;
-				grid *g = &system->g;
-
-				for( i = 0; i < g->ncell[0]; i++ )
-					for( j = 0; j < g->ncell[1]; j++ )
-						for( k = 0; k < g->ncell[2]; k++ ){
-							fprintf (stderr, "Cell [%d,%d,%d]--(", i, j, k);
-							for (x = 0; x < g->top[index_grid_3d (i,j,k,g) ]; x++){
-								fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (i,j,k,x,g) ]);
-							}
-							fprintf (stderr, ")\n");
-						}
-			}
+    list *far_nbrs = *lists + FAR_NBRS;
+    list *d_nbrs = dev_lists + FAR_NBRS;
+    far_neighbor_data gpu, cpu;
+    int index, count, jicount;
+
+    int *end = (int *)malloc (sizeof (int) * system->N);
+    int *start = (int *) malloc (sizeof (int) * system->N );
+
+    //fprintf (stderr, "numnbrs %d \n", system->num_nbrs);
+
+    copy_host_device (start, d_nbrs->index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 1);
+    copy_host_device (end, d_nbrs->end_index, INT_SIZE * system->N, cudaMemcpyDeviceToHost, 2);
+
+    far_neighbor_data *data = (far_neighbor_data *) malloc (FAR_NEIGHBOR_SIZE * system->num_nbrs);
+    copy_host_device (data, d_nbrs->select.far_nbr_list, FAR_NEIGHBOR_SIZE * system->num_nbrs, cudaMemcpyDeviceToHost, 3);
+
+    int cpu_count = 0;
+    int gpu_count = 0;
+
+    for (int i = 0; i < system->N; i++){
+        cpu_count += Num_Entries (i, far_nbrs);
+        gpu_count += end[i] - start[i];
+    }
+
+    //fprintf (stderr, " Nbrs count cpu: %d -- gpu: %d \n", cpu_count, gpu_count );
+    for (int i = 0; i < system->N-1; i++){
+        if (end [i] > start [i+1])
+        {
+            fprintf (stderr, " Far Neighbors index over write  @ index %d\n", i);
+            exit (-1);
+        }
+    }
+
+
+
+    for (int i = 0; i < system->N; i++){
+        index = Start_Index (i, far_nbrs);
+
+        for (int j = start[i]; j < end[i]; j++){
+            gpu = data[j];
+
+            if (i < data[j].nbr) {
+                int src = data[j].nbr;
+                int dest = i;
+                int x;
+
+
+                for (x = start[src]; x < end[src]; x++) {
+                    if (data[x].nbr != dest) continue;
+
+                    gpu = data[x];
+                    cpu = data[j];
+
+                    if (  (gpu.d != cpu.d) ||
+                            (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+                            (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+                        fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", i, data[j].nbr, 
+                                data[j].d, 
+                                data[j].rel_box[0],
+                                data[j].rel_box[1],
+                                data[j].rel_box[2],
+                                data[j].dvec[0], 
+                                data[j].dvec[1], 
+                                data[j].dvec[2] 
+                            );
+                        fprintf (stderr, " atom %d neighbor %d  (%f, %d, %d, %d - %f %f %f) \n", data[j].nbr, data[x].nbr,
+                                data[x].d,
+                                data[x].rel_box[0],
+                                data[x].rel_box[1],
+                                data[x].rel_box[2],
+                                data[x].dvec[0],
+                                data[x].dvec[1],
+                                data[x].dvec[2]
+                            );
+                        jicount++;
+
+                        fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host \n");
+                        exit (-1);
+                    }
+                    break;
+                }
+
+                if (x >= end[src]) {
+                    fprintf (stderr, "could not find the neighbor duplicate data for ij (%d %d)\n", i, src );
+                    exit (-1);
+                }
+
+                continue;
+            }
+
+            cpu = far_nbrs->select.far_nbr_list[index];
+            //if (  (gpu.d != cpu.d) || (gpu.nbr != cpu.nbr) ||
+            //     (cpu.dvec[0] != gpu.dvec[0]) || (cpu.dvec[1] != gpu.dvec[1]) || (cpu.dvec[2] != gpu.dvec[2]) ||
+            //    (cpu.rel_box[0] != gpu.rel_box[0]) || (cpu.rel_box[1] != gpu.rel_box[1]) || (cpu.rel_box[2] != gpu.rel_box[2])) {
+            //if (memcmp (&gpu, &cpu, FAR_NEIGHBOR_SIZE)) {
+            if (  check_zero (gpu.d, cpu.d) || 
+                    (gpu.nbr != cpu.nbr) ||
+                    check_zero (cpu.dvec, gpu.dvec) || 
+                    !check_same (cpu.rel_box, gpu.rel_box)) {
+
+                fprintf (stderr, "GPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, start[i], end[i], j );
+                fprintf (stderr, "CPU:atom --> %d (s: %d , e: %d, i: %d )\n", i, Start_Index(i, far_nbrs), End_Index (i, far_nbrs), index);
+                fprintf (stdout, "Far neighbors does not match atom: %d \n", i );
+                fprintf (stdout, "neighbor %d ,  %d \n",  cpu.nbr, gpu.nbr);
+                fprintf (stdout, "d %f ,  %f \n", cpu.d, data[j].d);
+                fprintf (stdout, "dvec (%f %f %f) (%f %f %f) \n", 
+                        cpu.dvec[0], cpu.dvec[1], cpu.dvec[2],
+                        gpu.dvec[0], gpu.dvec[1], gpu.dvec[2] );
+
+                fprintf (stdout, "rel_box (%d %d %d) (%d %d %d) \n", 
+                        cpu.rel_box[0], cpu.rel_box[1], cpu.rel_box[2],
+                        gpu.rel_box[0], gpu.rel_box[1], gpu.rel_box[2] );
+
+                fprintf (stderr, " Far Neighbors DOES NOT match between Deivce and Host  **** \n");
+                exit (-1);
+                count ++;
+            }
+            index ++;
+        }    
+
+        if (index != End_Index (i, far_nbrs))
+        {    
+            fprintf (stderr, "End index does not match for atom --> %d end index (%d) Cpu (%d, %d ) gpu (%d, %d)\n", i, index, Start_Index (i, far_nbrs), End_Index(i, far_nbrs),
+                    start[i], end[i]);
+            exit (10);
+        }    
+        }
+
+        //fprintf (stderr, "FAR Neighbors match between device and host \n");
+        free (start);
+        free (end);
+        free (data);
+        return true;
+        }
+
+        bool validate_workspace (reax_system *system, static_storage *workspace, list **lists) // host-side check of the device workspace arrays against the host workspace
+        { // a mismatch is reported on stderr and ends the process via exit(-1); returns true otherwise. 'lists' is not read here.
+            real *total_bond_order;
+            int count, tcount; // tcount is unused; every count++ below sits after exit() and therefore never runs
+            // total_bond_order: system->N reals copied back from dev_workspace (declared outside this function)
+            total_bond_order = (real *) malloc ( system->N * REAL_SIZE );
+            copy_host_device (total_bond_order, dev_workspace->total_bond_order, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            // check_zero() returning true is treated as "host and device differ" in every section below
+            count = 0;
+            for (int i = 0; i < system->N; i++) {
+
+                //if (abs (workspace->total_bond_order[i] - total_bond_order[i]) >= GPU_TOLERANCE){
+                if ( check_zero (workspace->total_bond_order[i], total_bond_order[i])){
+                    fprintf (stderr, "Total bond order does not match for atom %d (%4.15e %4.15e)\n",
+                            i, workspace->total_bond_order[i], total_bond_order[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            free (total_bond_order);
+            //fprintf (stderr, "TOTAL Bond Order mismatch count %d\n", count);
+
+            // dDeltap_self: system->N rvec entries, compared element-wise per atom
+            rvec *dDeltap_self;
+            dDeltap_self = (rvec *) calloc (system->N, RVEC_SIZE);
+            copy_host_device (dDeltap_self, dev_workspace->dDeltap_self, system->N * RVEC_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < system->N; i++ )
+            {
+                if (check_zero (workspace->dDeltap_self[i], dDeltap_self[i]))
+                {
+                    fprintf (stderr, "index: %d c (%f %f %f) g (%f %f %f )\n", i, 
+                            workspace->dDeltap_self[i][0],
+                            workspace->dDeltap_self[i][1],
+                            workspace->dDeltap_self[i][2],
+                            dDeltap_self[i][0], // device copy is an rvec array: index by atom, then component
+                            dDeltap_self[i][1], // (the former [3*i+k] passed a whole rvec to %f and ran past the buffer)
+                            dDeltap_self[i][2] );
+                    exit (-1);
+                    count ++;
+                }
+            }
+            free (dDeltap_self);
+            //fprintf (stderr, "dDeltap_self mismatch count %d\n", count);
+
+            //exit for init_forces
+            // 'test' is a scratch buffer of system->N reals, reused for each per-atom real array that follows
+            real *test;
+            test = (real *) malloc (system->N * REAL_SIZE);
+
+            copy_host_device (test, dev_workspace->Deltap, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ )
+            {
+                if (check_zero (workspace->Deltap[i], test[i]))
+                {
+                    fprintf (stderr, "Deltap: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "Deltap mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->Deltap_boc, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ )
+            {
+                if (check_zero (workspace->Deltap_boc[i], test[i]))
+                {
+                    fprintf (stderr, "Deltap_boc: Mismatch index --> %d (%f %f) \n", i, workspace->Deltap_boc[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "dDeltap_boc mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->Delta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->Delta[i], test[i])) {
+                    fprintf (stderr, "Delta: Mismatch index --> %d (%f %f) \n", i, workspace->Delta[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "Delta mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->Delta_e, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->Delta_e[i], test[i])) {
+                    fprintf (stderr, "Delta_e: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_e[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "Delta_e mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->vlpex, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->vlpex[i], test[i])) {
+                    fprintf (stderr, "vlpex: Mismatch index --> %d (%f %f) \n", i, workspace->vlpex[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "vlpex mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->nlp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->nlp[i], test[i])) {
+                    fprintf (stderr, "nlp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "nlp mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->Delta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->Delta_lp[i], test[i])) {
+                    fprintf (stderr, "Delta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "Delta_lp mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->Clp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->Clp[i], test[i])) {
+                    fprintf (stderr, "Clp: Mismatch index --> %d (%f %f) \n", i, workspace->Clp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "Clp mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->dDelta_lp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->dDelta_lp[i], test[i])) {
+                    fprintf (stderr, "dDelta_lp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "dDelta_lp mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->nlp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->nlp_temp[i], test[i])) {
+                    fprintf (stderr, "nlp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->nlp_temp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "nlp_temp mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->Delta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->Delta_lp_temp[i], test[i])) {
+                    fprintf (stderr, "Delta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->Delta_lp_temp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "Delta_lp_temp mismatch count %d\n", count);
+
+            copy_host_device (test, dev_workspace->dDelta_lp_temp, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->dDelta_lp_temp[i], test[i])) {
+                    fprintf (stderr, "dDelta_lp_temp: Mismatch index --> %d (%f %f) \n", i, workspace->dDelta_lp_temp[i], test[i]);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "dDelta_lp_temp mismatch count %d\n", count);
+
+            //exit for Bond order calculations
+
+
+            copy_host_device (test, dev_workspace->CdDelta, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->CdDelta[i], test[i])) {
+                    fprintf (stderr, " CdDelta does NOT match (%f %f) for atom  %d \n", workspace->CdDelta[i], test[i], i);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "CdDelta mismatch count %d\n", count);
+            //exit for Bond Energy calculations
+
+            /*
+               copy_host_device (test, dev_workspace->droptol, system->N * REAL_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+               count = 0;
+               for (int i = 0; i < system->N; i++ ) {
+               if (check_zero (workspace->droptol[i], test[i])) {
+               fprintf (stderr, " Droptol Does not match (%f %f) \n", workspace->droptol[i], test[i]);
+               exit (-1);
+               count ++;
+               }
+               }
+            //fprintf (stderr, "droptol mismatch count %d\n", count);
+             */
+
+
+            //exit for  QEa calculations
+            /*
+               real *t_s;
+
+               t_s = (real *) malloc (REAL_SIZE * (system->N * 2) );
+               copy_host_device (t_s, dev_workspace->b_prm, REAL_SIZE * (system->N * 2), cudaMemcpyDeviceToHost, __LINE__);
+
+               count = 0;
+               for (int i = 0; i < (system->N * 2); i++ ) {
+               if (check_zero (workspace->b_prm[i], t_s[i])) {
+               fprintf (stderr, " (%f %f) \n", workspace->b_prm[i], t_s[i]);
+               exit (-1);
+               count ++;
+               }
+               }
+            //fprintf (stderr, "b_prm mismatch count %d\n", count);
+
+            t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
+            copy_host_device (t_s, dev_workspace->s, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < 5*system->N; i++ ) {
+            if (check_zero (workspace->s[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f)  @ index %d \n", workspace->s[i], t_s[i], i);
+            count ++;
+            }
+            }
+            fprintf (stderr, "s mismatch count %d\n", count);
+
+
+            t_s = (real *) malloc (REAL_SIZE * 5 * system->N);
+            copy_host_device (t_s, dev_workspace->t, system->N * REAL_SIZE * 5, cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < 5*system->N; i++ ) {
+            if (check_zero (workspace->t[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f) @ index : %d\n", workspace->t[i], t_s[i], i);
+            count ++;
+            }
+            }
+            fprintf (stderr, "t mismatch count %d\n", count);
+
+
+            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * system->N);
+            copy_host_device (t_s, dev_workspace->v, system->N * REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < (RESTART + 1)*system->N; i++ ) {
+            if (check_zero (workspace->v[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f) @ index %d \n", workspace->v[i], t_s[i], i);
+            count ++;
+            }
+            }
+            fprintf (stderr, "v mismatch count %d\n", count);
+
+            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+            copy_host_device (t_s, dev_workspace->y, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < (RESTART + 1); i++ ) {
+            if (check_zero (workspace->y[i], t_s[i])) {
+            //fprintf (stderr, " (%f %f) \n", workspace->y[i], t_s[i]);
+            count ++;
+            }
+            }
+            fprintf (stderr, "y mismatch count %d\n", count);
+
+            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+            copy_host_device (t_s, dev_workspace->hc, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < (RESTART + 1); i++ ) {
+            if (check_zero (workspace->hc[i], t_s[i])) {
+                //fprintf (stderr, " (%f %f) \n", workspace->hc[i], t_s[i]);
+                count ++;
+            }
+            }
+            fprintf (stderr, "hc mismatch count %d\n", count);
+
+            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+            copy_host_device (t_s, dev_workspace->hs, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < (RESTART + 1); i++ ) {
+                if (check_zero (workspace->hs[i], t_s[i])) {
+                    //fprintf (stderr, " (%f %f) \n", workspace->hs[i], t_s[i]);
+                    count ++;
+                }
+            }
+            fprintf (stderr, "hs mismatch count %d\n", count);
+
+            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) * (RESTART+1) );
+            copy_host_device (t_s, dev_workspace->h, REAL_SIZE * (RESTART+1)*(RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < (RESTART+1)*(RESTART+1); i++ ) {
+                if (check_zero (workspace->h[i], t_s[i])) {
+                    //fprintf (stderr, " (%f %f) \n", workspace->h[i], t_s[i]);
+                    count ++;
+                }
+            }
+            fprintf (stderr, "h mismatch count %d\n", count);
+
+            t_s = (real *) malloc (REAL_SIZE * (RESTART+1) );
+            copy_host_device (t_s, dev_workspace->g, REAL_SIZE * (RESTART+1), cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < (RESTART + 1); i++ ) {
+                if (check_zero (workspace->g[i], t_s[i])) {
+                    //fprintf (stderr, " (%f %f) @ index %d\n", workspace->g[i], t_s[i], i);
+                    count ++;
+                }
+            }
+            fprintf (stderr, "g mismatch count %d\n", count);
+            */
+            // v_const: system->N rvec entries, the last array checked
+                rvec *r_s = (rvec *) malloc (RVEC_SIZE * system->N );
+            copy_host_device (r_s, dev_workspace->v_const, RVEC_SIZE * system->N,  cudaMemcpyDeviceToHost, __LINE__);
+
+            count = 0;
+            for (int i = 0; i < system->N; i++ ) {
+                if (check_zero (workspace->v_const[i], r_s[i])) {
+                    fprintf (stderr, " v_const (%f %f %f) (%f %f %f) @ index %d\n", 
+                            workspace->v_const[i][0], 
+                            workspace->v_const[i][1], 
+                            workspace->v_const[i][2], 
+                            r_s[i][0], 
+                            r_s[i][1], 
+                            r_s[i][2], 
+                            i);
+                    exit (-1);
+                    count ++;
+                }
+            }
+            //fprintf (stderr, "v_const mismatch count %d\n", count);
+
+            free (test);
+            free (r_s);
+            return true;
+            }
+
+            bool validate_data (reax_system *system, simulation_data *host) // compares fields of the host simulation_data with the device copy; 'system' is not read
+            {
+                simulation_data device; // host-side landing buffer for the struct fetched from the device
+                // fetch host->d_simulation_data with cudaMemcpyDeviceToHost; SIMULATION_DATA_SIZE is presumably sizeof(simulation_data) -- TODO confirm
+                copy_host_device (&device, host->d_simulation_data, SIMULATION_DATA_SIZE, cudaMemcpyDeviceToHost, __LINE__);
+                // every check below: check_zero() returning true is treated as a mismatch -> print both values to stderr, then exit(-1)
+                if (check_zero (host->E_BE, device.E_BE)){
+                    fprintf (stderr, "E_BE does not match (%4.15e %4.15e) \n", host->E_BE, device.E_BE);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Lp, device.E_Lp)){
+                    fprintf (stderr, "E_Lp does not match (%4.10e %4.10e) \n", host->E_Lp, device.E_Lp);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Ov, device.E_Ov)){
+                    fprintf (stderr, "E_Ov does not match (%4.10e %4.10e) \n", host->E_Ov, device.E_Ov);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Un, device.E_Un)){
+                    fprintf (stderr, "E_Un does not match (%4.10e %4.10e) \n", host->E_Un, device.E_Un);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Tor, device.E_Tor)) {
+                    fprintf (stderr, "E_Tor does not match (%4.10e %4.10e) \n", host->E_Tor, device.E_Tor);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Con, device.E_Con)) {
+                    fprintf (stderr, "E_Con does not match (%4.10e %4.10e) \n", host->E_Con, device.E_Con);
+                    exit (-1);
+                }
+
+                if (check_zero (host->ext_press, device.ext_press)) {
+                    fprintf (stderr, "ext_press does not match (%4.10e %4.10e) \n", host->ext_press, device.ext_press);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_HB, device.E_HB)) {
+                    fprintf (stderr, "E_Hb does not match (%4.10e %4.10e) \n", host->E_HB, device.E_HB);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Ang, device.E_Ang)) {
+                    fprintf (stderr, "E_Ang does not match (%4.10e %4.10e) \n", host->E_Ang, device.E_Ang);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Pen, device.E_Pen)) {
+                    fprintf (stderr, "E_Pen does not match (%4.10e %4.10e) \n", host->E_Pen, device.E_Pen);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Coa, device.E_Coa)) {
+                    fprintf (stderr, "E_Coa does not match (%4.10e %4.10e) \n", host->E_Coa, device.E_Coa);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_vdW, device.E_vdW)) {
+                    fprintf (stderr, "E_vdW does not match (%4.20e %4.20e) \n", host->E_vdW, device.E_vdW);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Ele, device.E_Ele)) {
+                    fprintf (stderr, "E_Ele does not match (%4.20e %4.20e) \n", host->E_Ele, device.E_Ele);
+                    exit (-1);
+                }
+
+                if (check_zero (host->E_Pol, device.E_Pol)) {
+                    fprintf (stderr, "E_Pol does not match (%4.10e %4.10e) \n", host->E_Pol, device.E_Pol);
+                    exit (-1);
+                }
+
+                // reached only when none of the fields above was flagged
+                //fprintf (stderr, "Simulation Data match between host and device \n");
+                return true;
+            }
+
+            /* Write the Cdbo, Cdbopi and Cdbopi2 members of *s to stderr, each in %e form. */
+            void print_bond_data (bond_order_data *s)
+            {
+                /* The BO / BO_s / BO_pi / BO_pi2 dump stays disabled, as before. */
+                /* One call; adjacent literals keep the three original format strings intact. */
+                fprintf (stderr,
+                        " Cdbo (%e) "
+                        " Cdbopi (%e) "
+                        " Cdbopi2 (%e) ",
+                        s->Cdbo,
+                        s->Cdbopi,
+                        s->Cdbopi2 );
+            }
+
+            /* Debug dump to stderr: the host bond list of atom 1, followed by the device bond
+             * lists of atoms 0 and 1 (downloaded with copy_host_device).  Both atom ranges are
+             * hard-coded, so at least two atoms are assumed.  workspace is unused. */
+            void print_bond_list (reax_system *system, static_storage *workspace, list **lists)
+            {
+                list *bonds = *lists + BONDS;
+                list *d_bonds = dev_lists + BONDS;
+
+                /* Host copy of the bond list. */
+                for (int i = 1; i < 2; i++) {
+                    fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
+                    for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++) {
+                        fprintf (stderr, "  %d, ", bonds->select.bond_list [j].nbr );
+                        print_bond_data (&bonds->select.bond_list [j].bo_data);
+                        fprintf (stderr, ")\n");
+                    }
+                }
+
+                /* Host staging buffers for the device index arrays and bond entries. */
+                int *b_start = (int *) malloc (INT_SIZE * system->N);
+                int *b_end = (int *) malloc (INT_SIZE * system->N);
+                bond_data *d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+
+                copy_host_device ( b_start, d_bonds->index,
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( b_end, d_bonds->end_index,
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                for (int i = 0; i < 2; i++) {
+                    fprintf (stderr, "Atom %d Bond_data ( nbrs \n", i);
+                    for (int j = b_start[i]; j < b_end[i]; j++) {
+                        fprintf (stderr, "  %d, ", d_bond_data[j].nbr );
+                        print_bond_data (&d_bond_data[j].bo_data);
+                        fprintf (stderr, ")\n");
+                    }
+                }
+                free (b_start);      /* the three staging buffers used to be leaked */
+                free (b_end);
+                free (d_bond_data);
+            }
+
+
+            /* Print the total number of device-side three-body entries to stderr.
+             *
+             * Only the device index arrays are needed for that, so the three-body entries and
+             * the bond entries are no longer allocated and downloaded (they were never read),
+             * and the locals that were never used are gone.  workspace and lists are unused. */
+            void count_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+            {
+                list *d_three = dev_lists + THREE_BODIES;
+                list *d_bonds = dev_lists + BONDS;
+                /* Per-bond [start, end) ranges into the device three-body list. */
+                int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+                int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+                /* Per-atom [b_start, b_end) ranges into the device bond list. */
+                int *b_start = (int *) malloc (INT_SIZE * system->N);
+                int *b_end = (int *) malloc (INT_SIZE * system->N);
+                if (start == NULL || end == NULL || b_start == NULL || b_end == NULL)
+                {
+                    fprintf (stderr, "count_three_bodies: out of host memory \n");
+                    exit (-1);
+                }
+                copy_host_device ( start, d_three->index,
+                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( end, d_three->end_index,
+                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+                /* NOTE: the three_body_list and bond_list payloads are deliberately not
+                 * downloaded any more; the count below only needs the two pairs of
+                 * index arrays. */
+
+                /* Per-atom index ranges of the device bond list. */
+                copy_host_device ( b_start, d_bonds->index,
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( b_end, d_bonds->end_index,
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+
+                /* Sum the per-bond three-body entry counts over every atom's bond range. */
+                int dcount = 0;
+
+                for (int i = 0; i < system->N; i++)
+                {
+                    for (int j = b_start[i]; j < b_end[i]; j ++)
+                    {
+                        dcount += end[j] - start[j];
+                    }
+                }
+
+                fprintf (stderr, "Total Actual Three Body Count ---> %d \n", dcount);
+
+                /* Release the host staging buffers. */
+                free (start);
+                free (end);
+                free (b_start);
+                free (b_end);
+            }
+            /* Compare the device three-body list with the host three-body list (*lists + THREE_BODIES).
+             * Prints a diagnostic and calls exit (-1) on the first disagreement; returns true otherwise.
+             * workspace is not used. */
+            bool validate_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+            {
+                list *three = *lists + THREE_BODIES;
+                list *bonds = *lists + BONDS;
+                /* Device-side lists; dev_lists is defined outside this function. */
+                list *d_three = dev_lists + THREE_BODIES;
+                list *d_bonds = dev_lists + BONDS;
+                bond_data *d_bond_data;
+                real *test;
+                /* Host buffers for the downloaded three-body entries and their per-bond index ranges. */
+                three_body_interaction_data *data = (three_body_interaction_data *) 
+                    malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+                int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+                int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+                /* Host buffers for the downloaded per-atom bond index ranges. */
+                int *b_start = (int *) malloc (INT_SIZE * system->N);
+                int *b_end = (int *) malloc (INT_SIZE * system->N);
+                int count;
+                int hcount, dcount;
+                /* NOTE(review): test is never used; count is only incremented (its only reader is a disabled message). */
+
+                /* Download the device three-body index ranges and entries. */
+                copy_host_device ( start, d_three->index, 
+                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( end, d_three->end_index, 
+                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( data, d_three->select.three_body_list, 
+                        sizeof (three_body_interaction_data) * system->num_thbodies, 
+                        cudaMemcpyDeviceToHost, __LINE__);
+                /* Download the device bond index ranges and bond entries. */
+                d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+
+                copy_host_device ( b_start, d_bonds->index, 
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( b_end, d_bonds->end_index, 
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+                //test = (real *) malloc (REAL_SIZE * system->num_bonds);
+                //memset (test, 0, REAL_SIZE * system->num_bonds);
+                //copy_host_device (test, testdata, REAL_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                /* Pass 1: per atom, sum the entry counts over its device bond range -- dcount from the device ranges, hcount via Num_Entries on the host list. */
+                count = 0;
+                for (int i = 0; i < system->N; i++)
+                {
+                    //for (int j = bonds->index[i]; j < bonds->end_index[i]; j ++)
+
+                    hcount = dcount = 0;
+                    for (int j = b_start[i]; j < b_end[i]; j ++) {
+                        dcount += end[j] - start[j];
+                        hcount += Num_Entries (j, three);
+
+                        /*
+                           if ((end[j] - start[j]) != (End_Index (j, three) - Start_Index (j, three)))
+                           {
+                           fprintf (stderr, " Three body count does not match between host and device\n");
+                           fprintf (stderr, " Host count : (%d, %d)\n", Start_Index (j, three), End_Index (j, three));
+                           fprintf (stderr, " Device count: (%d, %d)\n", start[j], end[j]);
+                           }
+                         */
+                    }
+
+                    /* On a mismatch: dump device and host bond data for each bond of this atom, then abort. */
+                    if ((dcount != hcount)) {
+
+                        fprintf (stderr, " Three body count does not match for the bond %d - %d \n", hcount, dcount); 
+
+                        for (int j = b_start[i]; j < b_end[i]; j ++) {
+                            bond_order_data *src = &d_bond_data[j].bo_data;
+                            dcount = end[j] - start[j];
+                            hcount = Num_Entries (j, three);
+                            fprintf (stderr, "device \n");
+                            print_bond_data (src);
+
+                            fprintf (stderr, "\n");
+                            src = &bonds->select.bond_list[j].bo_data;
+                            fprintf (stderr, "host \n");
+                            print_bond_data (src);
+                            fprintf (stderr, "\n");
+
+                            //fprintf (stderr, "--- Device bo is %f \n", test[j]);
+                            fprintf (stderr, "Device %d %d bonds (%d %d) - Host %d %d bonds (%d %d) \n", start[j], end[j],b_start[i], b_end[i],  
+                                    Start_Index (j, three), End_Index (j, three), Start_Index (i, bonds), End_Index (i, bonds));
+                            fprintf (stderr, "Host %d Device %d -- atom %d index %d \n", hcount, dcount, i, j);
+                            fprintf (stderr, "------\n");
+                        }
+                        fprintf (stderr, " Three Bodies count does not match between host and device \n");
+                        exit (-1);
+                    }
+                }
+
+                //fprintf (stderr, "Three body count on DEVICE %d  HOST %d \n", dcount, hcount);
+                /* Pass 2: pair each device bond with a host bond of the same atom, then each device three-body entry of that bond with a host entry. */
+                count = 0;
+                for (int i = 0; i < system->N; i++)
+                {
+                    int x, y, z;
+                    for (x = b_start[i]; x < b_end[i]; x++)
+                    {
+                        int t_start = start[x];
+                        int t_end = end[x];
+                        /* A host bond matches when nbr is equal, check_same accepts rel_box, and check_zero flags neither dvec nor d. */
+                        bond_data *dev_bond = &d_bond_data [x];
+                        bond_data *host_bond;
+                        for (z = Start_Index (i, bonds); z < End_Index (i, bonds); z++)
+                        {
+                            host_bond = &bonds->select.bond_list [z];
+                            if ((dev_bond->nbr == host_bond->nbr) &&
+                                    check_same (dev_bond->rel_box, host_bond->rel_box) && 
+                                    !check_zero (dev_bond->dvec, host_bond->dvec) &&
+                                    !check_zero (dev_bond->d, host_bond->d) )
+                            {
+                                break;
+                            }
+                        }
+                        if (z >= End_Index (i, bonds)){
+                            fprintf (stderr, "Could not find the matching bond on host and device \n");
+                            exit (-1);
+                        }
+
+                        //find this bond in the bonds on the host side.
+
+                        for (y = t_start; y < t_end; y++)
+                        {
+
+                            three_body_interaction_data *device = data + y;
+                            three_body_interaction_data *host;
+
+                            //fprintf (stderr, "Device thb %d pthb %d \n", device->thb, device->pthb);
+                            /* A host entry matches when thb is equal and check_zero does not flag theta. */
+                            int xx;    
+                            for (xx = Start_Index (z, three); xx < End_Index (z, three); xx++)
+                            {
+                                host = &three->select.three_body_list [xx];
+                                //fprintf (stderr, "Host thb %d pthb %d \n", host->thb, host->pthb);
+                                //if ((host->thb == device->thb) && (host->pthb == device->pthb))
+                                if ((host->thb == device->thb) && !check_zero (host->theta, device->theta))
+                                {
+                                    count ++;
+                                    break;
+                                }
+                            }
+
+                            if ( xx >= End_Index (z, three) ) {
+                                fprintf (stderr, " Could not match for atom %d bonds %d (%d) Three body(%d %d) (%d %d) \n", i, x, z, 
+                                        Start_Index (z, three), End_Index (z, three), start[x], end[x] );
+                                exit (-1);
+                            }// else fprintf (stderr, "----------------- \n");
+                        }
+                    }
+                }
+                free (data);
+                free (start);
+                free (end);
+                free (b_start);
+                free (b_end);
+                free (d_bond_data);
+
+                //fprintf (stderr, "Three Body Interaction Data MATCH on device and HOST --> %d \n", count);
+                return true;
+            }
+            /* Debug experiment: try to place every device three-body entry in one of four bins (a, b, c, d), print how often that failed, then exit (-1). */
+            bool bin_three_bodies (reax_system *system, static_storage *workspace, list **lists)
+            {
+                list *d_three = dev_lists + THREE_BODIES;
+                list *d_bonds = dev_lists + BONDS;
+                list *three = *lists + THREE_BODIES;
+                list *bonds = *lists + BONDS;
+                bond_data *d_bond_data;
+                /* NOTE(review): declared bool but there is no return statement; the unconditional exit (-1) further down ends the process first. */
+                three_body_interaction_data *data = (three_body_interaction_data *) 
+                    malloc ( sizeof (three_body_interaction_data) * system->num_thbodies);
+                int *start = (int *) malloc (INT_SIZE * system->num_bonds);
+                int *end = (int *) malloc (INT_SIZE * system->num_bonds);
+
+                int *b_start = (int *) malloc (INT_SIZE * system->N);
+                int *b_end = (int *) malloc (INT_SIZE * system->N);
+                /* Four bins of 2 * N ints each: slots [0, N) flag atoms seen as center, i or k; slots [N, 2N) flag atoms claimed as center. */
+                int *a = (int *) malloc (2 * INT_SIZE * system->N );
+                int *b = (int *) malloc (2 * INT_SIZE * system->N );
+                int *c = (int *) malloc (2 * INT_SIZE * system->N );
+                int *d = (int *) malloc (2 * INT_SIZE * system->N );
+                /* -1 marks a slot as not set. */
+                for (int i = 0; i < 2 * system->N; i++)
+                    a[i] = b[i] = c[i] = d[i] = -1;
+
+                int count;
+                int hcount, dcount;
+                int index_a, index_b, index_c, index_d;
+                index_a = index_b = index_c = index_d = 0;
+                /* NOTE(review): hcount, dcount and index_a..index_d are assigned but never read; three, bonds and d_bond_data only appear in disabled code. */
+                copy_host_device ( start, d_three->index, 
+                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( end, d_three->end_index, 
+                        INT_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( data, d_three->select.three_body_list, 
+                        sizeof (three_body_interaction_data) * system->num_thbodies, 
+                        cudaMemcpyDeviceToHost, __LINE__);
+
+                d_bond_data = (bond_data *) malloc (BOND_DATA_SIZE * system->num_bonds );
+
+                copy_host_device ( b_start, d_bonds->index, 
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device ( b_end, d_bonds->end_index, 
+                        INT_SIZE * system->N, cudaMemcpyDeviceToHost, __LINE__);
+                copy_host_device (d_bond_data, d_bonds->select.bond_list, BOND_DATA_SIZE * system->num_bonds, cudaMemcpyDeviceToHost, __LINE__);
+
+                count = 0;
+                hcount = dcount = 0;
+
+                /*
+                   for (int i = 0; i < 20; i++)
+                   {
+                   for (int j = Start_Index (i, bonds); j < End_Index (i, bonds); j++)
+                   {
+                   for ( int k = Start_Index (j, three); k < End_Index (j, three); k ++)
+                   {
+                   three_body_interaction_data *host = &three->select.three_body_list [k];
+                   fprintf (stderr, " atom %d bond (%d %d) -- %d,  (%d %d)\n", 
+                   i, Start_Index (i, bonds), End_Index (i, bonds), j, host->thb, host->pthb );
+
+                   }
+                   }
+                   }
+                   exit (-1);
+                 */
+                /* Visit the device three-body entries bond by bond, atom by atom; the scan of a bond stops at its first entry that fits no bin. */
+                count = 0;
+                for (int i = 0; i < system->N; i++)
+                {
+                    for (int j = b_start[i]; j < b_end[i]; j ++) {
+
+                        /*
+                           bond_data *src;
+                           src = &d_bond_data[j];
+                           fprintf (stderr, " atom %d Neighbor %d \n", i, src->nbr );
+                         */
+
+                        for (int x = start[j]; x < end[j]; x ++)
+                        {
+                            three_body_interaction_data *device = data + x;
+
+                            int center = device->j;
+                            int d_i = device->i;
+                            int d_k = device->k;
+
+
+                            //fprintf (stderr, " atom %d bond (%d %d) -- %d, (%d %d %d) -- (%d %d)\n", 
+                            //i, b_start[i], b_end[i], j, center, d_i, d_k, device->thb, device->pthb);
+                            /* A bin that already has this center claimed only gets the two end atoms flagged. */
+                            if ((a[system->N + center] != -1)) {
+                                a[d_i] = a[d_k] = 1;
+                                continue;
+                            } else if ((b[system->N + center] != -1)) {
+                                b[d_i] = b[d_k] = 1;
+                                continue;
+                            } else if ((c[system->N + center] != -1)) {
+                                c[d_i] = c[d_k] = 1;
+                                continue;
+                            } else if ((d[system->N + center] != -1)) {
+                                d[d_i] = d[d_k] = 1;
+                                continue;
+                            }
+                            /* Otherwise claim the first bin in which center, i and k are all still unset. */
+                            if ((a[center] == -1) && (a[d_i] == -1) && (a[d_k] == -1)) {
+                                a[center] = a[d_i] = a[d_k] = 1;
+                                a[system->N + center] = 1;
+                            } else if ((b[center] == -1) && (b[d_i] == -1) && (b[d_k] == -1)) {
+                                b[center] =  b[d_i] = b[d_k] = 1;
+                                b[system->N + center] = 1;
+                            } else if ((c[center] == -1) && (c[d_i] == -1) && (c[d_k] == -1)) {
+                                c[center] =  c[d_i] = c[d_k] = 1;
+                                c[system->N + center] = 1;
+                            } else if ((d[center] == -1) && (d[d_i] == -1) && (d[d_k] == -1)) {
+                                d[center] =  d[d_i] = d[d_k] = 1;
+                                d[system->N + center]= 1;
+                            }
+                            else {
+                                count ++;
+                                break;
+                                fprintf (stderr, "We have a problem with the four bins atom %d bond (%d %d) -- %d, (%d %d %d)\n", 
+                                        i, b_start[i], b_end[i], j, center, d_i, d_k);
+                                fprintf (stderr, "A's contents %d %d %d (%d %d %d)\n", 
+                                        a[system->N + center], a[system->N + d_i], a[system->N + d_k], a[center], a[d_i], a[d_k]);
+                                fprintf (stderr, "B's contents %d %d %d (%d %d %d)\n", 
+                                        b[system->N + center], b[system->N + d_i], b[system->N + d_k], b[center], b[d_i], b[d_k]);
+                                fprintf (stderr, "C's contents %d %d %d (%d %d %d)\n", 
+                                        c[system->N + center], c[system->N + d_i], c[system->N + d_k], c[center], c[d_i], c[d_k]);
+                                fprintf (stderr, "D's contents %d %d %d (%d %d %d)\n", 
+                                        d[system->N + center], d[system->N + d_i], d[system->N + d_k], d[center], d[d_i], d[d_k]);
+                                /* NOTE(review): the fprintf calls in this branch follow the break and never execute. */
+                            }
+                        }
+                    }
+                }
+                fprintf (stderr, "Miscount is %d \n", count);
+                exit (-1);
+                /* NOTE(review): nothing below is reached -- the exit (-1) above is unconditional; no buffer is freed either. */
+                count = 0;
+                for (int i = 0; i < system->N; i++)
+                {
+                    if (a[system->N + i] != -1) count ++;
+                    if (b[system->N + i] != -1) count ++;
+                    if (c[system->N + i] != -1) count ++;
+                    if (d[system->N + i] != -1) count ++;
+                }
+
+                fprintf (stderr, "binned so many atoms --> %d \n", count );
+            }
+            /* Check the device grid arrays top, atoms, nbrs and nbrs_cp against the host grid; the first difference is reported and ends the run with exit (-1). */
+            bool validate_grid (reax_system *system)
+            {
+                int ncells = system->g.ncell[0] * system->g.ncell[1] * system->g.ncell[2];
+
+                /* top: one entry per cell. */
+                int *dtop = (int *) malloc (INT_SIZE * ncells );
+                copy_host_device (dtop, system->d_g.top, INT_SIZE * ncells, cudaMemcpyDeviceToHost, __LINE__);
+                for (int cell = 0; cell < ncells; cell++) {
+                    if (system->g.top[cell] == dtop[cell]) continue;
+                    fprintf (stderr, " top count does not match (%d %d) @ index %d \n", system->g.top[cell], dtop[cell], cell );
+                    exit (-1);
+                }
+                free (dtop);
+
+                /* atoms: d_g.max_atoms entries per cell on the device side; the host array is
+                 * read over the same index range. */
+                int *datoms = (int *) malloc (INT_SIZE * ncells * system->d_g.max_atoms);
+                copy_host_device (datoms, system->d_g.atoms, INT_SIZE * ncells * system->d_g.max_atoms, cudaMemcpyDeviceToHost, __LINE__);
+                for (int slot = 0; slot < ncells*system->d_g.max_atoms; slot++) {
+                    if (system->g.atoms[slot] == datoms[slot]) continue;
+                    fprintf (stderr, " atoms count does not match (%d %d) @ index %d \n", system->g.atoms[slot], datoms[slot], slot );
+                    exit (-1);
+                }
+                free (datoms);
+
+                /* nbrs: d_g.max_nbrs ivec entries per cell, accepted when check_same returns true. */
+                ivec *dnbrs = (ivec *) malloc (IVEC_SIZE * ncells * system->d_g.max_nbrs);
+                copy_host_device (dnbrs, system->d_g.nbrs, IVEC_SIZE * ncells * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
+                for (int slot = 0; slot < ncells*system->d_g.max_nbrs; slot++) {
+                    if (check_same (system->g.nbrs[slot], dnbrs[slot])) continue;
+                    fprintf (stderr, " nbrs count does not match @ index %d \n", slot );
+                    exit (-1);
+                }
+                free (dnbrs);
+
+                /* nbrs_cp: d_g.max_nbrs rvec entries per cell, rejected when check_zero returns true. */
+                rvec *dnbrs_cp = (rvec *) malloc (RVEC_SIZE * ncells * system->d_g.max_nbrs);
+                copy_host_device (dnbrs_cp, system->d_g.nbrs_cp, RVEC_SIZE * ncells * system->d_g.max_nbrs, cudaMemcpyDeviceToHost, __LINE__);
+                for (int slot = 0; slot < ncells*system->d_g.max_nbrs; slot++) {
+                    if (!check_zero (system->g.nbrs_cp[slot], dnbrs_cp[slot])) continue;
+                    fprintf (stderr, " nbrs_cp count does not match @ index %d \n", slot );
+                    exit (-1);
+                }
+                free (dnbrs_cp);
+
+                /* All four arrays agreed; the success message that used to sit here
+                 * stays disabled. */
+                return true;
+            }
+
+            /* Debug dump: download the device atom array and print the first (at most 10) atoms to stderr. */
+            void print_atoms (reax_system *system)
+            {
+                /* Never print more atoms than were downloaded; the loop used to run to 10 regardless of system->N. */
+                int shown = (system->N < 10) ? system->N : 10;
+                reax_atom *test = (reax_atom *) malloc (REAX_ATOM_SIZE * system->N);
+
+                copy_host_device (test, system->d_atoms, REAX_ATOM_SIZE * system->N, cudaMemcpyDeviceToHost, RES_SYSTEM_ATOMS );
+                for (int i = 0; i < shown; i++)
+                {
+                    fprintf (stderr, "Atom:%d: Type:%d", i, test[i].type);
+                    fprintf (stderr, " x(%6.10f %6.10f %6.10f)", test[i].x[0], test[i].x[1], test[i].x[2] );
+                    fprintf (stderr, " v(%6.10f %6.10f %6.10f)", test[i].v[0], test[i].v[1], test[i].v[2] );
+                    fprintf (stderr, " f(%6.10f %6.10f %6.10f)", test[i].f[0], test[i].f[1], test[i].f[2] );
+                    fprintf (stderr, " q(%6.10f) \n", test[i].q );
+                }
+                free (test);    /* the download buffer used to be leaked */
+            }
+            /* Debug dump of the host atom array: prints the first (at most 10) atoms to stderr. */
+            void print_sys_atoms (reax_system *system)
+            {
+                int shown = (system->N < 10) ? system->N : 10;   /* clamp; assumes system->atoms holds system->N entries -- TODO confirm */
+                for (int i = 0; i < shown; i++) {
+                    fprintf (stderr, "Atom:%d: Type:%d", i, system->atoms[i].type);
+                    fprintf (stderr, " x(%6.10f %6.10f %6.10f)",system->atoms[i].x[0], system->atoms[i].x[1], system->atoms[i].x[2] );
+                    fprintf (stderr, " v(%6.10f %6.10f %6.10f)",system->atoms[i].v[0], system->atoms[i].v[1], system->atoms[i].v[2] );
+                    fprintf (stderr, " f(%6.10f %6.10f %6.10f)", system->atoms[i].f[0], system->atoms[i].f[1], system->atoms[i].f[2] );
+                    fprintf (stderr, " q(%6.10f) \n", system->atoms[i].q );
+                }
+            }
+
+            /* Print one stderr line per host grid cell listing the atom entries stored in that cell. */
+            void print_grid (reax_system *system)
+            {
+                grid *g = &system->g;
+
+                for (int ci = 0; ci < g->ncell[0]; ci++)
+                    for (int cj = 0; cj < g->ncell[1]; cj++)
+                        for (int ck = 0; ck < g->ncell[2]; ck++) {
+                            fprintf (stderr, "Cell [%d,%d,%d]--(", ci, cj, ck);
+                            int filled = g->top[index_grid_3d (ci,cj,ck,g) ];   /* number of entries to list for this cell */
+                            for (int slot = 0; slot < filled; slot++)
+                                fprintf (stderr, "%d,", g->atoms[ index_grid_atoms (ci,cj,ck,slot,g) ]);
+                            fprintf (stderr, ")\n");
+                        }
+            }
 
 
diff --git a/PuReMD-GPU/src/vector.cu b/PuReMD-GPU/src/vector.cu
index 9da80d03..7cf06eb8 100644
--- a/PuReMD-GPU/src/vector.cu
+++ b/PuReMD-GPU/src/vector.cu
@@ -23,316 +23,316 @@
 
 int Vector_isZero( real* v, int k )
 {
-	for( --k; k>=0; --k )
-		if( fabs( v[k] ) > ALMOST_ZERO )
-			return 0;
+    for( --k; k>=0; --k )
+        if( fabs( v[k] ) > ALMOST_ZERO )
+            return 0;
 
-	return 1;
+    return 1;
 }
 
 
 void Vector_MakeZero( real *v, int k )
 {
-	for( --k; k>=0; --k )
-		v[k] = 0;
+    for( --k; k>=0; --k )
+        v[k] = 0;
 }
 
 
 void Vector_Copy( real* dest, real* v, int k )
 {
-	for( --k; k>=0; --k )
-		dest[k] = v[k];
+    for( --k; k>=0; --k )
+        dest[k] = v[k];
 }
 
 
 void Vector_Print( FILE *fout, char *vname, real *v, int k )
 {
-	int i;
+    int i;
 
-	fprintf( fout, "%s:\n", vname );
-	for( i = 0; i < k; ++i )
-		fprintf( fout, "%24.15e\n", v[i] );
-	fprintf( fout, "\n" );
+    fprintf( fout, "%s:\n", vname );
+    for( i = 0; i < k; ++i )
+        fprintf( fout, "%24.15e\n", v[i] );
+    fprintf( fout, "\n" );
 }
 
 
 real Norm( real* v1, int k )
 {
-	real ret = 0;
+    real ret = 0;
 
-	for( --k; k>=0; --k )
-		ret +=  SQR( v1[k] );
+    for( --k; k>=0; --k )
+        ret +=  SQR( v1[k] );
 
-	return SQRT( ret );
+    return SQRT( ret );
 }
 
 
 void rvec_Sum( rvec ret, rvec v1 ,rvec v2 )
 {
-	ret[0] = v1[0] + v2[0];
-	ret[1] = v1[1] + v2[1];
-	ret[2] = v1[2] + v2[2];
+    ret[0] = v1[0] + v2[0];
+    ret[1] = v1[1] + v2[1];
+    ret[2] = v1[2] + v2[2];
 }
 
 
 real rvec_ScaledDot( real c1, rvec v1, real c2, rvec v2 )
 {
-	return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
+    return (c1*c2) * (v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]);
 }
 
 
 void rvec_Multiply( rvec r, rvec v1, rvec v2 )
 {
-	r[0] = v1[0] * v2[0];
-	r[1] = v1[1] * v2[1];
-	r[2] = v1[2] * v2[2];
+    r[0] = v1[0] * v2[0];
+    r[1] = v1[1] * v2[1];
+    r[2] = v1[2] * v2[2];
 }
 
 
 void rvec_Divide( rvec r, rvec v1, rvec v2 )
 {
-	r[0] = v1[0] / v2[0];
-	r[1] = v1[1] / v2[1];
-	r[2] = v1[2] / v2[2];
+    r[0] = v1[0] / v2[0];
+    r[1] = v1[1] / v2[1];
+    r[2] = v1[2] / v2[2];
 }
 
 
 void rvec_iDivide( rvec r, rvec v1, ivec v2 )
 {
-	r[0] = v1[0] / v2[0];
-	r[1] = v1[1] / v2[1];
-	r[2] = v1[2] / v2[2];
+    r[0] = v1[0] / v2[0];
+    r[1] = v1[1] / v2[1];
+    r[2] = v1[2] / v2[2];
 }
 
 
 void rvec_Invert( rvec r, rvec v )
 {
-	r[0] = 1. / v[0];
-	r[1] = 1. / v[1];
-	r[2] = 1. / v[2];
+    r[0] = 1. / v[0];
+    r[1] = 1. / v[1];
+    r[2] = 1. / v[2];
 }
 
 
 void rvec_OuterProduct( rtensor r, rvec v1, rvec v2 )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			r[i][j] = v1[i] * v2[j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            r[i][j] = v1[i] * v2[j];
 }
 
 
 
 int rvec_isZero( rvec v )
 {
-	if( fabs(v[0]) > ALMOST_ZERO || 
-			fabs(v[1]) > ALMOST_ZERO || 
-			fabs(v[2]) > ALMOST_ZERO )
-		return 0;
-	return 1;
+    if( fabs(v[0]) > ALMOST_ZERO || 
+            fabs(v[1]) > ALMOST_ZERO || 
+            fabs(v[2]) > ALMOST_ZERO )
+        return 0;
+    return 1;
 }
 
 
 void rtensor_Multiply( rtensor ret, rtensor m1, rtensor m2 )
 {
-	int i, j, k;
-	rtensor temp;
-
-	// check if the result matrix is the same as one of m1, m2.
-	// if so, we cannot modify the contents of m1 or m2, so 
-	// we have to use a temp matrix.
-	if( ret == m1 || ret == m2 )
-	{
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-			{
-				temp[i][j] = 0;	    
-				for( k = 0; k < 3; ++k )
-					temp[i][j] += m1[i][k] * m2[k][j];
-			}
-
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-				ret[i][j] = temp[i][j];	
-	}
-	else
-	{
-		for( i = 0; i < 3; ++i )
-			for( j = 0; j < 3; ++j )
-			{
-				ret[i][j] = 0;	    
-				for( k = 0; k < 3; ++k )
-					ret[i][j] += m1[i][k] * m2[k][j];
-			}
-	}
+    int i, j, k;
+    rtensor temp;
+
+    // check if the result matrix is the same as one of m1, m2.
+    // if so, we cannot modify the contents of m1 or m2, so 
+    // we have to use a temp matrix.
+    if( ret == m1 || ret == m2 )
+    {
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+            {
+                temp[i][j] = 0;        
+                for( k = 0; k < 3; ++k )
+                    temp[i][j] += m1[i][k] * m2[k][j];
+            }
+
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+                ret[i][j] = temp[i][j];    
+    }
+    else
+    {
+        for( i = 0; i < 3; ++i )
+            for( j = 0; j < 3; ++j )
+            {
+                ret[i][j] = 0;        
+                for( k = 0; k < 3; ++k )
+                    ret[i][j] += m1[i][k] * m2[k][j];
+            }
+    }
 }
 
 
 void rtensor_MatVec( rvec ret, rtensor m, rvec v )
 {
-	int i;
-	rvec temp;
-
-	// if ret is the same vector as v, we cannot modify the 
-	// contents of v until all computation is finished.
-	if( ret == v )
-	{
-		for( i = 0; i < 3; ++i )
-			temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-
-		for( i = 0; i < 3; ++i )
-			ret[i] = temp[i];
-	}
-	else
-	{
-		for( i = 0; i < 3; ++i )
-			ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
-	}
+    int i;
+    rvec temp;
+
+    // if ret is the same vector as v, we cannot modify the 
+    // contents of v until all computation is finished.
+    if( ret == v )
+    {
+        for( i = 0; i < 3; ++i )
+            temp[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+
+        for( i = 0; i < 3; ++i )
+            ret[i] = temp[i];
+    }
+    else
+    {
+        for( i = 0; i < 3; ++i )
+            ret[i] = m[i][0] * v[0] + m[i][1] * v[1] + m[i][2] * v[2];
+    }
 }
 
 
 void rtensor_Scale( rtensor ret, real c, rtensor m )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			ret[i][j] = c * m[i][j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            ret[i][j] = c * m[i][j];
 }
 
 
 void rtensor_Add( rtensor ret, rtensor t )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			ret[i][j] += t[i][j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            ret[i][j] += t[i][j];
 }
 
 
 void rtensor_ScaledAdd( rtensor ret, real c, rtensor t )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			ret[i][j] += c * t[i][j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            ret[i][j] += c * t[i][j];
 }
 
 
 void rtensor_Sum( rtensor ret, rtensor t1, rtensor t2 )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			ret[i][j] = t1[i][j] + t2[i][j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            ret[i][j] = t1[i][j] + t2[i][j];
 }
 
 
 void rtensor_ScaledSum( rtensor ret, real c1, rtensor t1, 
-		real c2, rtensor t2 )
+        real c2, rtensor t2 )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            ret[i][j] = c1 * t1[i][j] + c2 * t2[i][j];
 }
 
 
 void rtensor_Copy( rtensor ret, rtensor t )
 {
-	int i, j;
+    int i, j;
 
-	for( i = 0; i < 3; ++i )
-		for( j = 0; j < 3; ++j )
-			ret[i][j] = t[i][j];
+    for( i = 0; i < 3; ++i )
+        for( j = 0; j < 3; ++j )
+            ret[i][j] = t[i][j];
 }
 
 
 void rtensor_Identity( rtensor t )
 {
-	t[0][0] = t[1][1] = t[2][2] = 1;
-	t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = ZERO;
+    t[0][0] = t[1][1] = t[2][2] = 1;
+    t[0][1] = t[0][2] = t[1][0] = t[1][2] = t[2][0] = t[2][1] = ZERO;
 }
 
 
 void rtensor_MakeZero( rtensor t )
 {
-	t[0][0] = t[0][1] = t[0][2] = ZERO;
-	t[1][0] = t[1][1] = t[1][2] = ZERO;
-	t[2][0] = t[2][1] = t[2][2] = ZERO;
+    t[0][0] = t[0][1] = t[0][2] = ZERO;
+    t[1][0] = t[1][1] = t[1][2] = ZERO;
+    t[2][0] = t[2][1] = t[2][2] = ZERO;
 }
 
 
 void rtensor_Transpose( rtensor ret, rtensor t )
 {
-	ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
-	ret[0][1] = t[1][0], ret[0][2] = t[2][0];
-	ret[1][0] = t[0][1], ret[1][2] = t[2][1];
-	ret[2][0] = t[0][2], ret[2][1] = t[1][2];
+    ret[0][0] = t[0][0], ret[1][1] = t[1][1], ret[2][2] = t[2][2];
+    ret[0][1] = t[1][0], ret[0][2] = t[2][0];
+    ret[1][0] = t[0][1], ret[1][2] = t[2][1];
+    ret[2][0] = t[0][2], ret[2][1] = t[1][2];
 }
 
 
 real rtensor_Det( rtensor t )
 {
-	return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
-			t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
-			t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
+    return ( t[0][0] * (t[1][1] * t[2][2] - t[1][2] * t[2][1] ) +
+            t[0][1] * (t[1][2] * t[2][0] - t[1][0] * t[2][2] ) +
+            t[0][2] * (t[1][0] * t[2][1] - t[1][1] * t[2][0] ) );
 }
 
 
 real rtensor_Trace( rtensor t )
 {
-	return (t[0][0] + t[1][1] + t[2][2]);
+    return (t[0][0] + t[1][1] + t[2][2]);
 }
 
 
 void Print_rTensor(FILE* fp, rtensor t)
 {
-	int i, j;
-
-	for (i=0; i < 3; i++)
-	{
-		fprintf(fp,"[");
-		for (j=0; j < 3; j++)
-			fprintf(fp,"%8.3f,\t",t[i][j]);
-		fprintf(fp,"]\n");
-	}
+    int i, j;
+
+    for (i=0; i < 3; i++)
+    {
+        fprintf(fp,"[");
+        for (j=0; j < 3; j++)
+            fprintf(fp,"%8.3f,\t",t[i][j]);
+        fprintf(fp,"]\n");
+    }
 }
 
 
 void ivec_MakeZero( ivec v )
 {
-	v[0] = v[1] = v[2] = 0;
+    v[0] = v[1] = v[2] = 0;
 }
 
 
 void ivec_rScale( ivec dest, real C, rvec src )
 {
-	dest[0] = (int)(C * src[0]);
-	dest[1] = (int)(C * src[1]);
-	dest[2] = (int)(C * src[2]);
+    dest[0] = (int)(C * src[0]);
+    dest[1] = (int)(C * src[1]);
+    dest[2] = (int)(C * src[2]);
 }
 
 
 int ivec_isZero( ivec v )
 {
-	if( v[0]==0 && v[1]==0 && v[2]==0 )
-		return 1;
-	return 0;
+    if( v[0]==0 && v[1]==0 && v[2]==0 )
+        return 1;
+    return 0;
 }
 
 
 int ivec_isEqual( ivec v1, ivec v2 )
 {
-	if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
-		return 1;
+    if( v1[0]==v2[0] && v1[1]==v2[1] && v1[2]==v2[2] )
+        return 1;
 
-	return 0;
+    return 0;
 }
 
 
-- 
GitLab